# Vehicle Sales Data Analysis
This notebook outlines the code and results of the ten data analysis tasks for the BS3220 Parallel Programming assignment. 


### Import libraries and load data

In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, count, max, min, avg, col, rank, to_date, date_format, regexp_extract, row_number, first, last, format_number, substring, year, expr, quarter, round, month
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create (or reuse) the Spark session; the legacy time parser policy is enabled
# so that date patterns are interpreted with the pre-Spark-3 parser.
spark = (
    SparkSession.builder
    .appName("VehicleSalesCleaning")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

# Load the raw CSV: first line is the header, column types are inferred.
df = spark.read.options(header=True, inferSchema=True).csv("Superstore.csv")


In [None]:
# Print the column names and data types of the loaded DataFrame.
df.printSchema()

In [None]:
# Preview the first rows; truncate=False prints every cell value in full.
df.show(truncate=False)

### Data cleaning

In [None]:
from pyspark.sql.functions import month, year, count, to_date, round, when, isnan



# Normalise the two columns used throughout the notebook: "Sales" is rounded to
# 2 decimal places and "Order Date" is parsed with the dd/MM/yyyy pattern.
df = (
    df.withColumn("Sales", round("Sales", 2))
    .withColumn("Order Date", to_date("Order Date", "dd/MM/yyyy"))
)

# Spot-check a handful of parsed dates.
df.select("Order Date").show(5, truncate=False)

# Count every row (not distinct "Order ID" values) per calendar month.
# The listing is sorted by that count, smallest first.
sales_by_month = (
    df.withColumn("Year", year("Order Date"))
    .withColumn("Month", month("Order Date"))
    .groupBy("Year", "Month")
    .agg(count("*").alias("Number of Sales"))
    .orderBy("Number of Sales")
)

# Print up to 100 month rows.
sales_by_month.show(100)

# Discard rows whose product name contains a double-quote character.
df = df.filter(~col("Product Name").contains('"'))

# Report how many rows survive the filter.
filtered_count = df.count()
print(f"Filtered row count: {filtered_count}")



### Task 1: Find the total sales for each item, both the number of units and the total price/cost

In [None]:
# Per-product totals: summed "Quantity" as units, and summed "Sales" rendered
# as a 2-decimal formatted string.
total_sales_per_item = (
    df.groupBy("Product Name")
    .agg(
        sum("Quantity").alias("Total Units"),
        format_number(sum("Sales"), 2).alias("Total Sales"),
    )
)

total_sales_per_item.show(truncate=False)

### Task 2: Summarise the total sales of all items at each location

In [None]:
# Sum "Sales" per city (rows without a Sales value are ignored), sort on the
# numeric total, and only then turn the total into a 2-decimal formatted string
# so the ordering is numeric rather than lexicographic.
total_sales_per_location = (
    df.where(df["Sales"].isNotNull())
    .groupBy("City")
    .agg(sum("Sales").alias("Total Sales"))
    .orderBy("Total Sales")
    .withColumn("Total Sales", format_number("Total Sales", 2))
)

total_sales_per_location.show(truncate=False)


### Task 3: List all products and their combined sales, grouped by their location of sale.

In [None]:
# Combined sales of every product within every city. The numeric sum is only an
# intermediate: the final frame exposes it as a 2-decimal formatted string under
# "Combined Sales", with rows listed alphabetically by city.
sales_by_product_and_location = (
    df.groupBy("City", "Product Name")
    .agg(sum("Sales").alias("Sum Sales"))
    .select(
        "City",
        "Product Name",
        format_number("Sum Sales", 2).alias("Combined Sales"),
    )
    .orderBy("City")
)

sales_by_product_and_location.show(truncate=False)

### Task 4: Show the sales numbers for the item which sold the most units at each location

In [None]:
# Rank products inside each city by units sold, highest first. rank() assigns
# the same rank to tied products, so a city whose top products sold equally
# many units lists all of them.
windowSpec = Window.partitionBy("City").orderBy(col("Total Units").desc())

# Aggregate units and sales per (city, product), rank within the city, then
# keep only the top-ranked rows; the helper "rank" column is dropped afterwards.
best_selling_items_per_location = (
    df.groupBy("City", "Product Name")
    .agg(
        sum("Quantity").alias("Total Units"),
        sum("Sales").alias("Total Sales"),
    )
    .withColumn("rank", rank().over(windowSpec))
    .filter(col("rank") == 1)
    .drop("rank")
)

# truncate=False keeps the full product name visible, matching the other task
# cells; the default show() cuts string cells off at 20 characters.
best_selling_items_per_location.show(truncate=False)

### Task 5: List all items that were sold within two months of your choosing

In [None]:
from pyspark.sql.functions import to_date, month, year, sum, round

def _sales_for_month(frame, sales_year, sales_month):
    """Collect the sales that fall in one calendar month.

    "Sales" and "Order Date" are used as-is: the data-cleaning cell has already
    rounded the former and parsed the latter, so neither is re-processed here.

    Args:
        frame: DataFrame with "Order Date", "Product Name" and "Sales" columns.
        sales_year: Calendar year to keep (e.g. 2014).
        sales_month: Calendar month to keep, 1-12.

    Returns:
        A tuple ``(rows, per_product, grand_total)`` where ``rows`` is the
        filtered DataFrame, ``per_product`` holds "Total Sales" per
        "Product Name", and ``grand_total`` is the summed "Sales" as a number.
    """
    rows = frame.filter(
        (month("Order Date") == sales_month) & (year("Order Date") == sales_year)
    )
    # `sum` is the Spark SQL aggregate imported at the top of the notebook,
    # not the Python builtin.
    per_product = rows.groupBy("Product Name").agg(sum("Sales").alias("Total Sales"))
    # Summing over zero rows comes back as None on the driver; fall back to 0.0
    # so the ":.2f" formatting below cannot raise a TypeError.
    grand_total = rows.agg(sum("Sales")).collect()[0][0] or 0.0
    return rows, per_product, grand_total


# The two months chosen for this task: January 2014 and December 2014.
jan_2014_sales, jan_2014_total_sales, total_jan_2014 = _sales_for_month(df, 2014, 1)
dec_2014_sales, dec_2014_total_sales, total_dec_2014 = _sales_for_month(df, 2014, 12)

# Print the total sales for January 2014
print(f"Total Sales for January 2014: {total_jan_2014:.2f}")

print("Sales Details for January 2014:")
jan_2014_total_sales.show(truncate=False)

# Print the total sales for December 2014
print(f"Total Sales for December 2014: {total_dec_2014:.2f}")

print("Sales Details for December 2014:")
dec_2014_total_sales.show(truncate=False)


### Task 6: Identify the item which has the lowest overall sales, both for the dataset as a whole and for each sales location

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Product with the fewest sales rows across the whole dataset. "Product Name" is
# a secondary sort key so that, when several products share the minimum count,
# limit(1) always returns the same (alphabetically first) one instead of an
# arbitrary row. This matches the tie-break used per city below.
lowest_sales_overall = (
    df.groupBy("Product Name")
    .agg(count("*").alias("Number of Sales"))
    .orderBy("Number of Sales", "Product Name")
    .limit(1)
)

# Number of sales rows for every (city, product) pair.
lowest_sales_by_location = df.groupBy("City", "Product Name").agg(
    count("*").alias("Number of Sales")
)

# Within each city, order products by sale count and then alphabetically, so
# exactly one product per city ends up with rank 1.
windowSpec = Window.partitionBy("City").orderBy(col("Number of Sales"), col("Product Name"))

ranked_sales_by_location = lowest_sales_by_location.withColumn("rank", rank().over(windowSpec))

# Keep only the rank-1 product of each city.
lowest_sales_details_by_location = ranked_sales_by_location.filter(col("rank") == 1).select(
    "City", "Product Name", "Number of Sales")

print("Item with the Lowest Number of Sales Occurrences Overall:")
lowest_sales_overall.show(truncate=False)

print("Item with the Lowest Number of Sales Occurrences per Location:")
lowest_sales_details_by_location.show(truncate=False)


### Task 7: Find the most expensive and least expensive item for each location where sales occurred

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import max, min, col, rank

# Relies on the cleaned `df` from the data-cleaning cell.
# For every (city, product) pair, record the largest and the smallest single
# "Sales" value observed.
expensiveCheapPerLocation = df.groupBy("City", "Product Name").agg(
    max("Sales").alias("Most Expensive Sale"),
    min("Sales").alias("Least Expensive Sale")
)

# One window per question: highest sale first, and lowest sale first.
windowSpecMost = Window.partitionBy("City").orderBy(col("Most Expensive Sale").desc())
windowSpecLeast = Window.partitionBy("City").orderBy(col("Least Expensive Sale"))

# Keep the rank-1 product(s) per city for the most expensive sale. rank() gives
# tied products the same rank, so ties all appear in the result.
mostExpensivePerLocation = (
    expensiveCheapPerLocation
    .withColumn("rank", rank().over(windowSpecMost))
    .filter(col("rank") == 1)
    .select("City", "Product Name", "Most Expensive Sale")
)

# Same again for the least expensive sale.
leastExpensivePerLocation = (
    expensiveCheapPerLocation
    .withColumn("rank", rank().over(windowSpecLeast))
    .filter(col("rank") == 1)
    .select("City", "Product Name", "Least Expensive Sale")
)

# truncate=False keeps full product names readable, consistent with the other
# task cells; the default show() cuts string cells off at 20 characters.
print("Most Expensive Items per Location:")
mostExpensivePerLocation.show(truncate=False)

print("Least Expensive Items per Location:")
leastExpensivePerLocation.show(truncate=False)


### Task 8: Calculate the average cost of an item at each location within your dataset

In [None]:
from pyspark.sql.functions import avg, col, round

# Mean "Sales" value per city, rounded to 2 decimal places and listed
# alphabetically by city.
average_cost_per_location = (
    df.groupBy("City")
    .agg(round(avg("Sales"), 2).alias("Average Sales"))
    .sort("City")
)

print("Average Cost of an Item at Each Location:")
average_cost_per_location.show(truncate=False)


### Task 9: Based on your individual dataset, create a set of variables which can be used as broadcast variables.

### Task 10: Complete one other query to analyse the data, based on your individual dataset.

### Machine Learning

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, lit, col
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import to_date, quarter, year


# First modelling attempt: fit a linear regression on the aggregated 2013
# Q1-Q3 sales total and score it against the aggregated 2013 Q4 total.

# Apply the dd/MM/yyyy date conversion to "Order Date", derive "Year" and
# "Quarter" from it, and cast "Quantity" to integer and "Discount" to double.
# These columns are written back onto the shared `df`.
df = df.withColumn("Order Date", to_date("Order Date", "dd/MM/yyyy"))
df = df.withColumn("Year", year("Order Date"))
df = df.withColumn("Quarter", quarter("Order Date"))
df = df.withColumn("Quantity", col("Quantity").cast(IntegerType()))
df = df.withColumn("Discount", col("Discount").cast("double"))

# Filter data for the year 2013 only
# NOTE(review): assumes the dataset contains 2013 orders — confirm; other cells
# in this notebook query 2014.
df_2013 = df.filter(col("Year") == 2013)

# Split data into training (Q1 to Q3) and testing (Q4) sets
train_data = df_2013.filter(df_2013["Quarter"].isin([1, 2, 3]))
test_data = df_2013.filter(df_2013["Quarter"] == 4)

# Collapse each split to a single row holding its summed "Sales"
# (agg() without a groupBy aggregates over the whole frame).
total_sales_q1_q3 = train_data.agg(spark_sum("Sales").alias("Total Sales Q1-Q3"))
total_sales_q4 = test_data.agg(spark_sum("Sales").alias("Actual Total Sales Q4"))

# Build the "features" column required by the regression stage.
# NOTE(review): inputCols is empty, so the assembled vector carries no
# predictors; together with the single-row training frame the model has
# nothing but the label to learn from — confirm this is intended.
assembler = VectorAssembler(inputCols=[], outputCol="features")
train_data_features = assembler.transform(total_sales_q1_q3).select("features", "Total Sales Q1-Q3")
test_data_features = assembler.transform(total_sales_q4).select("features", "Actual Total Sales Q4")

# Define the linear regression model; the label is the Q1-Q3 total.
lr = LinearRegression(featuresCol="features", labelCol="Total Sales Q1-Q3")

# Wrap the regression in a one-stage pipeline
pipeline = Pipeline(stages=[lr])

# Fit the model on aggregated training data
model = pipeline.fit(train_data_features)

# Predict on the aggregated test data
predictions = model.transform(test_data_features)

# Show predicted vs actual
predictions.select("Actual Total Sales Q4", "prediction").show()

# Evaluate the model: RMSE of "prediction" against "Actual Total Sales Q4".
# NOTE(review): the test frame is one aggregated row, so this RMSE is
# presumably just the absolute error of a single prediction — verify.
evaluator = RegressionEvaluator(labelCol="Actual Total Sales Q4", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on total sales prediction: {rmse}")


In [110]:
from pyspark.sql.functions import quarter, to_timestamp, avg, col 
# Convert "Order Date" to a timestamp and recompute the calendar quarter from it.
df = (
    df.withColumn("Order Date", to_timestamp("Order Date", "dd/MM/yyyy"))
    .withColumn("Quarter", quarter("Order Date"))
)

# One row per (Year, Quarter) with summed sales, quantity and profit.
# The "Year" column is not created in this cell; it must already exist on `df`.
quarterly_data = (
    df.groupBy("Year", "Quarter")
    .agg(
        sum("Sales").alias("TotalSales"),
        sum("Quantity").alias("TotalQuantity"),
        sum("Profit").alias("TotalProfit"),
    )
)

# Train on every Q1-Q3 row; hold out Q4 of 2014 as the single test row set.
train_data = quarterly_data.where(col("Quarter") < 4)
test_data = quarterly_data.where((col("Quarter") == 4) & (col("Year") == 2014))

# Pack the two predictors into the "features" vector column.
assembler = VectorAssembler(inputCols=["TotalQuantity", "TotalProfit"], outputCol="features")
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

# Fit a linear regression of TotalSales on the assembled features.
lr = LinearRegression(featuresCol="features", labelCol="TotalSales")
lr_model = lr.fit(train_data)

# Score the held-out quarter.
predictions = lr_model.transform(test_data)

# Report RMSE between the actual and predicted TotalSales.
evaluator = RegressionEvaluator(
    labelCol="TotalSales",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Actual vs predicted, side by side.
predictions.select("Year", "Quarter", "TotalSales", "prediction").show()

Root Mean Squared Error (RMSE) on test data = 31780.4
+----+-------+-----------------+------------------+
|Year|Quarter|       TotalSales|        prediction|
+----+-------+-----------------+------------------+
|2014|      4|270048.4300000002|238268.05966878505|
+----+-------+-----------------+------------------+

