# Vehicle Sales Data Analysis
This notebook outlines the code and results of the ten data analysis tasks for the BS3220 Parallel Programming assignment. 


### Import libraries and load data

In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StringType


# Initialize Spark session
spark = SparkSession.builder \
    .appName("Discount Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "120s") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

spark.sparkContext.setLogLevel("DEBUG")


df = spark.read.csv("Superstore.csv", header=True, inferSchema=True)


### Data cleaning

In [3]:
from pyspark.sql.functions import month, year, count, to_date, round

df = df.withColumn("Sales", round("Sales", 2))
df = df.withColumn("Order Date", to_date("Order Date", "dd/MM/yyyy"))

df.select("Order Date").show(5, truncate=False)
df = df.filter(~col("Product Name").contains('"'))

filtered_count = df.count()
print(f"Filtered row count: {filtered_count}")



+----------+
|Order Date|
+----------+
|2013-11-09|
|2013-11-09|
|2013-06-13|
|2012-10-11|
|2012-10-11|
+----------+
only showing top 5 rows

Filtered row count: 9183


### Task 1: Find the total sales for each item, both the number of units and the total price/cost

In [4]:
total_sales_per_item = df.groupBy("Product Name").agg(
    sum("Quantity").alias("Total Units"),
    format_number(sum("Sales"), 2).alias("Total Sales")
)
total_sales_per_item.show(truncate=False)

+---------------------------------------------------------------------------------+-----------+-----------+
|Product Name                                                                     |Total Units|Total Sales|
+---------------------------------------------------------------------------------+-----------+-----------+
|GBC ProClick 150 Presentation Binding System                                     |22.0       |3,222.99   |
|Wasp CCD Handheld Bar Code Reader                                                |3.0        |336.51     |
|Wilson Jones 14 Line Acrylic Coated Pressboard Data Binders                      |23.0       |100.39     |
|Aastra 6757i CT Wireless VoIP phone                                              |16.0       |2,929.98   |
|Global Armless Task Chair, Royal Blue                                            |28.0       |1,396.44   |
|Sortfiler Multipurpose Personal File Organizer, Black                            |24.0       |483.42     |
|Bretford �Just In Time� Hei

### Task 2: Summarise the total sales of all items at each location

In [6]:
total_sales_per_location = df.filter(df["Sales"].isNotNull()) \
    .groupBy("City") \
    .agg(sum("Sales").alias("Total Sales")) \
    .orderBy(col("Total Sales").desc()) 

# Format the column after sorting
total_sales_per_location = total_sales_per_location.withColumn(
    "Total Sales", format_number("Total Sales", 2)
)

total_sales_per_location.show(truncate=False)


+-------------+-----------+
|City         |Total Sales|
+-------------+-----------+
|New York City|241,000.45 |
|Los Angeles  |168,539.72 |
|Seattle      |113,898.95 |
|San Francisco|109,699.89 |
|Philadelphia |104,549.69 |
|Houston      |63,716.90  |
|Chicago      |47,004.49  |
|San Diego    |46,052.02  |
|Jacksonville |44,289.18  |
|Detroit      |42,276.50  |
|Springfield  |40,729.08  |
|Columbus     |36,474.45  |
|Newark       |27,832.00  |
|Lafayette    |25,001.83  |
|Columbia     |23,649.72  |
|Burlington   |21,623.33  |
|Jackson      |21,315.85  |
|San Antonio  |21,164.86  |
|Dallas       |19,529.04  |
|Richmond     |17,849.57  |
+-------------+-----------+
only showing top 20 rows



### Task 3: List all products and their combined sales, grouped by their location of sale.

In [None]:
sales_by_product_and_location = df.groupBy("City", "Product Name").agg(
    sum("Sales").alias("Sum Sales")
)

sales_by_product_and_location = sales_by_product_and_location.withColumn(
    "Combined Sales", format_number("Sum Sales", 2)
).drop("Sum Sales")

# Ordering the results by City
sales_by_product_and_location = sales_by_product_and_location.orderBy("City")

sales_by_product_and_location.show(truncate=False)

+--------+---------------------------------------------------------------+--------------+
|City    |Product Name                                                   |Combined Sales|
+--------+---------------------------------------------------------------+--------------+
|Aberdeen|Acme Titanium Bonded Scissors                                  |25.50         |
|Abilene |Hoover Commercial Lightweight Upright Vacuum                   |1.39          |
|Akron   |Newell 315                                                     |14.35         |
|Akron   |Cisco IP Phone 7961G-GE VoIP phone                             |259.90        |
|Akron   |GBC Recycled VeloBinder Covers                                 |25.56         |
|Akron   |Deflect-o Glass Clear Studded Chair Mats                       |149.23        |
|Akron   |Belkin F8E887 USB Wired Ergonomic Keyboard                     |71.98         |
|Akron   |Southworth 25% Cotton Linen-Finish Paper & Envelopes           |21.74         |
|Akron   |

### Task 4: Show the sales numbers for the item which sold the most units at each location

In [None]:
windowSpec = Window.partitionBy("City").orderBy(col("Total Units").desc())

best_selling_items_per_location = df.groupBy("City", "Product Name").agg(
    sum("Quantity").alias("Total Units"),
    sum("Sales").alias("Total Sales")
).withColumn("rank", rank().over(windowSpec)) 

best_selling_items_per_location = best_selling_items_per_location.filter(col("rank") == 1).drop("rank")

best_selling_items_per_location.show()

+------------+--------------------+-----------+-----------+
|        City|        Product Name|Total Units|Total Sales|
+------------+--------------------+-----------+-----------+
|    Aberdeen|Acme Titanium Bon...|        3.0|       25.5|
|     Abilene|Hoover Commercial...|        2.0|       1.39|
|       Akron|OIC Colored Binde...|        6.0|      17.18|
| Albuquerque|OtterBox Commuter...|        8.0|     140.74|
| Albuquerque|SimpliFile Person...|        8.0|       90.8|
|  Alexandria|DAX Wood Document...|       14.0|     192.22|
|       Allen|           Xerox 198|        4.0|      15.94|
|       Allen|GBC Imprintable C...|        4.0|       8.78|
|   Allentown|SAFCO Commercial ...|        6.0|     663.07|
|     Altoona|Eldon Spacemaker ...|        6.0|      16.03|
|    Amarillo|HON 5400 Series T...|        5.0|    2453.43|
|    Amarillo|           Xerox 212|        5.0|      25.92|
|     Anaheim|GBC Recycled Velo...|        9.0|     122.69|
|     Anaheim|Carina Mini Syste...|     

### Task 5: List all items that were sold within two months of your choosing

In [None]:
df = df.withColumn("Sales", round("Sales", 2))
df = df.withColumn("Order Date", to_date("Order Date", "dd/MM/yyyy"))

jan_2014_sales = df.filter((month("Order Date") == 1) & (year("Order Date") == 2014))
jan_2014_total_sales = jan_2014_sales.groupBy("Product Name").agg(sum("Sales").alias("Total Sales"))
total_jan_2014 = jan_2014_sales.agg(sum("Sales").alias("Total Sales for January 2014")).collect()[0][0]

dec_2014_sales = df.filter((month("Order Date") == 12) & (year("Order Date") == 2014))
dec_2014_total_sales = dec_2014_sales.groupBy("Product Name").agg(sum("Sales").alias("Total Sales"))
total_dec_2014 = dec_2014_sales.agg(sum("Sales").alias("Total Sales for December 2014")).collect()[0][0]

print(f"Total Sales for January 2014: {total_jan_2014:.2f}")

print("Sales Details for January 2014:")
jan_2014_total_sales.show(truncate=False)

print(f"Total Sales for December 2014: {total_dec_2014:.2f}")

print("Sales Details for December 2014:")
dec_2014_total_sales.show(truncate=False)


Total Sales for January 2014: 44048.21
Sales Details for January 2014:
+----------------------------------------------------+-----------+
|Product Name                                        |Total Sales|
+----------------------------------------------------+-----------+
|GBC ProClick 150 Presentation Binding System        |2022.27    |
|Avery Durable Binders                               |14.4       |
|Xerox 212                                           |25.92      |
|Flat Face Poster Frame                              |37.68      |
|Bagged Rubber Bands                                 |2.52       |
|GBC Instant Report Kit                              |38.82      |
|Avanti 4.4 Cu. Ft. Refrigerator                     |542.94     |
|Office Star - Contemporary Task Swivel Chair        |310.74     |
|Quartet Omega Colored Chalk, 12/Pack                |9.34       |
|Airmail Envelopes                                   |268.58     |
|Strathmore Photo Mount Cards                        |21.7

### Task 6: Identify the item which has the lowest overall sales, both for the dataset as a whole and for each sales location

In [None]:
lowest_sales_overall = df.groupBy("Product Name").agg(
    count("*").alias("Number of Sales")
).orderBy("Number of Sales").limit(1)

lowest_sales_by_location = df.groupBy("City", "Product Name").agg(
    count("*").alias("Number of Sales")
)

windowSpec = Window.partitionBy("City").orderBy(col("Number of Sales"), col("Product Name"))

ranked_sales_by_location = lowest_sales_by_location.withColumn("rank", rank().over(windowSpec))

lowest_sales_details_by_location = ranked_sales_by_location.filter(col("rank") == 1).select(
    "City", "Product Name", "Number of Sales")

print("Item with the Lowest Number of Sales Occurrences Overall:")
lowest_sales_overall.show(truncate=False)

print("Item with the Lowest Number of Sales Occurrences per Location:")
lowest_sales_details_by_location.show(truncate=False)


Item with the Lowest Number of Sales Occurrences Overall:
+---------------------------------+---------------+
|Product Name                     |Number of Sales|
+---------------------------------+---------------+
|Wasp CCD Handheld Bar Code Reader|1              |
+---------------------------------+---------------+

Item with the Lowest Number of Sales Occurrences per Location:
+-----------------+-------------------------------------------------------+---------------+
|City             |Product Name                                           |Number of Sales|
+-----------------+-------------------------------------------------------+---------------+
|Aberdeen         |Acme Titanium Bonded Scissors                          |1              |
|Abilene          |Hoover Commercial Lightweight Upright Vacuum           |1              |
|Akron            |Acco Expandable Hanging Binders                        |1              |
|Albuquerque      |AT&T TR1909W                                   

### Task 7: Find the most expensive and least expensive item for each location where sales occurred

In [None]:
expensiveCheapPerLocation = df.groupBy("City", "Product Name").agg(
    max("Sales").alias("Most Expensive Sale"), 
    min("Sales").alias("Least Expensive Sale")
)

windowSpecMost = Window.partitionBy("City").orderBy(col("Most Expensive Sale").desc())
windowSpecLeast = Window.partitionBy("City").orderBy(col("Least Expensive Sale"))

mostExpensivePerLocation = expensiveCheapPerLocation.withColumn("rank", rank().over(windowSpecMost))
mostExpensivePerLocation = mostExpensivePerLocation.filter(col("rank") == 1).select(
    "City", "Product Name", "Most Expensive Sale")

leastExpensivePerLocation = expensiveCheapPerLocation.withColumn("rank", rank().over(windowSpecLeast))
leastExpensivePerLocation = leastExpensivePerLocation.filter(col("rank") == 1).select(
    "City", "Product Name", "Least Expensive Sale")

print("Most Expensive Items per Location:")
mostExpensivePerLocation.show(truncate=False)

print("Least Expensive Items per Location:")
leastExpensivePerLocation.show(truncate=False)


Most Expensive Items per Location:
+-----------------+------------------------------------------------------------------+-------------------+
|City             |Product Name                                                      |Most Expensive Sale|
+-----------------+------------------------------------------------------------------+-------------------+
|Aberdeen         |Acme Titanium Bonded Scissors                                     |25.5               |
|Abilene          |Hoover Commercial Lightweight Upright Vacuum                      |1.39               |
|Akron            |Deluxe Rollaway Locking File with Drawer                          |665.41             |
|Albuquerque      |WD My Passport Ultra 2TB Portable External Hard Drive             |595.0              |
|Alexandria       |Martin Yale Chadless Opener Electric Letter Opener                |4164.05            |
|Allen            |Chromcraft Round Conference Tables                                |244.01             |
|A

### Task 8: Calculate the average cost of an item at each location within your dataset

In [None]:
average_cost_per_location = df.groupBy("City").agg(
    round(avg("Sales"), 2).alias("Average Sales")
).orderBy("City") 

print("Average Cost of an Item at Each Location:")
average_cost_per_location.show(truncate=False)


Average Cost of an Item at Each Location:
+-----------------+-------------+
|City             |Average Sales|
+-----------------+-------------+
|Aberdeen         |25.5         |
|Abilene          |1.39         |
|Akron            |135.92       |
|Albuquerque      |158.58       |
|Alexandria       |391.68       |
|Allen            |72.55        |
|Allentown        |121.89       |
|Altoona          |16.03        |
|Amarillo         |416.66       |
|Anaheim          |337.12       |
|Andover          |108.96       |
|Ann Arbor        |177.85       |
|Antioch          |19.44        |
|Apopka           |142.6        |
|Apple Valley     |228.11       |
|Appleton         |835.66       |
|Arlington        |203.4        |
|Arlington Heights|14.11        |
|Arvada           |125.85       |
|Asheville        |210.77       |
+-----------------+-------------+
only showing top 20 rows



### Task 9: Based on your individual dataset, create a set of variables which can be used as broadcast variables.

In [None]:
df = df.withColumn("Discount", col("Discount").cast("double"))

discount_categories = {
    0.0: "No Discount",
    0.1: "Student Discount",
    0.15: "Seasonal Discount",
    0.2: "Promotional Discount",
    0.3: "Blue Light Discount - Basic",
    0.32: "Blue Light Discount - Enhanced",
    0.4: "Frequent Shopper Discount",
    0.45: "Affiliate Discount",
    0.5: "Standard Employee Discount",
    0.6: "Senior Employee Discount",
    0.7: "Management Discount",
    0.8: "Executive Employee Discount"
}

# Broadcast the discount categories dictionary
broadcast_discount_categories = spark.sparkContext.broadcast(discount_categories)

def get_discount_type(discount):
    return broadcast_discount_categories.value.get(discount, "Unknown Discount")

discount_type_udf = udf(get_discount_type, StringType())

df = df.withColumn("Discount Type", discount_type_udf(col("Discount")))

discount_summary = df.groupBy("Discount Type").agg(
    count("Order ID").alias("Number of Orders"),
    round(sum("Sales"), 2).alias("Total Sales"),
    round(avg("Sales"), 2).alias("Average Sales")
).orderBy("Total Sales", ascending=False)

discount_summary.show()

+--------------------+----------------+-----------+-------------+
|       Discount Type|Number of Orders|Total Sales|Average Sales|
+--------------------+----------------+-----------+-------------+
|         No Discount|            4418| 1020973.96|       231.09|
|Promotional Discount|            3367|  742528.69|       220.53|
|Frequent Shopper ...|             194|  109750.51|       565.72|
|Blue Light Discou...|             218|   98445.86|       451.59|
|    Student Discount|              94|    54369.3|        578.4|
|Standard Employee...|              61|   54358.16|       891.12|
| Management Discount|             366|   39507.12|       107.94|
|   Seasonal Discount|              51|   27122.59|       531.82|
|Executive Employe...|             268|   16566.29|        61.81|
|Blue Light Discou...|              27|   14493.45|       536.79|
|  Affiliate Discount|              11|    5484.98|       498.63|
|Senior Employee D...|             108|    4354.96|        40.32|
+---------

### Task 10: Complete one other query to analyse the data, based on your individual dataset.

In [None]:
df = df.withColumn("Order Date", to_date(col("Order Date"), "yyyy-MM-dd"))

order_counts = df.groupBy("Order Date").agg(countDistinct("Order ID").alias("Order Count"))

top_days = order_counts.orderBy(desc("Order Count")).limit(5)

least_days = order_counts.orderBy("Order Count").limit(5)

print("Top 5 Days with the Most Orders:")
top_days.show()

print("Top 5 Days with the Least Orders:")
least_days.show()

Top 5 Days with the Most Orders:
+----------+-----------+
|Order Date|Order Count|
+----------+-----------+
|2013-09-06|         17|
|2014-12-03|         16|
|2014-11-20|         15|
|2014-09-05|         15|
|2014-11-25|         15|
+----------+-----------+

Top 5 Days with the Least Orders:
+----------+-----------+
|Order Date|Order Count|
+----------+-----------+
|2012-07-14|          1|
|2011-07-02|          1|
|2012-05-02|          1|
|2012-11-11|          1|
|2011-08-30|          1|
+----------+-----------+



### Machine Learning

In [None]:
df = df.withColumn("Order Date", to_timestamp("Order Date", "dd/MM/yyyy"))
df = df.withColumn("Year", year("Order Date"))
df = df.withColumn("Quarter", quarter("Order Date"))

quarterly_data = df.groupBy("Year", "Quarter").agg(
    sum("Sales").alias("TotalSales"),
    sum("Quantity").alias("TotalQuantity"),
    sum("Profit").alias("TotalProfit")
)

# Split data into training (Q1-Q3) and testing (Q4)
train_data = quarterly_data.filter(col("Quarter") < 4)
test_data = quarterly_data.filter((col("Quarter") == 4) & (col("Year") == 2014))

assembler = VectorAssembler(
    inputCols=["TotalQuantity", "TotalProfit"], 
    outputCol="features"
)

train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

lr = LinearRegression(featuresCol="features", labelCol="TotalSales")

lr_model = lr.fit(train_data)

predictions = lr_model.transform(test_data)

evaluator = RegressionEvaluator(labelCol="TotalSales", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

predictions.select("Year", "Quarter", "TotalSales", "prediction").show()

Root Mean Squared Error (RMSE) on test data = 31780.4
+----+-------+-----------------+------------------+
|Year|Quarter|       TotalSales|        prediction|
+----+-------+-----------------+------------------+
|2014|      4|270048.4300000002|238268.05966878505|
+----+-------+-----------------+------------------+

