# Process sales history with PySpark  

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count, year, month, round, regexp_replace

In [0]:
# Start Spark session
spark = SparkSession.builder \
    .appName("Sales History Processing") \
    .getOrCreate()

- Load the cleaned dataset

In [0]:
df = spark.read.csv("dbfs:/FileStore/Retail_Store_Chain_Analysis/2_cleaned_data.csv", header=True, inferSchema=True, multiLine=True, escape='"')


In [0]:
# Checking missing values for all columns
null_count = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
display(null_count)

Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Ship Date: date (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [0]:
# Round Sales column to 2 decimal places
df = df.withColumn("Sales", round(col("Sales"), 3))
df.select('Sales').show(truncate=False)


+-------+
|Sales  |
+-------+
|16.448 |
|3.54   |
|272.736|
|11.784 |
|19.536 |
|500.641|
|391.98 |
|19.44  |
|500.641|
|6.54   |
|500.641|
|12.78  |
|5.48   |
|31.12  |
|10.43  |
|76.728 |
|9.344  |
|31.2   |
|51.94  |
|2.89   |
+-------+
only showing top 20 rows



In [0]:
display(df.summary())


summary,Row ID,Order ID,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
count,9800.0,9800,9800,9800,9800,9800,9800,9800,9800,9800.0,9800,9800,9800,9800,9800,9800.0
mean,4900.5,,,,,,,,,55273.3224026969,,,,,,140.81584877550767
stddev,2829.160652914572,,,,,,,,,32041.223412812866,,,,,,169.5517360055789
min,1.0,CA-2015-100006,First Class,AA-10315,Aaron Bergman,Consumer,United States,Aberdeen,Alabama,10009.0,Central,FUR-BO-10000112,Furniture,Accessories,"""While you Were Out"" Message Book, One Form per Page",0.444
25%,2449.0,,,,,,,,,23223.0,,,,,,17.24
50%,4899.0,,,,,,,,,58103.0,,,,,,54.384
75%,7349.0,,,,,,,,,90008.0,,,,,,210.564
max,9800.0,US-2018-169551,Standard Class,ZD-21925,Zuschuss Donatelli,Home Office,United States,Yuma,Wyoming,,West,TEC-PH-10004977,Technology,Tables,netTALK DUO VoIP Telephone Service,500.641


In [0]:
df.isEmpty()

False

In [0]:
df.count()

9800

In [0]:
df.select('Row ID').distinct().count()

9800

In [0]:
df.select('Order ID').distinct().count()

4922

In [0]:
df.select('Customer ID').distinct().count()

793

In [0]:
df.select('Order Date').distinct().show()

+----------+
|Order Date|
+----------+
|2015-01-20|
|2015-01-23|
|2015-01-11|
|2015-01-10|
|2015-01-26|
|2015-01-15|
|2015-01-14|
|2015-01-30|
|2015-01-03|
|2015-01-06|
|2015-01-05|
|2015-01-04|
|2015-01-16|
|2015-01-19|
|2015-01-13|
|2015-01-28|
|2015-01-18|
|2015-01-09|
|2015-01-27|
|2015-01-07|
+----------+
only showing top 20 rows



In [0]:
# Extract Year and Month
df = df.withColumn('Order Year', year('Order Date'))
df = df.withColumn('Order Month', month('Order Date'))

In [0]:
df.select('Order Year').distinct().show()

+----------+
|Order Year|
+----------+
|      2018|
|      2015|
|      2016|
|      2017|
+----------+



In [0]:
# Aggregation: Sales Summary by Year
sales_by_year = df.groupBy('Order Year').agg(
    sum("Sales").alias("Total Sales"),
    avg("Sales").alias("Average Sales"),
    count("Order ID").alias("Total Orders")
)
sales_by_year.show()

+----------------+------------------+------------------+------------+
|year(Order Date)|       Total Sales|     Average Sales|Total Orders|
+----------------+------------------+------------------+------------+
|            2018| 445421.1410000021|136.71612676488706|        3258|
|            2015|280887.89200000087| 143.8238054275478|        1953|
|            2016| 297655.9430000007|144.84474111922177|        2055|
|            2017| 356030.3420000017|140.50131886345764|        2534|
+----------------+------------------+------------------+------------+



In [0]:
# Aggregate sales by Year and Month
seasonal_sales = df.groupBy('Order Year', 'Order Month').agg(
    sum('Sales').alias('Total Sales'),
    avg('Sales').alias('Average Sales'),
    count('Order ID').alias('Total Orders')
)
seasonal_sales.show()


+----------------+-----------------+------------------+------------------+------------+
|year(Order Date)|month(Order Date)|       Total Sales|     Average Sales|Total Orders|
+----------------+-----------------+------------------+------------------+------------+
|            2015|                2|3764.3130000000006| 81.83289130434784|          46|
|            2017|                3|26654.688999999988|165.55707453416142|         161|
|            2017|                8|21750.265000000003|124.28722857142859|         175|
|            2017|               10| 25285.31299999998|131.69433854166655|         192|
|            2018|               10| 39056.90300000003| 132.8466088435375|         294|
|            2015|               12| 40913.90099999999|149.32080656934303|         274|
|            2016|                7| 21369.58799999999| 157.1293235294117|         136|
|            2016|               11| 49471.73300000009|154.11754828660463|         321|
|            2018|              

In [0]:
df.columns

['Row ID',
 'Order ID',
 'Order Date',
 'Ship Date',
 'Ship Mode',
 'Customer ID',
 'Customer Name',
 'Segment',
 'Country',
 'City',
 'State',
 'Postal Code',
 'Region',
 'Product ID',
 'Category',
 'Sub-Category',
 'Product Name',
 'Sales',
 'Order Year',
 'Order Month']

In [0]:
df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Ship Date: date (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Order Year: integer (nullable = true)
 |-- Order Month: integer (nullable = true)



In [0]:
df = df.withColumn("Postal Code", regexp_replace("Postal Code", "\\.0$", ""))

In [0]:
#Save the DataFrame as a CSV file
df.write.option("header", "true").csv("dbfs:/FileStore/Retail_Store_Chain_Analysis/3_processed_data.csv")

In [0]:
# Save the processed data to a Parquet file for future analysis
df.write.partitionBy("Region", "Category", "Order Year").parquet("dbfs:/FileStore/Retail_Store_Chain_Analysis/3_processed_data.parquet", mode="overwrite")

###  Conclusion

The sales history data was processed using PySpark. The following steps were performed:

* Loaded the cleaned dataset.

* Checked for missing values and rounded the "Sales" column.

* Extracted year and month from the "Order Date" column.

* Aggregated sales by year and by year-month.

* Cleaned the "Postal Code" column.

The processed dataset is saved as both a CSV and a Parquet file for future analysis.