PySpark Assignment – Product Sales Analysis

(Intermediate)
Part 1: Environment Setup
1. Install Spark + Java in Google Colab.
2. Initialize Spark with app name "ProductSalesAnalysis" .

In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark=SparkSession.builder.appName('ProductSalesAnalysis').getOrCreate()

Part 2: Load Sales Data from CSV

Create and load the following CSV as sales.csv :

OrderID,Product,Category,Quantity,UnitPrice,Region

1001,Mobile,Electronics,2,15000,North

1002,Laptop,Electronics,1,55000,South

1003,T-Shirt,Apparel,3,500,East

1004,Jeans,Apparel,2,1200,North

1005,TV,Electronics,1,40000,West

1006,Shoes,Footwear,4,2000,South

1007,Watch,Accessories,2,3000,East

1008,Headphones,Electronics,3,2500,North

Task:
Read the file into a PySpark DataFrame with header and inferred schema.
Print schema and show top 5 rows.

In [11]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Recreate the correct CSV content
csv_data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North"""

# Write it to the correct path
with open("/content/drive/MyDrive/Colab Notebooks/sales_data.csv", "w") as f:
    f.write(csv_data)

# Reload the correct file
df = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/sales_data.csv", header=True, inferSchema=True)
df.printSchema()
df.show()


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+----------+-----------+--------+---------+------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|
+-------+----------+-----------+--------+---------+------+
|   1001|    Mobile|Electronics|       2|    15000| North|
|   1002|    Laptop|Electronics|       1|    55000| South|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|
|   1004|     Jeans|    Apparel|       2|     1200| North|
|   1005|        TV|Electronics|       1|    40000|  West|
|   1006|     Shoes|   Footwear|       4|     2000| South|
|   1007|     Watch|Accessories|       2|     3000|  East|
|   1008|Headphones|Electronics|       3|     2500| North|
+-------+----------+-----------+--------+---------+------+



Part 3: Business Questions

1. Add a new column TotalPrice = Quantity × UnitPrice



In [17]:
from pyspark.sql.functions import col

df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df.show()



+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



2. Total revenue generated across all regions.


In [18]:
from pyspark.sql.functions import sum

total_revenue = df.agg(sum("TotalPrice").alias("TotalRevenue"))
total_revenue.show()


+------------+
|TotalRevenue|
+------------+
|      150400|
+------------+




3. Category-wise revenue sorted in descending order.



In [19]:
category_revenue = df.groupBy("Category").agg(sum("TotalPrice").alias("CategoryRevenue"))
sorted_category_revenue = category_revenue.orderBy(col("CategoryRevenue").desc())
sorted_category_revenue.show()


+-----------+---------------+
|   Category|CategoryRevenue|
+-----------+---------------+
|Electronics|         132500|
|   Footwear|           8000|
|Accessories|           6000|
|    Apparel|           3900|
+-----------+---------------+



4. Region with the highest number of orders



In [20]:
region_orders = df.groupBy("Region").count().orderBy(col("count").desc())
region_orders.show(1)


+------+-----+
|Region|count|
+------+-----+
| North|    3|
+------+-----+
only showing top 1 row



5. Average Unit Price per Category


In [21]:
from pyspark.sql.functions import avg

avg_price_category = df.groupBy("Category").agg(avg("UnitPrice").alias("AvgUnitPrice"))
avg_price_category.show()


+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|    Apparel|       850.0|
|Electronics|     28125.0|
|   Footwear|      2000.0|
|Accessories|      3000.0|
+-----------+------------+




6. All orders where TotalPrice is more than
30,000

In [22]:
high_value_orders = df.filter(col("TotalPrice") > 30000)
high_value_orders.show()


+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



Part 4: Data Transformations
1. Create a new column HighValueOrder which is "Yes" if TotalPrice > 20,000,
else "No" .


In [23]:
from pyspark.sql.functions import when

df = df.withColumn(
    "HighValueOrder",
    when(col("TotalPrice") > 20000, "Yes").otherwise("No")
)
df.show()


+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+-------

2. Filter and display all high-value orders in the North region.


In [None]:
north_high_value = df.filter((col("HighValueOrder") == "Yes") & (col("Region") == "North"))
north_high_value.show()


3. Count how many high-value orders exist per region.

In [None]:
high_value_count = df.filter(col("HighValueOrder") == "Yes") \
                     .groupBy("Region") \
                     .count() \
                     .withColumnRenamed("count", "HighValueOrderCount")
high_value_count.show()


Part 5: Save Results
Save the transformed DataFrame as a CSV file named high_value_orders.csv with
headers.

In [24]:

high_value_df = df.filter(col("HighValueOrder") == "Yes")
high_value_df.write.csv(
    "/content/drive/MyDrive/Colab Notebooks/high_value_orders.csv",
    header=True,
    mode="overwrite")
