In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [32]:
# Obtain the Spark session for this notebook: getOrCreate() returns an already
# active session if there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName("PySparkPrac")
    .getOrCreate()
)

In [33]:
# Load the orders CSV. The first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv("ecommerce_orders.csv", header=True, inferSchema=True)

In [34]:
# Peek at the first two rows to sanity-check the parsed columns.
df.show(n=2)

+--------+-----------+----------+--------+--------------+-------------------+------+--------+------------+---------+
|order_id|customer_id|product_id|category|payment_method|         order_time| price|quantity|total_amount|   status|
+--------+-----------+----------+--------+--------------+-------------------+------+--------+------------+---------+
| O898230|       C187|       P40|   Books|    Debit Card|2025-08-23 00:06:00|297.45|       1|      297.45|Completed|
| O483710|       C178|       P44|  Sports|           UPI|2025-08-24 03:01:00|478.34|       3|     1435.02|Completed|
+--------+-----------+----------+--------+--------------+-------------------+------+--------+------------+---------+
only showing top 2 rows



In [35]:
# Step 1: make sure order_time is a proper timestamp column.
df = df.withColumn("order_time", to_timestamp(col("order_time")))
# Step 2: derive a "yyyy-MM-dd" string column used as the per-day grouping key.
df = df.withColumn("date", date_format(col("order_time"), "yyyy-MM-dd"))
df.show(n=5)

+--------+-----------+----------+-----------+--------------+-------------------+------+--------+------------+---------+----------+
|order_id|customer_id|product_id|   category|payment_method|         order_time| price|quantity|total_amount|   status|      date|
+--------+-----------+----------+-----------+--------------+-------------------+------+--------+------------+---------+----------+
| O898230|       C187|       P40|      Books|    Debit Card|2025-08-23 00:06:00|297.45|       1|      297.45|Completed|2025-08-23|
| O483710|       C178|       P44|     Sports|           UPI|2025-08-24 03:01:00|478.34|       3|     1435.02|Completed|2025-08-24|
| O556243|       C139|       P46|Electronics|           UPI|2025-08-21 05:18:00|187.59|       4|      750.36|Completed|2025-08-21|
| O745017|        C42|       P30|       Home|   Credit Card|2025-08-23 01:32:00|478.53|       4|     1914.12|Completed|2025-08-23|
| O986474|        C79|       P23|       NULL|    Debit Card|2025-08-22 09:39:00|224

In [36]:
# Replace missing category / payment_method values with one consistent
# "Unknown" label so null rows form their own bucket in the grouped reports
# below instead of appearing as NULL. Both columns use the same spelling so
# the placeholder can be matched or filtered uniformly.
df = df.fillna({"category": "Unknown", "payment_method": "Unknown"})

In [37]:
# Revenue that was actually realised: the order's total_amount when its status
# is "Completed", and 0 for every other status.
df = df.withColumn(
    "actual_revenue",
    when(col("status") == "Completed", col("total_amount")).otherwise(lit(0)),
)

In [38]:
# Per-day order metrics.
# Cancelled orders are flagged 1 (else 0); the sum of that flag divided by the
# day's order count, times 100, is the cancellation percentage.
cancelled_flag = when(col("status") == "Cancelled", 1).otherwise(0)
cancelled_pct = sum(cancelled_flag) / count("order_id") * 100

daily_metrics = (
    df.groupBy("date")
    .agg(
        count("order_id").alias("total_orders"),
        sum("actual_revenue").alias("total_revenue"),
        # Averages over every order of the day; rows whose actual_revenue is 0 are included.
        avg("actual_revenue").alias("avg_revenue"),
        countDistinct("customer_id").alias("Unique_customers_per_day"),
        cancelled_pct.alias("Percentage_cancelled_order"),
    )
)
daily_metrics.show()

+----------+------------+------------------+-----------------+------------------------+--------------------------+
|      date|total_orders|     total_revenue|      avg_revenue|Unique_customers_per_day|Percentage_cancelled_order|
+----------+------------+------------------+-----------------+------------------------+--------------------------+
|2025-08-20|         300|211577.96000000005|705.2598666666669|                     163|        10.333333333333334|
|2025-08-22|         300|         198231.27|         660.7709|                     158|         9.666666666666666|
|2025-08-21|         300|         198488.13|         661.6271|                     164|         9.333333333333334|
|2025-08-23|         300|226806.84000000003|756.0228000000001|                     167|         7.333333333333333|
|2025-08-24|         300|221393.38999999993|737.9779666666665|                     154|        12.666666666666668|
+----------+------------+------------------+-----------------+------------------

In [39]:
# Total units sold per product, best sellers first.
top_products = (
    df.groupBy("product_id")
    .agg(sum(col("quantity")).alias("most_sold_products"))
    .orderBy(desc("most_sold_products"))
)
top_products.show()

+----------+------------------+
|product_id|most_sold_products|
+----------+------------------+
|       P12|               133|
|       P25|               128|
|       P36|               128|
|       P30|               125|
|       P22|               121|
|        P7|               121|
|        P3|               121|
|       P45|               120|
|       P23|               119|
|       P40|               116|
|       P19|               110|
|       P31|               109|
|       P43|               107|
|       P11|               102|
|       P13|               101|
|       P29|                99|
|       P27|                97|
|       P33|                97|
|        P2|                97|
|       P47|                97|
+----------+------------------+
only showing top 20 rows



In [43]:
# Sum of actual_revenue for every (date, product_id) pair; this is the input
# to the per-day product ranking.
highest_rev_products = (
    df.groupBy("date", "product_id")
    .agg(sum(col("actual_revenue")).alias("rev"))
)
highest_rev_products.show()

+----------+----------+------------------+
|      date|product_id|               rev|
+----------+----------+------------------+
|2025-08-22|       P26|           1328.03|
|2025-08-22|       P38| 7517.869999999999|
|2025-08-20|       P40|14498.219999999998|
|2025-08-23|       P38|            532.12|
|2025-08-23|       P15|           2352.79|
|2025-08-20|       P23|           4673.17|
|2025-08-23|       P18|           2541.54|
|2025-08-23|       P46|           5322.26|
|2025-08-24|       P47|4095.3999999999996|
|2025-08-24|        P9|            543.26|
|2025-08-20|        P7| 8725.320000000002|
|2025-08-22|       P27|            3388.5|
|2025-08-21|       P22|           8301.67|
|2025-08-22|       P12|           5341.13|
|2025-08-24|       P32| 8445.380000000001|
|2025-08-21|        P5|           6317.35|
|2025-08-24|       P40|           3313.92|
|2025-08-21|       P36|           2514.89|
|2025-08-22|       P18|            1887.2|
|2025-08-22|       P24|2555.5699999999997|
+----------

In [45]:
# Window over each date, ordering that day's products by revenue, highest first.
w = Window.partitionBy("date").orderBy(desc("rev"))

# Keep only rank 1 per date. dense_rank gives tied products the same rank,
# so a tie for the top spot keeps every tied product.
ranked_products = (
    highest_rev_products
    .withColumn("rn", dense_rank().over(w))
    .where(col("rn") == 1)
)
ranked_products.show()

+----------+----------+------------------+---+
|      date|product_id|               rev| rn|
+----------+----------+------------------+---+
|2025-08-20|       P40|14498.219999999998|  1|
|2025-08-21|       P13|          10724.41|  1|
|2025-08-22|       P47|           8487.88|  1|
|2025-08-23|       P30|           9419.18|  1|
|2025-08-24|       P30|10765.349999999999|  1|
+----------+----------+------------------+---+



In [46]:
# Sum of actual_revenue per product category.
category_dist = (
    df.groupBy("category")
    .agg(sum(col("actual_revenue")).alias("total_sales"))
)
category_dist.show()

+-----------+------------------+
|   category|       total_sales|
+-----------+------------------+
|       Home|205324.75999999995|
|     Sports|         197590.01|
|Electronics|236803.71000000002|
|   Clothing|208260.71999999997|
|      Books|191035.45000000007|
|     Unkown|          17482.94|
+-----------+------------------+



In [56]:
# Stage 1: collapse to one row per (customer, order) holding that order's
# summed actual_revenue.
order_totals = (
    df.groupBy("customer_id", "order_id")
    .agg(sum(col("actual_revenue")).alias("order_total"))
)

# Stage 2: roll the per-order totals up to customer level.
cust_insights = (
    order_totals.groupBy("customer_id")
    .agg(
        avg(col("order_total")).alias("avg_order_value"),
        count(col("order_id")).alias("total_orders"),
        sum(col("order_total")).alias("total_paid"),
    )
)
# customer_id is a string, so the ordering is lexicographic (C1, C10, C100, ...).
cust_insights.orderBy(col("customer_id").asc()).show()

+-----------+------------------+------------+------------------+
|customer_id|   avg_order_value|total_orders|        total_paid|
+-----------+------------------+------------+------------------+
|         C1| 645.5866666666666|           9|           5810.28|
|        C10| 638.9875000000001|           4|2555.9500000000003|
|       C100|1192.1828571428573|           7|           8345.28|
|       C101|         746.19125|           8|           5969.53|
|       C102| 733.6211111111111|           9| 6602.589999999999|
|       C103| 593.5022222222221|           9|5341.5199999999995|
|       C104| 433.4728571428572|           7|3034.3100000000004|
|       C105| 703.6812500000001|           8| 5629.450000000001|
|       C106|           637.462|          10|           6374.62|
|       C107|            894.37|           7|           6260.59|
|       C108|1047.5745454545456|          11|11523.320000000002|
|       C109| 766.3299999999999|           5|3831.6499999999996|
|        C11|217.48333333

In [None]:
# Customers ranked by the total they paid, biggest spenders first.
top_cust = cust_insights.orderBy(desc("total_paid"))
top_cust.show()

+-----------+------------------+------------+------------------+
|customer_id|   avg_order_value|total_orders|        total_paid|
+-----------+------------------+------------+------------------+
|        C63|1067.2341666666666|          12|          12806.81|
|        C33|1147.8627272727272|          11|          12626.49|
|        C54| 831.3026666666665|          15|12469.539999999997|
|       C108|1047.5745454545456|          11|11523.320000000002|
|        C94|  658.665882352941|          17|11197.319999999998|
|        C45|        1374.10125|           8|          10992.81|
|        C56|          1032.618|          10|          10326.18|
|        C43|1069.0244444444445|           9| 9621.220000000001|
|       C189| 857.8127272727273|          11|           9435.94|
|       C160| 943.2560000000001|          10| 9432.560000000001|
|        C62| 845.1899999999998|          11| 9297.089999999998|
|       C147|           1012.51|           9|           9112.59|
|       C175|           9

In [58]:
# Order count and summed actual_revenue for each payment method.
payment_method_analysis = (
    df.groupBy("payment_method")
    .agg(
        count(col("order_id")).alias("orders_per_pay"),
        sum(col("actual_revenue")).alias("rev_split"),
    )
)

payment_method_analysis.show()

+--------------+--------------+------------------+
|payment_method|orders_per_pay|         rev_split|
+--------------+--------------+------------------+
|   Credit Card|           300|208856.76999999981|
|       Unknown|            29|          15916.08|
|        PayPal|           292|207488.95999999985|
|          Cash|           281|199835.12000000002|
|    Debit Card|           293|198436.87000000017|
|           UPI|           305|225963.79000000012|
+--------------+--------------+------------------+

