In [0]:
%python
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("fashionRetailSales").getOrCreate()
df=spark.read.csv("dbfs:/FileStore/tables/Fashion_Retail_Sales.csv",header=True,inferSchema=True)

In [0]:
%python
df.show(1000)

+---------------------+--------------+---------------------+-------------+-------------+--------------+
|Customer Reference ID|Item Purchased|Purchase Amount (USD)|Date Purchase|Review Rating|Payment Method|
+---------------------+--------------+---------------------+-------------+-------------+--------------+
|                 4018|       Handbag|               4619.0|   2023-02-05|         null|   Credit Card|
|                 4115|         Tunic|               2456.0|   2023-07-11|          2.0|   Credit Card|
|                 4019|      Tank Top|               2102.0|   2023-03-23|          4.1|          Cash|
|                 4097|      Leggings|               3126.0|   2023-03-15|          3.2|          Cash|
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash|
|                 4080|        Onesie|               2914.0|   2022-12-11|          4.5|   Credit Card|
|                 4055|        Jacket|               2571.0|   2

In [0]:
%python
df.printSchema()

root
 |-- Customer Reference ID: integer (nullable = true)
 |-- Item Purchased: string (nullable = true)
 |-- Purchase Amount (USD): double (nullable = true)
 |-- Date Purchase: date (nullable = true)
 |-- Review Rating: double (nullable = true)
 |-- Payment Method: string (nullable = true)



In [0]:
%python
distinct_customers=df.select("Customer Reference ID").distinct()
No_of_distinct_Items=df.select("Item Purchased").distinct()

In [0]:
%python
distinct_customers.count()

Out[6]: 166

In [0]:
%python
df.count()

Out[7]: 3400

In [0]:
%python
No_of_distinct_Items.count()

Out[8]: 50

In [0]:
%python
df_no_nulls=df.dropna()

In [0]:
%python
df_no_nulls.count()

Out[10]: 2487

In [0]:
%python
distinct_customers=df_no_nulls.select("Customer Reference ID").distinct()
No_of_distinct_Items=df_no_nulls.select("Item Purchased").distinct()

In [0]:
%python
distinct_customers.count()

Out[12]: 166

In [0]:
%python
No_of_distinct_Items.count()

Out[13]: 50

In [0]:
%python
df_no_nulls.select("Item Purchased").distinct().orderBy("Item Purchased").show(50)

+--------------+
|Item Purchased|
+--------------+
|      Backpack|
|          Belt|
|        Blazer|
|        Blouse|
|         Boots|
|        Bowtie|
|      Camisole|
|      Cardigan|
|          Coat|
|         Dress|
| Flannel Shirt|
|    Flip-Flops|
|        Gloves|
|       Handbag|
|           Hat|
|        Hoodie|
|        Jacket|
|         Jeans|
|      Jumpsuit|
|        Kimono|
|      Leggings|
|       Loafers|
|        Onesie|
|      Overalls|
|       Pajamas|
|         Pants|
|    Polo Shirt|
|        Poncho|
|      Raincoat|
|        Romper|
|       Sandals|
|         Scarf|
|        Shorts|
|         Skirt|
|      Slippers|
|      Sneakers|
|         Socks|
|       Sun Hat|
|    Sunglasses|
|       Sweater|
|      Swimsuit|
|       T-shirt|
|      Tank Top|
|           Tie|
|   Trench Coat|
|      Trousers|
|         Tunic|
|      Umbrella|
|          Vest|
|        Wallet|
+--------------+



In [0]:
%python
a=df_no_nulls.groupBy("Customer Reference Id").count().alias("no_of_Purchases")
c=df_no_nulls.groupBy("Item Purchased").count().alias("no_of_items_purchased")

In [0]:
%python
from pyspark.sql.functions import *
b=a.withColumnRenamed("count","no_of_purchases").orderBy("no_of_purchases",ascending=False)
d=c.withColumnRenamed("count","no_of_purchases").orderBy("no_of_purchases",ascending=False)

In [0]:
%python
b.show()

+---------------------+---------------+
|Customer Reference Id|no_of_purchases|
+---------------------+---------------+
|                 4078|             25|
|                 4122|             24|
|                 3973|             24|
|                 3978|             23|
|                 4073|             23|
|                 4075|             23|
|                 4111|             22|
|                 4002|             22|
|                 4068|             22|
|                 4081|             21|
|                 4065|             21|
|                 4034|             20|
|                 4017|             20|
|                 4110|             20|
|                 4008|             20|
|                 4082|             20|
|                 4067|             20|
|                 4084|             20|
|                 4051|             20|
|                 3968|             20|
+---------------------+---------------+
only showing top 20 rows



In [0]:
%python
d.show()

+--------------+---------------+
|Item Purchased|no_of_purchases|
+--------------+---------------+
|        Shorts|             69|
|          Belt|             65|
|         Skirt|             63|
|      Tank Top|             60|
|      Camisole|             60|
|         Pants|             59|
|      Cardigan|             57|
|        Hoodie|             57|
|       Pajamas|             56|
|       T-shirt|             56|
|        Wallet|             56|
|        Kimono|             55|
|         Scarf|             55|
|   Trench Coat|             54|
|    Sunglasses|             54|
|        Poncho|             54|
|        Onesie|             53|
|      Sneakers|             53|
|       Handbag|             53|
|       Loafers|             53|
+--------------+---------------+
only showing top 20 rows



In [0]:
%python
df_no_nulls.select("Customer Reference Id", "Purchase Amount (USD)")\
    .groupBy("Customer Reference Id")\
        .agg(sum("Purchase Amount (USD)").alias("total Amount spent"))\
            .orderBy("total Amount spent",ascending =False).show(10)

+---------------------+------------------+
|Customer Reference Id|total Amount spent|
+---------------------+------------------+
|                 4044|            8745.0|
|                 4075|            7067.0|
|                 4108|            6828.0|
|                 4067|            6528.0|
|                 4040|            6519.0|
|                 3984|            6327.0|
|                 4002|            6320.0|
|                 4099|            6002.0|
|                 3986|            5908.0|
|                 3976|            5845.0|
+---------------------+------------------+
only showing top 10 rows



In [0]:
%python
df_no_nulls.select("Item Purchased", "Purchase Amount (USD)")\
    .groupBy("Item Purchased")\
        .agg(sum("Purchase Amount (USD)").alias("total Amount spent"))\
            .orderBy("total Amount spent",ascending =False).show(10)

+--------------+------------------+
|Item Purchased|total Amount spent|
+--------------+------------------+
|         Tunic|           16980.0|
|         Jeans|           12475.0|
|        Shorts|           12246.0|
|        Gloves|           11793.0|
|         Boots|           11174.0|
|    Flip-Flops|           10776.0|
|      Slippers|           10702.0|
|       Sweater|           10160.0|
|          Belt|           10018.0|
|   Trench Coat|            9622.0|
+--------------+------------------+
only showing top 10 rows



In [0]:
%python
df_no_nulls.select("Item Purchased", "Purchase Amount (USD)")\
    .groupBy("Item Purchased")\
        .agg(sum("Purchase Amount (USD)").alias("total Amount spent"))\
            .orderBy("total Amount spent",ascending =False)

Out[22]: DataFrame[Item Purchased: string, total Amount spent: double]

In [0]:
df_no_nulls.where(df["Item Purchased"]=="Wallet").show()

+---------------------+--------------+---------------------+-------------+-------------+--------------+
|Customer Reference ID|Item Purchased|Purchase Amount (USD)|Date Purchase|Review Rating|Payment Method|
+---------------------+--------------+---------------------+-------------+-------------+--------------+
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash|
|                 4032|        Wallet|                155.0|   2023-03-08|          1.1|   Credit Card|
|                 4001|        Wallet|                151.0|   2023-06-09|          3.9|          Cash|
|                 4042|        Wallet|                154.0|   2023-08-31|          2.9|   Credit Card|
|                 3965|        Wallet|                 89.0|   2023-08-18|          4.5|          Cash|
|                 4014|        Wallet|                 92.0|   2022-12-25|          4.8|   Credit Card|
|                 3988|        Wallet|                181.0|   2

In [0]:
df_no_nulls = df_no_nulls.withColumn("day",dayofmonth("Date Purchase")).withColumn("Month", month("Date Purchase")).withColumn("Year", year("Date Purchase"))

In [0]:
df_no_nulls.show()

+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|Customer Reference ID|Item Purchased|Purchase Amount (USD)|Date Purchase|Review Rating|Payment Method|day|Month|Year|
+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|                 4115|         Tunic|               2456.0|   2023-07-11|          2.0|   Credit Card| 11|    7|2023|
|                 4019|      Tank Top|               2102.0|   2023-03-23|          4.1|          Cash| 23|    3|2023|
|                 4097|      Leggings|               3126.0|   2023-03-15|          3.2|          Cash| 15|    3|2023|
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash| 27|   11|2022|
|                 4080|        Onesie|               2914.0|   2022-12-11|          4.5|   Credit Card| 11|   12|2022|
|                 4055|        Jacket|          

In [0]:
sales_by_year=df_no_nulls.groupBy("Year").agg(sum("Purchase Amount (USD)").alias("TotalSalesAmount"))

In [0]:
sales_by_year.show()

+----+----------------+
|Year|TotalSalesAmount|
+----+----------------+
|2023|        273736.0|
|2022|         98344.0|
+----+----------------+



In [0]:
sales_by_yea_month=df_no_nulls.groupBy("Year","month").agg(sum("Purchase Amount (USD)").alias("TotalSalesAmount"))

In [0]:
sales_by_yea_month.orderBy("year",ascending=False).orderBy("month",ascending=False).show()

+----+-----+----------------+
|Year|month|TotalSalesAmount|
+----+-----+----------------+
|2022|   12|         41496.0|
|2022|   11|         28489.0|
|2022|   10|         28359.0|
|2023|   10|           423.0|
|2023|    9|         22395.0|
|2023|    8|         32807.0|
|2023|    7|         30775.0|
|2023|    6|         29503.0|
|2023|    5|         39539.0|
|2023|    4|         35293.0|
|2023|    3|         30853.0|
|2023|    2|         24294.0|
|2023|    1|         27854.0|
+----+-----+----------------+



In [0]:
payment_wise_no_purchases=df_no_nulls.groupBy("Payment Method").count().show()

+--------------+-----+
|Payment Method|count|
+--------------+-----+
|   Credit Card| 1303|
|          Cash| 1184|
+--------------+-----+



In [0]:
average_rating_given_to_item=df_no_nulls.groupBy("Item Purchased").agg(format_number(avg("Review Rating"),2)\
    .alias("avg_rating")).orderBy("avg_rating",ascending=False).show()


+--------------+----------+
|Item Purchased|avg_rating|
+--------------+----------+
|       Sun Hat|      3.51|
|    Flip-Flops|      3.35|
|         Skirt|      3.33|
|      Jumpsuit|      3.27|
|       Handbag|      3.24|
|       Sandals|      3.23|
|   Trench Coat|      3.20|
|      Tank Top|      3.19|
|       T-shirt|      3.16|
|      Swimsuit|      3.16|
|         Jeans|      3.16|
|        Wallet|      3.14|
|         Boots|      3.13|
|      Umbrella|      3.10|
|       Sweater|      3.10|
|          Coat|      3.10|
|        Romper|      3.10|
|        Gloves|      3.10|
|         Scarf|      3.10|
|      Slippers|      3.07|
+--------------+----------+
only showing top 20 rows



In [0]:
df_no_nulls.groupBy("Item Purchased").agg(max("Review Rating").alias("maximum rating")\
    ,min("Review Rating").alias("minimum rating")).orderBy("maximum rating",ascending=False).show(50)

+--------------+--------------+--------------+
|Item Purchased|maximum rating|minimum rating|
+--------------+--------------+--------------+
|       T-shirt|           5.0|           1.1|
|      Cardigan|           5.0|           1.1|
|      Sneakers|           5.0|           1.0|
|          Belt|           5.0|           1.0|
|        Onesie|           5.0|           1.1|
|       Sweater|           5.0|           1.0|
|   Trench Coat|           5.0|           1.2|
|      Raincoat|           5.0|           1.0|
|         Pants|           5.0|           1.2|
|       Pajamas|           5.0|           1.1|
|       Sun Hat|           5.0|           1.3|
|       Handbag|           5.0|           1.1|
|       Loafers|           5.0|           1.0|
|      Backpack|           5.0|           1.0|
|         Skirt|           5.0|           1.1|
|        Romper|           5.0|           1.1|
|        Wallet|           5.0|           1.1|
|      Jumpsuit|           5.0|           1.1|
|         Jea

In [0]:
segmented_customers = df_no_nulls.groupBy("Customer Reference ID").agg(format_number(avg("Purchase Amount (USD)"),2).alias("AvgPurchaseAmount")).show(166)

+---------------------+-----------------+
|Customer Reference ID|AvgPurchaseAmount|
+---------------------+-----------------+
|                 3997|           317.25|
|                 4101|           122.29|
|                 3986|           328.22|
|                 4078|           115.24|
|                 4042|           111.32|
|                 4000|            83.94|
|                 4092|           127.43|
|                 4097|           490.75|
|                 4119|           104.38|
|                 4061|            81.67|
|                 4088|           108.27|
|                 4083|           272.13|
|                 4107|           118.41|
|                 4036|           139.45|
|                 3972|           274.33|
|                 4104|           123.00|
|                 4025|           129.00|
|                 4033|           127.31|
|                 3990|           106.69|
|                 4014|            71.20|
|                 4047|           

In [0]:
df_no_nulls.show()

+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|Customer Reference ID|Item Purchased|Purchase Amount (USD)|Date Purchase|Review Rating|Payment Method|day|Month|Year|
+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|                 4115|         Tunic|               2456.0|   2023-07-11|          2.0|   Credit Card| 11|    7|2023|
|                 4019|      Tank Top|               2102.0|   2023-03-23|          4.1|          Cash| 23|    3|2023|
|                 4097|      Leggings|               3126.0|   2023-03-15|          3.2|          Cash| 15|    3|2023|
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash| 27|   11|2022|
|                 4080|        Onesie|               2914.0|   2022-12-11|          4.5|   Credit Card| 11|   12|2022|
|                 4055|        Jacket|          

In [0]:
df_no_nulls_new=df_no_nulls.withColumnRenamed("Customer Reference Id", "Customer_Reference_Id")\
    .withColumnRenamed("Item Purchased", "Item_Purchased")\
        .withColumnRenamed("Purchase Amount (USD)", "Purchase_Amount_inUSD")\
            .withColumnRenamed("Date Purchase", "Date_Purchase")\
                .withColumnRenamed("Review Rating", "Review_Rating")\
                    .withColumnRenamed("Payment Method", "Payment_Method")

In [0]:
df_no_nulls_new.show()

+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|Customer_Reference_Id|Item_Purchased|Purchase_Amount_inUSD|Date_Purchase|Review_Rating|Payment_Method|day|Month|Year|
+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|                 4115|         Tunic|               2456.0|   2023-07-11|          2.0|   Credit Card| 11|    7|2023|
|                 4019|      Tank Top|               2102.0|   2023-03-23|          4.1|          Cash| 23|    3|2023|
|                 4097|      Leggings|               3126.0|   2023-03-15|          3.2|          Cash| 15|    3|2023|
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash| 27|   11|2022|
|                 4080|        Onesie|               2914.0|   2022-12-11|          4.5|   Credit Card| 11|   12|2022|
|                 4055|        Jacket|          

In [0]:
df_no_nulls_new.createOrReplaceTempView("mahi")

In [0]:
spark.sql("select * from mahi").show()

+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|Customer_Reference_Id|Item_Purchased|Purchase_Amount_inUSD|Date_Purchase|Review_Rating|Payment_Method|day|Month|Year|
+---------------------+--------------+---------------------+-------------+-------------+--------------+---+-----+----+
|                 4115|         Tunic|               2456.0|   2023-07-11|          2.0|   Credit Card| 11|    7|2023|
|                 4019|      Tank Top|               2102.0|   2023-03-23|          4.1|          Cash| 23|    3|2023|
|                 4097|      Leggings|               3126.0|   2023-03-15|          3.2|          Cash| 15|    3|2023|
|                 3997|        Wallet|               3003.0|   2022-11-27|          4.7|          Cash| 27|   11|2022|
|                 4080|        Onesie|               2914.0|   2022-12-11|          4.5|   Credit Card| 11|   12|2022|
|                 4055|        Jacket|          

In [0]:
spark.sql("select Customer_Reference_Id, round(avg(Purchase_Amount_inUSD),2) from mahi group by Customer_Reference_Id order by 2 desc").show()

+---------------------+------------------------------------+
|Customer_Reference_Id|round(avg(Purchase_Amount_inUSD), 2)|
+---------------------+------------------------------------+
|                 4109|                              629.89|
|                 4035|                               601.5|
|                 4097|                              490.75|
|                 4044|                              485.83|
|                 3984|                              451.93|
|                 3981|                               392.0|
|                 3976|                              389.67|
|                 4054|                               389.5|
|                 4040|                              383.47|
|                 4108|                              379.33|
|                 4099|                              375.13|
|                 4080|                              345.67|
|                 3986|                              328.22|
|                 4067| 