# Streak Analysis

## Boilerplate Code

In [None]:
# Import PySpark
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a SparkSession that runs locally on all
# available cores with 4 GB of driver memory.
# NOTE(review): the app name is "pareto_sql" although this notebook is titled
# "Streak Analysis" - looks carried over from another notebook; confirm.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("pareto_sql")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
# Fetch the retail sales dataset archive from Kaggle using the kaggle CLI.
# The recorded output below shows it being saved as retail-sales-dataset.zip
# under /content.
!kaggle datasets download -d mohammadtalib786/retail-sales-dataset

Dataset URL: https://www.kaggle.com/datasets/mohammadtalib786/retail-sales-dataset
License(s): CC0-1.0
Downloading retail-sales-dataset.zip to /content
  0% 0.00/11.2k [00:00<?, ?B/s]
100% 11.2k/11.2k [00:00<00:00, 12.1MB/s]


In [None]:
# Unpack the downloaded archive into /content.
# The context manager closes the archive even if extraction raises, which the
# previous explicit open/extractall/close sequence did not guarantee.
import zipfile

with zipfile.ZipFile('retail-sales-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
# Read the extracted CSV into a DataFrame: the first row supplies the column
# names, column types are inferred from the data, and fields are comma-separated.
df1 = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true", delimiter=",")
    .load("/content/retail_sales_dataset.csv")
)

In [None]:
# Normalise the column names to lower snake_case (spaces -> underscores) so
# they can be referenced as attributes and in SQL without quoting.
cols = df1.columns
cols_new = [name.lower().replace(" ", "_") for name in cols]
df1 = df1.toDF(*cols_new)

In [None]:
# Expose df1 to Spark SQL as a temporary view named "sales"
# (an existing view with that name is replaced).
df1.createOrReplaceTempView(name="sales")

## Streak Question

In [None]:
# Preview the first five transactions to check the renamed columns.
df1.show(n=5)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|transaction_id|      date|customer_id|gender|age|product_category|quantity|price_per_unit|total_amount|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
only showing top 5 rows



In [None]:
# Roll the transactions up to one row per calendar date, summing total_amount
# into total_sales. `sum` here is the PySpark column aggregate brought in by
# the wildcard import from pyspark.sql.functions, not Python's builtin.
df2 = (
    df1
    .groupBy("date")
    .agg(sum("total_amount").alias("total_sales"))
)

# Display in chronological order. The calendar has gaps: for example
# 2023-01-12 and 2023-01-18 do not appear in the output.
df2.orderBy("date", ascending=True).show()

+----------+-----------+
|      date|total_sales|
+----------+-----------+
|2023-01-01|       3600|
|2023-01-02|       1765|
|2023-01-03|        600|
|2023-01-04|       1240|
|2023-01-05|       1100|
|2023-01-06|        620|
|2023-01-07|        150|
|2023-01-08|        625|
|2023-01-09|        200|
|2023-01-10|        230|
|2023-01-11|        280|
|2023-01-13|       1930|
|2023-01-14|       1550|
|2023-01-15|        660|
|2023-01-16|       4000|
|2023-01-17|       1645|
|2023-01-19|         30|
|2023-01-20|        125|
|2023-01-21|       2090|
|2023-01-22|        325|
+----------+-----------+
only showing top 20 rows



In [None]:
# Minimum daily sales for a date to count towards a streak.
thres = 1000

# Keep only the dates whose total sales reach the threshold, then list them
# chronologically.
df3 = df2.where(col("total_sales") >= thres)
df3.orderBy("date", ascending=True).show()

+----------+-----------+
|      date|total_sales|
+----------+-----------+
|2023-01-01|       3600|
|2023-01-02|       1765|
|2023-01-04|       1240|
|2023-01-05|       1100|
|2023-01-13|       1930|
|2023-01-14|       1550|
|2023-01-16|       4000|
|2023-01-17|       1645|
|2023-01-21|       2090|
|2023-01-23|       3120|
|2023-01-24|       2250|
|2023-01-26|       2570|
|2023-01-28|       1020|
|2023-01-31|       2100|
|2023-02-01|       4700|
|2023-02-02|       1375|
|2023-02-03|       1230|
|2023-02-04|       1120|
|2023-02-05|       1805|
|2023-02-07|       2100|
+----------+-----------+
only showing top 20 rows



In [None]:


# Number the qualifying dates chronologically (one global window, no partition).
df4 = df3.withColumn(
    "rnk1",
    row_number().over(Window.orderBy(col("date").asc())),
)

# Subtract the running number (in days) from the date. Calendar-consecutive
# dates share the same result, so "club" identifies each unbroken run.
df5 = df4.withColumn("club", col("date") - col("rnk1"))
df5.show()

+----------+-----------+----+----------+
|      date|total_sales|rnk1|      club|
+----------+-----------+----+----------+
|2023-01-01|       3600|   1|2022-12-31|
|2023-01-02|       1765|   2|2022-12-31|
|2023-01-04|       1240|   3|2023-01-01|
|2023-01-05|       1100|   4|2023-01-01|
|2023-01-13|       1930|   5|2023-01-08|
|2023-01-14|       1550|   6|2023-01-08|
|2023-01-16|       4000|   7|2023-01-09|
|2023-01-17|       1645|   8|2023-01-09|
|2023-01-21|       2090|   9|2023-01-12|
|2023-01-23|       3120|  10|2023-01-13|
|2023-01-24|       2250|  11|2023-01-13|
|2023-01-26|       2570|  12|2023-01-14|
|2023-01-28|       1020|  13|2023-01-15|
|2023-01-31|       2100|  14|2023-01-17|
|2023-02-01|       4700|  15|2023-01-17|
|2023-02-02|       1375|  16|2023-01-17|
|2023-02-03|       1230|  17|2023-01-17|
|2023-02-04|       1120|  18|2023-01-17|
|2023-02-05|       1805|  19|2023-01-17|
|2023-02-07|       2100|  20|2023-01-18|
+----------+-----------+----+----------+
only showing top

In [None]:
# Attach to every row the number of rows sharing its "club" value (the length
# of the run it belongs to), then report the largest such length.
(
    df5
    .withColumn("streak", count("*").over(Window.partitionBy("club")))
    .agg(max("streak"))
    .show()
)

+-----------+
|max(streak)|
+-----------+
|          7|
+-----------+



In [None]:

# Per-customer variant: number each customer's purchase dates chronologically.
#
# The window is partitioned by customer_id and ordered by date at the point of
# use. Partitioning by date (as before) put every row in its own partition, so
# the row number was always 1.
win_1 = Window.partitionBy("customer_id")

# Start again from the transaction-level frame df1: the date-level aggregate
# df2 has no customer_id column. One row per (date, customer_id) with that
# day's summed spend.
# The numbering column is called "row_number" because the following cells read
# col("row_number") and sort by customer_id.
# NOTE: this rebinds df3, replacing the date-level frame built earlier.
df3 = (
    df1
    .groupBy("date", "customer_id")
    .agg(sum("total_amount").alias("total_sales"))
    .withColumn("row_number", row_number().over(win_1.orderBy(asc("date"))))
)
df3.show()

+----------+-----------+-----------+----------+
|      date|customer_id|total_sales|row_number|
+----------+-----------+-----------+----------+
|2023-11-24|    CUST001|        150|         1|
|2023-02-27|    CUST002|       1000|         1|
|2023-01-13|    CUST003|         30|         1|
|2023-05-21|    CUST004|        500|         1|
|2023-05-06|    CUST005|        100|         1|
|2023-04-25|    CUST006|         30|         1|
|2023-03-13|    CUST007|         50|         1|
|2023-02-22|    CUST008|        100|         1|
|2023-12-13|    CUST009|        600|         1|
|2023-10-07|    CUST010|        200|         1|
|2023-02-14|    CUST011|        100|         1|
|2023-10-30|    CUST012|         75|         1|
|2023-08-05|    CUST013|       1500|         1|
|2023-01-17|    CUST014|        120|         1|
|2023-01-16|    CUST015|       2000|         1|
|2023-02-17|    CUST016|       1500|         1|
|2023-04-22|    CUST017|        100|         1|
|2023-04-30|    CUST018|         50|    

In [None]:
# Minimum total_sales for a row to be kept in this pass.
thres = 50

# Drop the rows below the threshold.
df4 = df3.where(col("total_sales") >= thres)

# Re-number the surviving rows over the same window, then take the gap between
# the pre-filter and post-filter numbers. The gap equals the number of rows
# removed ahead of a given row in its partition, so it stays constant across
# rows that survived the filter back-to-back.
df5 = (
    df4
    .withColumn("new_rnk", row_number().over(win_1.orderBy(col("date").asc())))
    .withColumn("rnk_diff", col("row_number") - col("new_rnk"))
)

# Show by customer (ascending), latest date first within each customer.
df5.orderBy(["customer_id", "date"], ascending=[True, False]).show()

+----------+-----------+-----------+----------+-------+--------+
|      date|customer_id|total_sales|row_number|new_rnk|rnk_diff|
+----------+-----------+-----------+----------+-------+--------+
|2023-11-24|    CUST001|        150|         1|      1|       0|
|2023-02-27|    CUST002|       1000|         1|      1|       0|
|2023-05-21|    CUST004|        500|         1|      1|       0|
|2023-05-06|    CUST005|        100|         1|      1|       0|
|2023-03-13|    CUST007|         50|         1|      1|       0|
|2023-02-22|    CUST008|        100|         1|      1|       0|
|2023-12-13|    CUST009|        600|         1|      1|       0|
|2023-10-07|    CUST010|        200|         1|      1|       0|
|2023-02-14|    CUST011|        100|         1|      1|       0|
|2023-10-30|    CUST012|         75|         1|      1|       0|
|2023-08-05|    CUST013|       1500|         1|      1|       0|
|2023-01-17|    CUST014|        120|         1|      1|       0|
|2023-01-16|    CUST015| 

In [None]:
# Largest rnk_diff observed for each customer.
# NOTE(review): rnk_diff is the gap between the pre- and post-filter row
# numbers, so its maximum is not a streak length; counting rows per
# (customer_id, rnk_diff) would give run lengths - confirm the intent.
(
    df5
    .groupBy("customer_id")
    .agg(max("rnk_diff").alias("max_rnk_diff"))
    .show()
)

+-----------+------------+
|customer_id|max_rnk_diff|
+-----------+------------+
|    CUST001|           0|
|    CUST002|           0|
|    CUST004|           0|
|    CUST005|           0|
|    CUST007|           0|
|    CUST008|           0|
|    CUST009|           0|
|    CUST010|           0|
|    CUST011|           0|
|    CUST012|           0|
|    CUST013|           0|
|    CUST014|           0|
|    CUST015|           0|
|    CUST016|           0|
|    CUST017|           0|
|    CUST018|           0|
|    CUST019|           0|
|    CUST020|           0|
|    CUST021|           0|
|    CUST022|           0|
+-----------+------------+
only showing top 20 rows

