In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a SparkSession that runs locally with master "local[4]".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Streaming SQL")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Read everything matched by the glob below as parquet into one DataFrame.
STREAM_GLOB = "/stream/*"
df = spark.read.parquet(STREAM_GLOB)

In [4]:
# Preview the first 20 rows (show()'s default) without truncating cell values.
df.show(n=20, truncate=False)

+-------------------+----------+---------+-------------------+-------------------------+--------+-----------+-------------+--------------------+-----------------------+
|eventType          |customerId|productId|timestamp          |metadata                 |quantity|totalAmount|paymentMethod|recommendedProductId|algorithm              |
+-------------------+----------+---------+-------------------+-------------------------+--------+-----------+-------------+--------------------+-----------------------+
|addToCart          |35107     |4271     |2024-07-06T17:34:40|{null, null}             |4       |null       |null         |null                |null                   |
|recommendationClick|44729     |1504     |2024-07-06T17:34:43|{null, null}             |null    |null       |null         |2655                |content_based          |
|recommendationClick|12662     |8457     |2024-07-06T17:34:44|{null, null}             |null    |null       |null         |1786                |collaborati

In [5]:
# Expose the DataFrame to Spark SQL under the name "stream_data".
# createOrReplaceTempView replaces registerTempTable, which has been deprecated
# since Spark 2.0; re-running this cell simply replaces the existing view.
df.createOrReplaceTempView("stream_data")

In [6]:
# How often does each event type occur? Most frequent first.
event_counts_sql = """
SELECT eventType, count(*) as event_ooccurance 
FROM stream_data
GROUP BY 1
ORDER BY 2 DESC
"""
spark.sql(event_counts_sql).show()

+-------------------+----------------+
|          eventType|event_ooccurance|
+-------------------+----------------+
|          addToCart|             376|
|           purchase|             365|
|recommendationClick|             362|
|        productView|             354|
+-------------------+----------------+



In [7]:
# Which payment method is used most often? Rows without a payment method are ignored.
payment_method_counts_sql = """
SELECT paymentMethod, count(*) as method_ooccurance 
FROM stream_data
where paymentMethod is not null
GROUP BY 1
ORDER BY 2 DESC
"""
spark.sql(payment_method_counts_sql).show()

+-------------+-----------------+
|paymentMethod|method_ooccurance|
+-------------+-----------------+
|   Debit Card|              131|
|       PayPal|              121|
|  Credit Card|              113|
+-------------+-----------------+



In [8]:
# Which payment method accounts for the highest total amount (rounded to 2 decimals)?
payment_method_totals_sql = """
SELECT paymentMethod, round(sum(totalamount),2) as method_total_amount 
FROM stream_data
where paymentMethod is not null
GROUP BY 1
ORDER BY 2 DESC
"""
spark.sql(payment_method_totals_sql).show()

+-------------+-------------------+
|paymentMethod|method_total_amount|
+-------------+-------------------+
|   Debit Card|           33731.37|
|       PayPal|           31757.23|
|  Credit Card|            30137.9|
+-------------+-------------------+



In [9]:
# Top 5 customers by summed totalAmount.
top_customers_sql = """
SELECT customerId, sum(totalAmount) as spent 
FROM stream_data
GROUP BY 1
ORDER BY 2 DESC
limit 5
"""
spark.sql(top_customers_sql).show()

+----------+------+
|customerId| spent|
+----------+------+
|     10006|497.79|
|     77589|495.21|
|     75980|492.53|
|     80009|489.45|
|     46932| 488.9|
+----------+------+



In [10]:
# Top 5 products by summed totalAmount.
top_products_sql = """
SELECT productId, sum(totalAmount) as total_spend 
FROM stream_data
GROUP BY 1
ORDER BY 2 DESC
limit 5
"""
spark.sql(top_products_sql).show()

+---------+-----------+
|productId|total_spend|
+---------+-----------+
|     1149|     564.95|
|     2020|     497.79|
|     3152|     495.21|
|     9392|     492.53|
|     7187|     489.45|
+---------+-----------+



In [11]:
# Compare addToCart events with purchase events, per product.
# Only products that have at least one event of BOTH types are listed
# (inner join), ordered by their addToCart count, highest first.
#
# Each side is aggregated to one row per product before joining, so the join
# is one-to-one and no DISTINCT is needed to collapse duplicate rows.
result = spark.sql("""
SELECT pur.productId,
       pur.purchase,
       cart.addToCart
FROM (
    SELECT productId, COUNT(*) AS purchase
    FROM stream_data
    WHERE eventType = 'purchase'
    GROUP BY productId
) AS pur
JOIN (
    SELECT productId, COUNT(*) AS addToCart
    FROM stream_data
    WHERE eventType = 'addToCart'
    GROUP BY productId
) AS cart
ON pur.productId = cart.productId
ORDER BY cart.addToCart DESC
""")

# show() returns None, so `result` keeps the DataFrame and is displayed separately.
result.show()

+---------+--------+---------+
|productId|purchase|addToCart|
+---------+--------+---------+
|     6380|       1|        2|
|     6806|       1|        1|
|     9133|       1|        1|
|     3952|       1|        1|
|     1315|       1|        1|
|     3748|       1|        1|
|     1921|       1|        1|
|     7235|       1|        1|
|     2234|       1|        1|
|     3815|       1|        1|
|     6836|       1|        1|
|     3275|       1|        1|
|     7118|       1|        1|
|     1150|       1|        1|
|     7546|       1|        1|
|     6665|       1|        1|
+---------+--------+---------+



In [12]:
# Total income: summed totalAmount over purchase events, rounded to 2 decimals.
total_income_sql = """
SELECT round(sum(totalAmount),2) as total_income 
FROM stream_data
where eventType = 'purchase'
"""
spark.sql(total_income_sql).show()

+------------+
|total_income|
+------------+
|     95626.5|
+------------+



In [13]:
# Compare product categories: number of events per metadata.category.
# The filter is on metadata.category itself (not metadata.source), so rows
# without a category cannot show up as a null group.
spark.sql("""
SELECT metadata.category, count(*) as category_count
FROM stream_data
where metadata.category is not null
group by 1
order by 2 desc
""").show()

+--------------+--------------+
|      category|category_count|
+--------------+--------------+
|         Books|           102|
|      Clothing|            89|
|Home & Kitchen|            85|
|   Electronics|            78|
+--------------+--------------+



In [14]:
# Compare traffic sources: number of events per metadata.source, most common first.
source_counts_sql = """
SELECT metadata.source, count(*) as source_count
FROM stream_data
where metadata.source is not null
group by 1
order by 2 desc
"""
spark.sql(source_counts_sql).show()

+-------------+------------+
|       source|source_count|
+-------------+------------+
|       Direct|         123|
|Advertisement|         121|
|       Search|         110|
+-------------+------------+



In [16]:
# Stop the Spark session now that all queries have run.
spark.stop()