In [97]:
import findspark
findspark.init()

In [98]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkSQLStructuredStreaming-Files") \
    .master("spark://be6296989c4d:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

In [99]:
page_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/page_views/")

page_df.show(4, False)

[Stage 1:>                                                          (0 + 1) / 1]

+------------------------------------+------------------------------------+-------------+--------------------------+-----------+------+--------------------------+
|user_id                             |session_id                          |page_url     |referrer_url              |category   |price |timestamp                 |
+------------------------------------+------------------------------------+-------------+--------------------------+-----------+------+--------------------------+
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|/product/1088|tags/tag                  |apparel    |24.93 |2025-05-11T03:29:05.170238|
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|/product/1024|tag/search                |electronics|13.86 |2025-05-11T03:29:05.171131|
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|/product/1008|categories/categories/tags|books      |274.06|2025-05-11T03:29:05.172619|
|152afdbb-c405-486b-82

                                                                                

In [100]:
click_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/click_events/")

click_df.show(4, False)

+------------------------------------+------------------------------------+------------+-------------+--------+------+--------------------------+-------+-------+
|user_id                             |session_id                          |element_id  |page_url     |category|price |timestamp                 |x_coord|y_coord|
+------------------------------------+------------------------------------+------------+-------------+--------+------+--------------------------+-------+-------+
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|link_product|/product/1064|apparel |321.51|2025-05-11T03:29:05.173186|1622.14|1032.42|
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|btn_checkout|/product/1085|toys    |101.15|2025-05-11T03:29:05.173280|375.61 |836.97 |
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|btn_checkout|/product/1150|home    |172.41|2025-05-11T03:29:05.173392|1636.84|20.09  |
|e849242b-d028-4c09-9fd1-f59

In [101]:
interact_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/user_interactions/")

interact_df.show(4, False)

+------------------------------------+------------------------------------+----------------+-------------+-----------+------+------------------------+--------------------------+
|user_id                             |session_id                          |interaction_type|page_url     |category   |price |details                 |timestamp                 |
+------------------------------------+------------------------------------+----------------+-------------+-----------+------+------------------------+--------------------------+
|152afdbb-c405-486b-82eb-b8e98368c092|7eae79db-60f0-4f8c-b35a-6959180a1c40|copy_text       |/product/1192|toys       |352.92|{"length":98}           |2025-05-11T03:29:05.173472|
|e849242b-d028-4c09-9fd1-f598ae17cf88|22a2cd9d-a181-4f3e-9a7a-85d5f6fa77f4|copy_text       |/product/1010|electronics|32.21 |{"length":99}           |2025-05-11T03:29:07.175655|
|e849242b-d028-4c09-9fd1-f598ae17cf88|22a2cd9d-a181-4f3e-9a7a-85d5f6fa77f4|scroll          |/product/1169|appa

In [102]:
from pyspark.sql.functions import when, col, sum, avg, max
from pyspark.sql.functions import get_json_object
result = page_df.join(click_df, 
      page_df['session_id'] == click_df['session_id'] ,
      "left").join(interact_df, page_df['session_id'] == interact_df['session_id']).drop(interact_df['session_id'],interact_df['user_id'] ).drop(click_df['session_id'],click_df['user_id'])

result.drop('page_url')
df_final=result.withColumn("Click_buy", when(result['element_id']=='btn_buy', 1).otherwise(0))

session_interaction_def = df_final.groupBy("session_id").agg(
    sum(when(col("interaction_type") == "form_submit", 1).otherwise(0)).alias("form_submit_count"),
    sum(when(col("interaction_type") == "scroll", 1).otherwise(0)).alias("scroll_count"),
    sum(when(col("interaction_type") == "hover", 1).otherwise(0)).alias("hover_count"),
    sum(when(col("interaction_type") == "zoom", 1).otherwise(0)).alias("zoom_count"),
    sum(when(col("interaction_type") == "copy_text", 1).otherwise(0)).alias("copy_text_count"),
    avg(when(col("interaction_type") == "zoom", get_json_object(df_final.details,"$.zoom_level")).otherwise(0)).alias("zoom_level_avg"),
    avg(when(col("interaction_type") == "scroll", get_json_object(df_final.details,"$.scroll_depth")).otherwise(0)).alias("scroll_depth_avg"),
    avg(when(col("interaction_type") == "copy_text", get_json_object(df_final.details,"$.length")).otherwise(0)).alias("copy_length_avg"),
    sum(when(col("element_id") == "nav_login", 1).otherwise(0)).alias("nav_login_clicks"),
    sum(when(col("element_id") == "btn_add_to_cart", 1).otherwise(0)).alias("add_to_cart_clicks"),
    sum(when(col("element_id") == "link_product", 1).otherwise(0)).alias("link_product_clicks"),
    max(when(col("element_id") == "btn_checkout", 1).otherwise(0)).alias("btn_checkout"),
    max(when(col("element_id") == "btn_buy", 1).otherwise(0)).alias("btn_buy")
)

session_interaction_def.show()
print(session_interaction_def.count())

                                                                                

+--------------------+-----------------+------------+-----------+----------+---------------+------------------+------------------+------------------+----------------+------------------+-------------------+------------+-------+
|          session_id|form_submit_count|scroll_count|hover_count|zoom_count|copy_text_count|    zoom_level_avg|  scroll_depth_avg|   copy_length_avg|nav_login_clicks|add_to_cart_clicks|link_product_clicks|btn_checkout|btn_buy|
+--------------------+-----------------+------------+-----------+----------+---------------+------------------+------------------+------------------+----------------+------------------+-------------------+------------+-------+
|5aaacaec-66c2-411...|                0|          27|          0|         0|              0|               0.0|              95.0|               0.0|               9|                 0|                  0|           1|      1|
|53cae47d-1a88-45a...|              160|           0|        240|        80|              0|



261


                                                                                

In [103]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

label_indexer=StringIndexer(inputCol="btn_buy",outputCol="label")
df_label = label_indexer.fit(session_interaction_def).transform(session_interaction_def)

assembler = VectorAssembler(inputCols=["form_submit_count", "scroll_count","hover_count","zoom_count",
                                       "copy_text_count","zoom_level_avg","scroll_depth_avg",
                                       "copy_length_avg","nav_login_clicks","add_to_cart_clicks",
                                       "link_product_clicks","btn_checkout"], outputCol="features")
data_with_features = assembler.transform(df_label).select("label", "features")

                                                                                

In [109]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

In [110]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [111]:
dt_model = dt.fit(train_df)

# Display model summary
print("Decision Tree model summary:{0}".format(dt_model.toDebugString))

                                                                                

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f5a62ec52f73, depth=5, numNodes=33, numClasses=2, numFeatures=12
  If (feature 1 <= 25.5)
   If (feature 2 <= 43.5)
    If (feature 4 <= 29.0)
     If (feature 7 <= 3.533333333333333)
      If (feature 0 <= 38.0)
       Predict: 0.0
      Else (feature 0 > 38.0)
       Predict: 1.0
     Else (feature 7 > 3.533333333333333)
      Predict: 1.0
    Else (feature 4 > 29.0)
     If (feature 4 <= 76.0)
      Predict: 0.0
     Else (feature 4 > 76.0)
      Predict: 1.0
   Else (feature 2 > 43.5)
    If (feature 10 <= 61.5)
     Predict: 0.0
    Else (feature 10 > 61.5)
     If (feature 0 <= 0.5)
      Predict: 0.0
     Else (feature 0 > 0.5)
      Predict: 1.0
  Else (feature 1 > 25.5)
   If (feature 9 <= 82.5)
    If (feature 8 <= 71.0)
     If (feature 10 <= 142.0)
      Predict: 0.0
     Else (feature 10 > 142.0)
      Predict: 1.0
    Else (feature 8 > 71.0)
     If (feature 3 <= 55.0)
      If (featur

In [107]:
# Use the trained model to make predictions on the test data
predictions = dt_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction").show()



+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|(12,[0,1,3,5,6,11...|       1.0|
|(12,[0,1,6,9,10,1...|       1.0|
|(12,[0,3,5,9,11],...|       0.0|
|(12,[1,2,3,5,6,8]...|       1.0|
|(12,[1,6,8,10],[1...|       0.0|
|(12,[2,3,5,8,10],...|       0.0|
|(12,[2,3,5,10],[6...|       0.0|
|(12,[2,4,7,9],[10...|       1.0|
|(12,[2,8,9],[16.0...|       0.0|
|(12,[3,5,8,9,10,1...|       0.0|
|(12,[3,5,9,10,11]...|       0.0|
|(12,[4,7,8,10,11]...|       1.0|
|[0.0,0.0,0.0,49.0...|       1.0|
|[0.0,0.0,56.0,112...|       0.0|
|[0.0,0.0,72.0,36....|       0.0|
|[0.0,24.0,48.0,24...|       0.0|
|[0.0,40.0,0.0,20....|       0.0|
|[0.0,54.0,0.0,54....|       0.0|
|[0.0,60.0,0.0,30....|       1.0|
|[9.0,18.0,9.0,18....|       1.0|
+--------------------+----------+
only showing top 20 rows



                                                                                

In [108]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                            predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, 
                  {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions,
                {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")    

                                                                                

Accuracy: 0.6333333333333333


                                                                                

Precision: 0.6562594268476621


                                                                                

Recall: 0.6333333333333333




F1 Score: 0.6333333333333334


                                                                                

In [95]:
sc.stop()