In [19]:
import findspark
findspark.init()

In [20]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkModel") \
    .master("spark://be6296989c4d:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

In [21]:
page_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/page_views/")

page_df.show(4, False)

[Stage 1:>                                                          (0 + 1) / 1]

+------------------------------------+------------------------------------+------------------+-------------------------+--------+-----+--------------------------+
|user_id                             |session_id                          |page_url          |referrer_url             |category|price|timestamp                 |
+------------------------------------+------------------------------------+------------------+-------------------------+--------+-----+--------------------------+
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|/product/book_4009|wp-content/search/explore|books   |29.58|2025-05-12T04:13:36.990108|
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|/product/book_4009|category/posts           |books   |29.58|2025-05-12T04:13:36.990491|
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|/product/book_4009|list/tags                |books   |29.58|2025-05-12T04:13:36.990604|
|0e2e3eb3-bd19-4f9a-86

                                                                                

In [22]:
click_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/click_events/")

click_df.show(4, False)

+------------------------------------+------------------------------------+------------+------------------+--------+-----+--------------------------+-------+-------+
|user_id                             |session_id                          |element_id  |page_url          |category|price|timestamp                 |x_coord|y_coord|
+------------------------------------+------------------------------------+------------+------------------+--------+-----+--------------------------+-------+-------+
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|link_product|/product/book_4009|books   |29.58|2025-05-12T04:13:36.991520|559.59 |142.47 |
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|link_product|/product/book_4009|books   |29.58|2025-05-12T04:13:36.993579|559.59 |142.47 |
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|link_product|/product/book_4009|books   |29.58|2025-05-12T04:13:36.993702|559.59 |142.47 |
|0e2

In [23]:
interact_df = spark \
            .read \
            .parquet("/home/jovyan/notebooks/data/user_interactions/")

interact_df.show(4, False)

+------------------------------------+------------------------------------+----------------+------------------+--------+-----+------------------------+--------------------------+
|user_id                             |session_id                          |interaction_type|page_url          |category|price|details                 |timestamp                 |
+------------------------------------+------------------------------------+----------------+------------------+--------+-----+------------------------+--------------------------+
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|form_submit     |/product/book_4009|books   |29.58|{"form_id":"signup_for"}|2025-05-12T04:13:36.994205|
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|form_submit     |/product/book_4009|books   |29.58|{"form_id":"signup_for"}|2025-05-12T04:13:36.994298|
|0e2e3eb3-bd19-4f9a-866e-3521d7e58d99|71b3618f-a74c-480f-9ead-c701a8c72750|form_submit     |/product/book

In [53]:
from pyspark.sql.functions import when, col, sum, avg, max
from pyspark.sql.functions import get_json_object
click_df.drop('page_url','category','price','x_coord','y_coord')
interact_df.drop('page_url','category','price')
df_final = click_df.join(interact_df, click_df['session_id'] == interact_df['session_id']).drop(interact_df['session_id'],interact_df['user_id'],interact_df['timestamp'] ).dropDuplicates()




session_interaction_def = df_final.groupBy("session_id").agg(
    sum(when(col("interaction_type") == "form_submit", 1).otherwise(0)).alias("form_submit_count"),
    sum(when(col("interaction_type") == "scroll", 1).otherwise(0)).alias("scroll_count"),
    sum(when(col("interaction_type") == "hover", 1).otherwise(0)).alias("hover_count"),
    sum(when(col("interaction_type") == "zoom", 1).otherwise(0)).alias("zoom_count"),
    sum(when(col("interaction_type") == "copy_text", 1).otherwise(0)).alias("copy_text_count"),
    avg(when(col("interaction_type") == "zoom", get_json_object(df_final.details,"$.zoom_level")).otherwise(0)).alias("zoom_level_avg"),
    avg(when(col("interaction_type") == "scroll", get_json_object(df_final.details,"$.scroll_depth")).otherwise(0)).alias("scroll_depth_avg"),
    avg(when(col("interaction_type") == "copy_text", get_json_object(df_final.details,"$.length")).otherwise(0)).alias("copy_length_avg"),
    sum(when(col("element_id") == "nav_login", 1).otherwise(0)).alias("nav_login_clicks"),
    sum(when(col("element_id") == "btn_add_to_cart", 1).otherwise(0)).alias("add_to_cart_clicks"),
    sum(when(col("element_id") == "link_product", 1).otherwise(0)).alias("link_product_clicks"),
    max(when(col("element_id") == "btn_checkout", 1).otherwise(0)).alias("btn_checkout"),
    max(when(col("element_id") == "btn_buy", 1).otherwise(0)).alias("btn_buy")
)

session_interaction_def.show()




+--------------------+-----------------+------------+-----------+----------+---------------+--------------+----------------+---------------+----------------+------------------+-------------------+------------+-------+
|          session_id|form_submit_count|scroll_count|hover_count|zoom_count|copy_text_count|zoom_level_avg|scroll_depth_avg|copy_length_avg|nav_login_clicks|add_to_cart_clicks|link_product_clicks|btn_checkout|btn_buy|
+--------------------+-----------------+------------+-----------+----------+---------------+--------------+----------------+---------------+----------------+------------------+-------------------+------------+-------+
|374df097-3ffc-4a9...|                0|          19|          0|         0|              0|           0.0|            45.0|            0.0|               0|                17|                  0|           1|      0|
|82d2cbc7-e850-42c...|                0|           0|          0|        14|              0|         125.0|             0.0|    

                                                                                

In [46]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

label_indexer=StringIndexer(inputCol="btn_buy",outputCol="label")
df_label = label_indexer.fit(session_interaction_def).transform(session_interaction_def)

assembler = VectorAssembler(inputCols=["form_submit_count","nav_login_clicks","add_to_cart_clicks",
                                       "link_product_clicks","btn_checkout"], outputCol="features")
data_with_features = assembler.transform(df_label).select("label", "features")

                                                                                

In [47]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

In [48]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [49]:
dt_model = dt.fit(train_df)

# Display model summary
print("Decision Tree model summary:{0}".format(dt_model.toDebugString))

                                                                                

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d089cbc90899, depth=4, numNodes=9, numClasses=2, numFeatures=5
  If (feature 3 <= 4.5)
   If (feature 3 <= 0.5)
    If (feature 2 <= 0.5)
     If (feature 4 <= 0.5)
      Predict: 0.0
     Else (feature 4 > 0.5)
      Predict: 1.0
    Else (feature 2 > 0.5)
     Predict: 0.0
   Else (feature 3 > 0.5)
    Predict: 0.0
  Else (feature 3 > 4.5)
   Predict: 1.0



In [50]:
# Use the trained model to make predictions on the test data
predictions = dt_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction").show()



+-------------------+----------+
|           features|prediction|
+-------------------+----------+
|(5,[0,2],[3.0,3.0])|       0.0|
|(5,[0,2],[3.0,3.0])|       0.0|
|(5,[0,2],[6.0,6.0])|       0.0|
|(5,[0,3],[1.0,1.0])|       0.0|
|(5,[0,3],[2.0,2.0])|       0.0|
|      (5,[2],[2.0])|       0.0|
|      (5,[2],[2.0])|       0.0|
|      (5,[2],[2.0])|       0.0|
|      (5,[2],[6.0])|       0.0|
|      (5,[2],[6.0])|       0.0|
|(5,[2,4],[1.0,1.0])|       0.0|
|(5,[2,4],[2.0,1.0])|       0.0|
|(5,[2,4],[4.0,1.0])|       0.0|
|(5,[2,4],[4.0,1.0])|       0.0|
|(5,[2,4],[6.0,1.0])|       0.0|
|(5,[2,4],[6.0,1.0])|       0.0|
|(5,[2,4],[7.0,1.0])|       0.0|
|(5,[2,4],[7.0,1.0])|       0.0|
|(5,[2,4],[8.0,1.0])|       0.0|
|(5,[2,4],[9.0,1.0])|       0.0|
+-------------------+----------+
only showing top 20 rows



                                                                                

In [51]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                            predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, 
                  {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions,
                {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")    

                                                                                

Accuracy: 0.88


                                                                                

Precision: 0.8797058823529411


                                                                                

Recall: 0.88




F1 Score: 0.8782887700534759


                                                                                

In [18]:
sc.stop()