In [0]:
# Import required libraries
from pyspark.sql import functions as F

# Load saved bronze layer data
bronze_df = spark.read.format("delta").load("/Volumes/workspace/advecom/advecom_data/delta/bronze/events")

# Create user feature dataframe, with necessary features only 
user_features_df = bronze_df.filter(F.col("price") > 0) \
    .filter("user_id IS NOT NULL") \
    .dropDuplicates(["user_id"]) \
    .groupBy("user_id") \
    .agg(
    F.sum(
    F.when(F.col("event_type")=="view",1)
    .otherwise(0)
    ).alias("total_views"),

    F.sum(
    F.when(F.col("event_type")=="cart",1)
    .otherwise(0)
    ).alias("total_cart"),

    F.sum(
    F.when(F.col("event_type")=="purchase",1)
    .otherwise(0)
    ).alias("total_purchases"),

    F.avg("price").alias("avg_price"))
    
# Check user features
display(user_features_df.limit(5))

user_id,total_views,total_cart,total_purchases,avg_price
515993713,1,0,0,111.7
526855580,1,0,0,154.42
513017380,1,0,0,241.71
513218277,1,0,0,98.46
513387588,1,0,0,118.66


In [0]:
# Create target label dataframe from user features
labeled_df = user_features_df.withColumn(
    "label",
    (user_features_df.total_purchases > 0).cast("int")
)

# Save dataframe containing features and label in delta format
labeled_df.write.format("delta") \
.mode("overwrite") \
.save("/Volumes/workspace/advecom/advecom_data/gold/ml_dataset")

# Check labeled dataframe
display(labeled_df)

user_id,total_views,total_cart,total_purchases,avg_price,label
515993713,1,0,0,111.7,0
526855580,1,0,0,154.42,0
513017380,1,0,0,241.71,0
513218277,1,0,0,98.46,0
513387588,1,0,0,118.66,0
524457561,1,0,0,230.02,0
520308752,1,0,0,61.01,0
559699559,1,0,0,121.99,0
559700544,1,0,0,250.83,0
513127233,1,0,0,349.82,0


In [0]:
# Check distribution of labels
labeled_df.groupBy("label").count().show()

# Here a huge imbalance is seen, therefore we need to balance the dataset

+-----+-------+
|label|  count|
+-----+-------+
|    1|    783|
|    0|3020652|
+-----+-------+



In [0]:
# Confirm if the user features were correctly created and the max - purchase is not 0
labeled_df.select("total_purchases").describe().show()

+-------+--------------------+
|summary|     total_purchases|
+-------+--------------------+
|  count|             3021435|
|   mean|2.591483847906706...|
| stddev| 0.01609600300239475|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [0]:
# Create two dataframes for purchase and non-purchase, and then combine them with a smaller fraction of non purchase. As a result the model will be trained on a balanced dataset. Otherwise, due to the imbalance the model would predict 0(no purchase) for all the records, which might give 100% accuracy but will be useless.
purchase_df = labeled_df.filter("label=1")

non_purchase_df = labeled_df.filter("label=0") \
.sample(fraction=0.001)

balanced_df = purchase_df.union(non_purchase_df)

balanced_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  783|
|    0| 3063|
+-----+-----+



In [0]:
# Split the balanced dataset into train and test
train_df, test_df = balanced_df.randomSplit([0.8,0.2], seed=5)

# Check the split
print("Train count:",train_df.count())
print("Test count:",test_df.count())

Train count: 3131
Test count: 715


In [0]:
# Check the distribution of labels in train and test dataframes.
train_df.groupBy("label").count().show()
test_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  650|
|    0| 2481|
+-----+-----+

+-----+-----+
|label|count|
+-----+-----+
|    1|  133|
|    0|  582|
+-----+-----+



In [0]:
# Save the balanced dataset in delta format
balanced_df.write.format("delta") \
.mode("overwrite") \
.save("/Volumes/workspace/advecom/advecom_data/gold/balanced_df")