In [0]:
# Import required libraries
from pyspark.sql import functions as F

# Load October raw data
october_df = spark.read.csv("/Volumes/workspace/advecom/advecom_data/2019-Oct.csv", header=True, inferSchema=True)

# Create user feature dataframe, with necessary features only 
user_features_df = october_df.filter(F.col("price") > 0) \
    .filter("user_id IS NOT NULL") \
    .dropDuplicates(["user_id","product_id","event_type","event_time"]) \
    .groupBy("user_id") \
    .agg(
        F.sum(
            F.when(F.col("event_type")=="view",1)
            .otherwise(0)
        ).alias("total_views"),
        F.sum(
            F.when(F.col("event_type")=="cart",1)
            .otherwise(0)
        ).alias("total_cart"),
        F.sum(
            F.when(F.col("event_type")=="purchase",1)
            .otherwise(0)
        ).alias("total_purchases"),
        F.avg("price").alias("avg_price"),
        F.count("user_id").alias("total_events"),
        F.sum(F.when(F.col("event_type")=="purchase", F.col("price")).otherwise(0)).alias("total_spent")
    )
    
# Check user features
display(user_features_df.limit(5))

user_id,total_views,total_cart,total_purchases,avg_price,total_events,total_spent
516407514,19,2,1,224.0890909090909,22,231.64
547701060,14,2,0,1119.1575,16,0.0
514555296,65,0,1,283.03409090909093,66,244.28
555846537,23,2,2,203.63814814814816,27,368.04
513383856,11,0,0,724.3663636363636,11,0.0


In [0]:
# Create target label dataframe from user features
labeled_df = user_features_df.withColumn(
    "label",
    F.when(F.col("total_purchases")>0,1)
    .otherwise(0)
    )

# Check labeled dataframe
display(labeled_df)

user_id,total_views,total_cart,total_purchases,avg_price,total_events,total_spent,label
516407514,19,2,1,224.0890909090909,22,231.64,1
547701060,14,2,0,1119.1575,16,0.0,0
514555296,65,0,1,283.03409090909093,66,244.28,1
555846537,23,2,2,203.63814814814816,27,368.04,1
513383856,11,0,0,724.3663636363636,11,0.0,0
519148299,11,1,3,248.33800000000005,15,483.4500000000001,1
516264626,8,0,0,392.1275,8,0.0,0
515936492,30,0,0,479.965,30,0.0,0
521328224,13,0,0,157.47384615384615,13,0.0,0
535611590,10,0,0,7.726999999999999,10,0.0,0


In [0]:
# Check distribution of labels
labeled_df.groupBy("label").count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 347118|
|    0|2674317|
+-----+-------+



In [0]:
# Create two dataframes for purchase and non-purchase, and then combine them with a smaller fraction of non purchase. As a result the model will be trained on a balanced dataset.
purchase_df = labeled_df.filter("label=1")

non_purchase_df = labeled_df.filter("label=0") \
.sample(fraction=0.13)

balanced_df = purchase_df.union(non_purchase_df)

balanced_df.groupBy("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|347118|
|    0|347427|
+-----+------+



In [0]:
# Import required libraries
from pyspark.ml.feature import VectorAssembler

# Create features vector for ML
assembler = VectorAssembler(
    inputCols=[
        "total_views",
        "total_cart",
        "total_spent",
        "total_events",
        "avg_price"
    ],
    outputCol="features"
)

# Create model dataframe with features and label
model_df = assembler.transform(balanced_df).select("features","label")

display(model_df)

features,label
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""19.0"",""2.0"",""231.64"",""22.0"",""224.08909090909094""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""65.0"",""0.0"",""244.28"",""66.0"",""283.03409090909093""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""23.0"",""2.0"",""368.04"",""27.0"",""203.63814814814816""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""11.0"",""1.0"",""483.45000000000005"",""15.0"",""248.33800000000005""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""41.0"",""0.0"",""274.91"",""42.0"",""541.437380952381""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""38.0"",""0.0"",""574.02"",""39.0"",""302.8282051282051""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""150.0"",""0.0"",""266.4"",""153.0"",""76.1002614379085""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""108.0"",""1.0"",""88.79"",""110.0"",""283.26227272727266""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""53.0"",""4.0"",""86.48"",""59.0"",""86.27898305084747""]}",1
"{""type"":""1"",""size"":null,""indices"":null,""values"":[""3.0"",""1.0"",""359.03"",""5.0"",""359.03""]}",1


In [0]:
# Split data into training and test
train_df, test_df = model_df.randomSplit([0.8,0.2], seed=7)

# Check label distribution across train and test
train_df.groupBy("label").count().show()
test_df.groupBy("label").count().show() 

+-----+------+
|label| count|
+-----+------+
|    1|277727|
|    0|277757|
+-----+------+

+-----+-----+
|label|count|
+-----+-----+
|    1|69391|
|    0|69670|
+-----+-----+



In [0]:
# Import required libraries
import mlflow
import mlflow.spark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Log Logistic Regression model with MLflow
with mlflow.start_run(run_name="log_reg_model"):
        
    # Initialize logistic regression model
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="label"
    )
    # Fit model on training data
    lr_model = lr.fit(train_df)
    # Make predictions on test data
    lr_pred = lr_model.transform(test_df)
    # Initialize binary classification evaluator
    evaluator = BinaryClassificationEvaluator(
        labelCol="label"
    )
    lr_auc = evaluator.evaluate(lr_pred) # Calculate AUC

    mlflow.log_param("model_type", "Logistic Regression") # Log model type
    mlflow.log_metric("AUC", lr_auc) # Log metric as AUC
    mlflow.spark.log_model(spark_model=lr_model, # Log model
                           artifact_path="logistic_model", # Specify artifact path
                           dfs_tmpdir="/Volumes/workspace/advecom/advecom_data/mlflowlog") # Specify temp directory
    print("Logistic Regression AUC:", lr_auc)



Logistic Regression AUC: 0.9999998245930102


In [0]:

 # Import required libraries
from pyspark.ml.classification import RandomForestClassifier
# Log Random Forest Classifier model with MLflow
with mlflow.start_run(run_name="rf_classifier"):      
    # Initialize random forest model
    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol="label",
        numTrees=20,
        maxDepth=5
    )
    # Fit model on training data
    rf_model = rf.fit(train_df)
    # Make predictions on test data
    rf_pred = rf_model.transform(test_df)
    # Initialize binary classification evaluator
    evaluator = BinaryClassificationEvaluator(
        labelCol="label"
    )
    rf_auc = evaluator.evaluate(rf_pred) # Calculate AUC
    mlflow.log_param("model_type", "Random Forest Classifier") # log model type
    mlflow.log_param("numTrees", rf.getNumTrees) # log parameter number of trees
    mlflow.log_param("maxDepth", rf.getMaxDepth()) # log parameter max depth
    
    mlflow.log_metric("AUC", rf_auc) # log metric as AUC
    mlflow.spark.log_model(spark_model=rf_model, # log model
                           artifact_path="random_forest_classifier", # specify artifact path
                           dfs_tmpdir="/Volumes/workspace/advecom/advecom_data/mlflowlog") # specify temp directory

    print("Random Forest Classifier AUC:", rf_auc)



Random Forest Classifier AUC: 1.0


In [0]:
print("Logistic Regression AUC:", lr_auc)
print("Random Forest Classifier AUC:", rf_auc)

Logistic Regression AUC: 0.9999998245930102
Random Forest Classifier AUC: 1.0
