# Load Data

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("model").config("fs.defaultFS", "file:///").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = spark.read.csv("/data/BigData/ecommerce_data_with_trends.csv", header=True)

In [2]:
from pyspark.sql.functions import isnan, when, count, col

# Check how many null values and nan values are there in the dataframe
df.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in df.columns]).show()



+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+
|transaction_id|timestamp|customer_id|customer_name|city|customer_type|product_name|category|price|quantity|total_amount|
+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+
|             0|        0|          0|            0|   0|            0|           0|       0|    0|       0|           0|
+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+



                                                                                

# Churn Analysis

In [9]:
from churn import CustomerChurnMLAnalysis

ml_analysis = CustomerChurnMLAnalysis(spark, df)
results = ml_analysis.predict_customer_churn()

                                                                                

In [35]:
from pyspark.sql.functions import when

def show_df(data):
    data = data.withColumn("churn", when(data["prediction"] == 1, "Yes").otherwise("No"))
    selected_data = data.select("customer_id", "customer_name", "churn", "probability")
    # rename column probability to churn_probability
    selected_data = selected_data.withColumnRenamed("probability", "no_churn_probability")
    selected_data.show(5)

In [36]:
lr =  results['logistic_regression']
rf = results['random_forest']

for key in rf.keys():
    if key == 'predictions':
        print(key)
        print('Logistic Regression: ')
        show_df(lr[key])
        print('Random Forest: ')
        show_df(rf[key])
    elif key == 'churn_summary'or key == 'feature_importance':
        print(key)
        print('Logistic Regression: ')
        lr[key].show()
        print('Random Forest: ')
        rf[key].show()
    else:
        print(key)
        print('Logistic Regression: ', lr[key])
        print('Random Forest: ', rf[key])
    print('')

lr['feature_importance'].show()

model
Logistic Regression:  LogisticRegressionModel: uid=LogisticRegression_35c5285081d5, numClasses=2, numFeatures=5939
Random Forest:  RandomForestClassificationModel: uid=RandomForestClassifier_fafae1136393, numTrees=100, numClasses=2, numFeatures=5939

predictions
Logistic Regression: 


                                                                                

+-----------+------------------+-----+--------------------+
|customer_id|     customer_name|churn|no_churn_probability|
+-----------+------------------+-----+--------------------+
|       1009|Mr. Martin Hammond|  Yes|[0.00997593722542...|
|       1020|   Kenneth Shannon|   No|[0.97610258607153...|
|       1044|       Jaime Mccoy|   No|[0.98422255834904...|
|       1047|      Nathan Smith|  Yes|[0.41448368804177...|
|       1052|     Cynthia Hardy|   No|[0.97438225178038...|
+-----------+------------------+-----+--------------------+
only showing top 5 rows

Random Forest: 


                                                                                

+-----------+------------------+-----+--------------------+
|customer_id|     customer_name|churn|no_churn_probability|
+-----------+------------------+-----+--------------------+
|       1009|Mr. Martin Hammond|  Yes|[0.36234966349211...|
|       1020|   Kenneth Shannon|   No|[0.63745384356571...|
|       1044|       Jaime Mccoy|   No|[0.63796587612413...|
|       1047|      Nathan Smith|   No|[0.64450067606025...|
|       1052|     Cynthia Hardy|   No|[0.63620256546921...|
+-----------+------------------+-----+--------------------+
only showing top 5 rows


auc
Logistic Regression:  0.9986777148021334
Random Forest:  0.9999347772737744

f1_score
Logistic Regression:  0.9897245325726998
Random Forest:  0.9914366733548089

churn_summary
Logistic Regression: 


                                                                                

+-----+-----+------------------+-------------------+------------------+
|churn|count|  avg_transactions|avg_purchase_amount|  avg_recency_days|
+-----+-----+------------------+-------------------+------------------+
|    1| 1439| 99.43849895760945|  187357.4043432939|44.704656011118836|
|    0| 1481|100.17758271438217|  2845587.969061445|   44.474679270763|
+-----+-----+------------------+-------------------+------------------+

Random Forest: 


                                                                                

+-----+-----+------------------+-------------------+------------------+
|churn|count|  avg_transactions|avg_purchase_amount|  avg_recency_days|
+-----+-----+------------------+-------------------+------------------+
|    1| 1439| 99.43849895760945|  187357.4043432939|44.704656011118836|
|    0| 1481|100.17758271438217|  2845587.969061445|   44.474679270763|
+-----+-----+------------------+-------------------+------------------+


+--------------------+--------------------+
|             feature|          importance|
+--------------------+--------------------+
|  total_transactions|  0.9807472807563129|
|avg_transaction_a...|  0.9785498605916242|
|total_purchase_am...| 0.11087225335238715|
|       customer_type|0.023765726589668437|
|   unique_categories|0.014317281099024397|
|                city|0.005907662397976032|
|        recency_days|0.002846807456497...|
|unique_products_p...|6.785285376679078E-4|
+--------------------+--------------------+



# Customer Segmentation

In [5]:
from pyspark.ml.evaluation import ClusteringEvaluator
from customer_segmentation import CustomerSegmentation

segmentation = CustomerSegmentation(spark, df)

evaluator_silhouette = ClusteringEvaluator(featuresCol='scaled_features', predictionCol='cluster', metricName='silhouette')

for k in range(2, 6):
    print("For k =", k)
    results = segmentation.segment_customers(n_clusters=k)
    clustered_customers = results['clustered_customers']

    silhouette_score = evaluator_silhouette.evaluate(clustered_customers)
    print(f"Silhouette Score : {silhouette_score}\n")

For k = 2


                                                                                

Customer Segment Summary:


                                                                                

+-------+--------------+-------------------+------------------+---------------------+------------------+
|cluster|customer_count|avg_purchase_amount|purchase_frequency|avg_purchase_quantity|category_diversity|
+-------+--------------+-------------------+------------------+---------------------+------------------+
|      1|          4885| 1543.4752014939297| 99.97113613101331|   3.0249228363227187|26.276356192425794|
|      0|          5115|  28347.14817834009| 100.0275659824047|    55.59189134257418|26.278787878787877|
+-------+--------------+-------------------+------------------+---------------------+------------------+



                                                                                

Silhouette Score : 0.6777447519527025

For k = 3


                                                                                

Customer Segment Summary:


                                                                                

+-------+--------------+-------------------+------------------+---------------------+------------------+
|cluster|customer_count|avg_purchase_amount|purchase_frequency|avg_purchase_quantity|category_diversity|
+-------+--------------+-------------------+------------------+---------------------+------------------+
|      1|          4882| 1543.4705159507091|  99.9877099549365|    3.024950388004315|26.279188857025808|
|      2|          2877| 28401.636453219035| 105.4045881126173|    55.65560589187904| 27.00590893291623|
|      0|          2241| 28241.324478681272| 93.08835341365462|   55.439663689214555|25.339134315037928|
+-------+--------------+-------------------+------------------+---------------------+------------------+



                                                                                

Silhouette Score : 0.5740927216216798

For k = 4


                                                                                

Customer Segment Summary:


                                                                                

+-------+--------------+-------------------+------------------+---------------------+------------------+
|cluster|customer_count|avg_purchase_amount|purchase_frequency|avg_purchase_quantity|category_diversity|
+-------+--------------+-------------------+------------------+---------------------+------------------+
|      1|          2737| 1545.9911446701722|105.78187796857874|   3.0260181385668035|26.970770917062477|
|      3|          2148| 1540.2693651469194| 92.56703910614524|    3.023527192820815|25.391527001862197|
|      2|          2877| 28401.636453219035| 105.4045881126173|    55.65560589187904| 27.00590893291623|
|      0|          2238| 28277.102259293224| 93.11528150134048|   55.509984837502415|25.344057193923145|
+-------+--------------+-------------------+------------------+---------------------+------------------+



                                                                                

Silhouette Score : 0.5293196813077646

For k = 5


                                                                                

Customer Segment Summary:


                                                                                

+-------+--------------+-------------------+------------------+---------------------+------------------+
|cluster|customer_count|avg_purchase_amount|purchase_frequency|avg_purchase_quantity|category_diversity|
+-------+--------------+-------------------+------------------+---------------------+------------------+
|      1|          1824| 1546.1282482833617|109.86677631578948|   3.0239897157888804| 26.90734649122807|
|      3|          1110| 1545.2643157988434| 94.21171171171171|   3.0259338128028297| 24.52882882882883|
|      4|          1951| 1539.9769574025004| 93.99641209636084|    3.025220031587083|26.680676576114813|
|      2|          2297| 28288.262996056514| 93.31780583369613|    55.51650647508103|25.360905528950806|
|      0|          2818| 28395.146497610942|105.49680624556423|    55.65333883747527|27.026969481902057|
+-------+--------------+-------------------+------------------+---------------------+------------------+





Silhouette Score : 0.5189754506003516



                                                                                