# Load Data

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The fs.defaultFS
# setting is presumably there so that bare paths such as the CSV below are
# resolved on the local filesystem -- TODO confirm against the cluster setup.
spark = (
    SparkSession.builder
    .appName("model")
    .config("fs.defaultFS", "file:///")
    .getOrCreate()
)

# Only log errors so Spark's console chatter does not bury the cell output.
spark.sparkContext.setLogLevel("ERROR")

# header=True takes the column names from the first row of the file. No schema
# is supplied and inferSchema is left at its default, so every column is
# loaded as a string.
csv_path = "/data/BigData/ecommerce_data_with_trends.csv"
df = spark.read.csv(csv_path, header=True)

24/12/08 02:06:40 WARN Utils: Your hostname, flo-ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.1.162 instead (on interface wlp3s0)
24/12/08 02:06:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/08 02:06:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import isnan, when, count, col

# Data-quality check: for every column, count the rows whose value is either
# null or NaN, and display the counts as a single-row table.
missing_counts = []
for name in df.columns:
    # A row counts as missing when the value is null or NaN.
    is_missing = col(name).isNull() | isnan(name)
    # when() yields a non-null marker only for missing rows; count() ignores
    # the nulls produced for all other rows.
    missing_counts.append(count(when(is_missing, name)).alias(name))

df.select(missing_counts).show()



+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+
|transaction_id|timestamp|customer_id|customer_name|city|customer_type|product_name|category|price|quantity|total_amount|
+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+
|             0|        0|          0|            0|   0|            0|           0|       0|    0|       0|           0|
+--------------+---------+-----------+-------------+----+-------------+------------+--------+-----+--------+------------+



                                                                                

# Churn Analysis

In [3]:
from churn import CustomerChurnMLAnalysis

# Run the churn analysis on the session and DataFrame created in the
# "Load Data" section. The following cell indexes `results` by model name
# ('logistic_regression' and 'random_forest').
ml_analysis = CustomerChurnMLAnalysis(spark, df)
results = ml_analysis.predict_customer_churn()

                                                                                

In [4]:
# Side-by-side comparison of the two churn models, one result entry at a time.
lr = results['logistic_regression']
rf = results['random_forest']

# Entries that are displayed with .show(); every other entry is printed as-is.
dataframe_keys = ('predictions', 'churn_summary', 'feature_importance')

for key in rf.keys():
    print(key)
    if key in dataframe_keys:
        # Preview the first 5 rows for each model, logistic regression first.
        for label, model_results in (('Logistic Regression: ', lr), ('Random Forest: ', rf)):
            print(label)
            model_results[key].show(5)
    else:
        print('Logistic Regression: ', lr[key])
        print('Random Forest: ', rf[key])
    print('')

# The loop only previews 5 rows; show the logistic-regression feature table
# again without that limit.
lr['feature_importance'].show()


model
Logistic Regression:  LogisticRegressionModel: uid=LogisticRegression_a46cc214e560, numClasses=2, numFeatures=5869
Random Forest:  RandomForestClassificationModel: uid=RandomForestClassifier_4a8f386d610d, numTrees=100, numClasses=2, numFeatures=5869

predictions
Logistic Regression: 


                                                                                

+-----------+--------------------+--------------+-------------+------------------+----------------------+---------------------+--------------------+-------------------------+-----------------+------------+-----+------------------------+-------------------------------+-----------------------+-------------------+----------+--------------------------+---------------------------------+-------------------------+---------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+
|customer_id|       customer_name|          city|customer_type|total_transactions|avg_transaction_amount|total_purchase_amount|  last_purchase_date|unique_products_purchased|unique_categories|recency_days|churn|total_transactions_index|unique_products_purchased_index|unique_categories_index|customer_type_index|city_index|total_transactions_encoded|unique_products_purchased_encoded|unique_categories_encoded|customer_type_encoded|       city_encoded|

                                                                                

+-----------+--------------------+--------------+-------------+------------------+----------------------+---------------------+--------------------+-------------------------+-----------------+------------+-----+------------------------+-------------------------------+-----------------------+-------------------+----------+--------------------------+---------------------------------+-------------------------+---------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+
|customer_id|       customer_name|          city|customer_type|total_transactions|avg_transaction_amount|total_purchase_amount|  last_purchase_date|unique_products_purchased|unique_categories|recency_days|churn|total_transactions_index|unique_products_purchased_index|unique_categories_index|customer_type_index|city_index|total_transactions_encoded|unique_products_purchased_encoded|unique_categories_encoded|customer_type_encoded|       city_encoded|

                                                                                

+-----+-----+-----------------+-------------------+------------------+
|churn|count| avg_transactions|avg_purchase_amount|  avg_recency_days|
+-----+-----+-----------------+-------------------+------------------+
|    1| 1488|99.51948924731182|  179854.8714247312|43.656586021505376|
|    0| 1451|99.86078566505859| 2833506.6143418327|43.648518263266716|
+-----+-----+-----------------+-------------------+------------------+

Random Forest: 


                                                                                

+-----+-----+-----------------+-------------------+------------------+
|churn|count| avg_transactions|avg_purchase_amount|  avg_recency_days|
+-----+-----+-----------------+-------------------+------------------+
|    1| 1488|99.51948924731182|  179854.8714247312|43.656586021505376|
|    0| 1451|99.86078566505859| 2833506.6143418327|43.648518263266716|
+-----+-----+-----------------+-------------------+------------------+


+--------------------+--------------------+
|             feature|          importance|
+--------------------+--------------------+
|  total_transactions|  0.9858087891895526|
|avg_transaction_a...|   0.979533797525071|
|total_purchase_am...| 0.09788911977499148|
|                city|0.022426256493271433|
|       customer_type|0.011639538027176231|
|unique_products_p...|0.010984638539580021|
|        recency_days| 0.00796190357723674|
|   unique_categories|0.002555806199948...|
+--------------------+--------------------+



# Customer Segmentation

In [5]:
from customer_seg import CustomerSegMLAnalysis

# Segment the customers with k=10. The call returns two values: the clustered
# customers DataFrame and a score that this cell prints.
# NOTE(review): the recorded run printed a negative value (about -0.06). If
# this is a standard silhouette score, that suggests the clusters are poorly
# separated -- confirm the choice of k and the features used.
ml_analysis = CustomerSegMLAnalysis(spark, df)
segmentation = ml_analysis.cluster_customers(k=10)
clustered_customers_df, silhouette = segmentation
print(silhouette)

                                                                                

-0.061620028883509544


In [6]:
# Display the customers assigned to each cluster id 0..9, one table per
# cluster. The upper bound mirrors the k=10 used when clustering above.
# show() is called without arguments, so only Spark's default number of rows
# is printed for each cluster.
cluster_col = clustered_customers_df["cluster"]
for cluster_id in range(10):
    print(f"Cluster {cluster_id}:")
    cluster_members = clustered_customers_df.filter(cluster_col == cluster_id)
    cluster_members.show()

Cluster 0:
+-----------+--------------------+-------+--------------------+--------------------+
|customer_id|       customer_name|cluster|         top_product|   spending_variance|
+-----------+--------------------+-------+--------------------+--------------------+
|       1001|      Nicholas Wolfe|      0|Accessories Produ...|4.7139714837479126E8|
|       1009|  Mr. Martin Hammond|      0|Supplements Produ...|  1233305.8830931948|
|       1015|        William Tran|      0|     Audio Product_7|5.1819040292093205E8|
|       1029|     Nicole Carrillo|      0|Kitchen Appliance...|  1519026.2154582455|
|       1047|        Nathan Smith|      0|Personal Care Pro...| 4.597733439227818E8|
|       1049|         Julia Hines|      0|Home Decor Product_9| 5.657236510934278E8|
|        105|      Elizabeth Ruiz|      0|   Bedding Product_8|  1416754.4615485955|
|       1050|Christopher Anderson|      0|  Cameras Product_10|  1532092.4847565852|
|       1053|      Zachary Morris|      0|     Shoes P