<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/PySpark-Analytics-Hub/blob/main/Customer%20Churn%20Analysis/Churn_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Enviroment

In [4]:
# !/bin/bash
%%bash
kaggle datasets download halimedogan/churn-dataset
unzip churn-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/halimedogan/churn-dataset
License(s): unknown
churn-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  churn-dataset.zip
  inflating: churn2.csv              


In [None]:
%%bash
# pip install sparkmagic

# Import Libraries

In [32]:
import pandas  as pd
import seaborn as sns
import numpy as np
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import (StringIndexer,
                                VectorAssembler,
                                OneHotEncoder,
                                StandardScaler,
                                Bucketizer)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Setup pandas option
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Setup Spark Session & Import DataFrame

In [9]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [13]:
spark_df = spark.read.csv('/content/churn2.csv',
                          header=True,
                          inferSchema=True)

# Data Analysis

In [14]:
spark_df.show(10)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [16]:
# Shape
print("Shape: ",(spark_df.count(), len(spark_df.columns)))

Shape:  (10000, 14)


In [17]:
# Check Schema
spark_df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [18]:
# Check Statistical Summary
spark_df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
RowNumber,10000,5000.5,2886.8956799071675,1,10000
CustomerId,10000,1.56909405694E7,71936.18612274907,15565701,15815690
Surname,10000,,,Abazu,Zuyeva
CreditScore,10000,650.5288,96.65329873613035,350,850
Geography,10000,,,France,Spain
Gender,10000,,,Female,Male
Age,10000,38.9218,10.487806451704587,18,92
Tenure,10000,5.0128,2.8921743770496837,0,10
Balance,10000,76485.88928799961,62397.40520238599,0.0,250898.09


In [19]:
# Check exited customer
spark_df.groupby('exited').count().show()

+------+-----+
|exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [20]:
# Check customers who have credit card
spark_df.groupby('HasCrCard').count().show()

+---------+-----+
|HasCrCard|count|
+---------+-----+
|        1| 7055|
|        0| 2945|
+---------+-----+



In [22]:
# Groupby exited customer with their estimated salayar and whether have credit card or no
spark_df.groupby("Exited").agg({"EstimatedSalary": "mean", 'HasCrCard': "count"}).show()

+------+----------------+--------------------+
|Exited|count(HasCrCard)|avg(EstimatedSalary)|
+------+----------------+--------------------+
|     1|            2037|   101465.6775306824|
|     0|            7963|   99738.39177194514|
+------+----------------+--------------------+



In [24]:
# Check Missing values
spark_df.select([f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


## Windowing Function

In [41]:
window_spec = Window.partitionBy("Geography").orderBy("CreditScore")

df_ranked = (
    spark_df
    .withColumn("Rank", f.rank().over(window_spec))
    .withColumn("DenseRank", f.dense_rank().over(window_spec))
    .withColumn("RowNumber", f.row_number().over(window_spec))
    )

df_ranked.select("Geography", "CreditScore", "Rank", "DenseRank", "RowNumber").show()

+---------+-----------+----+---------+---------+
|Geography|CreditScore|Rank|DenseRank|RowNumber|
+---------+-----------+----+---------+---------+
|   France|        350|   1|        1|        1|
|   France|        350|   1|        1|        2|
|   France|        350|   1|        1|        3|
|   France|        359|   4|        2|        4|
|   France|        373|   5|        3|        5|
|   France|        376|   6|        4|        6|
|   France|        405|   7|        5|        7|
|   France|        408|   8|        6|        8|
|   France|        408|   8|        6|        9|
|   France|        410|  10|        7|       10|
|   France|        410|  10|        7|       11|
|   France|        411|  12|        8|       12|
|   France|        411|  12|        8|       13|
|   France|        411|  12|        8|       14|
|   France|        411|  12|        8|       15|
|   France|        412|  16|        9|       16|
|   France|        413|  17|       10|       17|
|   France|        4

In [42]:
# Moving Avg
window_spec_avg = Window.partitionBy("geography").rowsBetween(-3, 3)

df_moving_avg = spark_df.withColumn("moving_avg_creditscore", f.avg("creditscore").over(window_spec_avg))

df_moving_avg.select("geography", "creditscore", "moving_avg_creditscore").show()

+---------+-----------+----------------------+
|geography|creditscore|moving_avg_creditscore|
+---------+-----------+----------------------+
|   France|        619|                 660.5|
|   France|        502|                 628.6|
|   France|        699|     637.8333333333334|
|   France|        822|     622.1428571428571|
|   France|        501|     601.7142857142857|
|   France|        684|     608.4285714285714|
|   France|        528|     612.2857142857143|
|   France|        476|     599.4285714285714|
|   France|        549|     623.4285714285714|
|   France|        726|     646.5714285714286|
|   France|        732|     653.5714285714286|
|   France|        669|     667.1428571428571|
|   France|        846|     647.4285714285714|
|   France|        577|     619.8571428571429|
|   France|        571|     583.1428571428571|
|   France|        411|                 609.0|
|   France|        533|     554.5714285714286|
|   France|        475|     551.5714285714286|
|   France|  

In [45]:
# Cumulative Sum of each Geography

window_spec_cumsum = Window.partitionBy("Geography").orderBy("Age").rowsBetween(Window.unboundedPreceding, 0)

df_cumsum = (
    spark_df
    .withColumn("Cumulative_Balance", f.sum("Balance").over(window_spec_cumsum))
)

df_cumsum.select("Geography", "Balance", "Age", "Cumulative_Balance").show()

+---------+---------+---+------------------+
|Geography|  Balance|Age|Cumulative_Balance|
+---------+---------+---+------------------+
|   France|160980.03| 18|         160980.03|
|   France|151762.74| 18|         312742.77|
|   France| 82767.42| 18|         395510.19|
|   France|133550.67| 18|         529060.86|
|   France|      0.0| 18|         529060.86|
|   France|      0.0| 18|         529060.86|
|   France| 176139.5| 18|         705200.36|
|   France| 98894.39| 18|         804094.75|
|   France|      0.0| 18|         804094.75|
|   France|      0.0| 18|         804094.75|
|   France|102983.91| 18|         907078.66|
|   France|128514.84| 19|         1035593.5|
|   France| 97445.49| 19|        1133038.99|
|   France|143390.51| 19|         1276429.5|
|   France|110928.51| 19|        1387358.01|
|   France|      0.0| 19|        1387358.01|
|   France|      0.0| 19|        1387358.01|
|   France|127649.64| 19|        1515007.65|
|   France|      0.0| 19|        1515007.65|
|   France

# Feature Engineering

In [27]:
spark_df = (
    spark_df
    .drop('RowNumber', 'CustomerId', 'Surname')
    .withColumn("CredictScore_Salary", spark_df.creditscore / spark_df.estimatedsalary)
    .withColumn("CredictScore_Tenure", spark_df.creditscore * spark_df.tenure)
    .withColumn("Balance_Salary", spark_df.balance / spark_df.estimatedsalary)
)

spark_df.show(10)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|0.004406153623618

In [30]:
# Bucketization
bucketizer = Bucketizer(splits=[0, 35, 55, 75, 95],
                        inputCol="age",
                        outputCol="age_cat")
spark_df = bucketizer.setHandleInvalid("keep").transform(spark_df)

spark_df = spark_df.withColumn("age_cat", spark_df.age_cat + 1)
spark_df = spark_df.withColumn("age_cat", spark_df["age_cat"].cast("integer"))
# Show
spark_df.show(10)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|

# Features Vectorization

In [48]:
indexer = StringIndexer(inputCol="gender",
                        outputCol="gender_label")
indexer.fit(spark_df).transform(spark_df).show(5)

temp_sdf = indexer.fit(spark_df).transform(spark_df)

spark_df = temp_sdf.withColumn("gender_label", temp_sdf["gender_label"].cast("integer"))
spark_df = spark_df.drop('gender')

In [49]:
spark_df.show(10)

+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+
|creditscore|geography|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|
+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+
|        619|   France| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|           1|
|        608|    Spain| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|           1|
|        502|   France| 42|     8| 159660.8|            3|  

In [50]:
indexer = StringIndexer(inputCol="geography",
                        outputCol="geography_label")
indexer.fit(spark_df).transform(spark_df).show(5)

temp_sdf = indexer.fit(spark_df).transform(spark_df)

+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|creditscore|geography|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|geography_label|
+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|        619|   France| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|           1|            0.0|
|        608|    Spain| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|          

In [51]:
spark_df = temp_sdf.withColumn("geography_label", temp_sdf["geography_label"].cast("integer"))
spark_df = spark_df.drop('geography')

In [52]:
spark_df.show(5)

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|geography_label|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|           1|              0|
|        608| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|                608|0.7446769036217226|      2|           1|              2|
|        502| 42|     8| 15966

In [53]:
# One Hot Encoding
encoder = OneHotEncoder(inputCols=["age_cat", "geography_label"],
                        outputCols=["age_cat_ohe", "geography_label_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

spark_df.show()

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+-------------+-------------------+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited| CredictScore_Salary|CredictScore_Tenure|    Balance_Salary|age_cat|gender_label|geography_label|  age_cat_ohe|geography_label_ohe|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+-------------------+------------------+-------+------------+---------------+-------------+-------------------+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|               1238|               0.0|      2|           1|              0|(4,[2],[1.0])|      (2,[0],[1.0])|
|        608| 41|     1| 83807.86|            1|        0|             1|      112542.58