In [368]:
sc

In [369]:
spark

In [370]:
from pyspark.sql.functions import *

#### 1. Read DataSet

In [371]:
telecom_data = spark.read.csv("file:///home/hadoop/Downloads/Telco_Customer_Churn.csv", inferSchema=True, header=True)
telecom_data.head()

Row(customerID='7590-VHVEG', gender='Female', SeniorCitizen=0, Partner='Yes', Dependents='No', tenure=1, PhoneService='No', MultipleLines='No phone service', InternetService='DSL', OnlineSecurity='No', OnlineBackup='Yes', DeviceProtection='No', TechSupport='No', StreamingTV='No', StreamingMovies='No', Contract='Month-to-month', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No')

In [372]:
telecom_data.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

In [373]:
telecom_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



####  1. Data Exploration:
    • How many records are there in the dataset?

In [374]:
telecom_data.count()

7043

##### * What is the distribution of gender among customers?

In [375]:
telecom_data.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



##### • What is the distribution of contract types among customers?

In [376]:
telecom_data.groupBy('Contract').count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



##### • What is the percentage of customers who churned?

In [377]:
churn_count = telecom_data.select('Churn').where("Churn = 'Yes'").count()
# print(churn_count)
total = telecom_data.count()
# print(total)
percent=churn_count/total*100
print(percent)

26.536987079369588


####     2. Data Preprocessing:
    • Check for missing values and handle them if any.
    

In [378]:
telecom_data.select([count(when(isnull(col),col)).alias(col) for col in telecom_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

* Convert categorical variables into numerical format using one-hot encoding or label encoding.
 

In [379]:
telecom_data = telecom_data.withColumn('TotalCharges',when(col('TotalCharges')==" ",None)\
                                       .otherwise(col('TotalCharges')))

* Split the dataset into training and testing sets.

In [380]:
telecom_data1 = telecom_data.na.drop()

In [381]:
from pyspark.sql.types import FloatType, IntegerType
churn_data = telecom_data1.withColumn("Churn", regexp_replace('Churn','Yes',"0"))
churn_data = churn_data.withColumn("Churn", regexp_replace('Churn','No',"1"))
churn_data = churn_data.withColumn('Churn', col('Churn').cast(IntegerType()))
churn_data = churn_data.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))


In [382]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: float (nullable = true)
 |-- Churn: integer (nullable = true)



In [383]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator , VectorAssembler
from pyspark.ml import Pipeline

In [384]:
print(churn_data.columns)

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [385]:
categorical_cols = ['gender','SeniorCitizen','Partner','Dependents',\
                    'PhoneService','MultipleLines','InternetService','OnlineSecurity',\
                    'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',\
                    'Contract','PaperlessBilling','PaymentMethod']
#use stringIndexer to convert categorical column to numerical indexes
stages = []
for catcols in categorical_cols:
    stringindexer = StringIndexer(inputCol=catcols, outputCol=catcols+"_Index")
    onehotencoder = OneHotEncoderEstimator(inputCols=[stringindexer.getOutputCol()], 
                                           outputCols=[catcols+"classVec"])
    stages += [stringindexer, onehotencoder]

In [386]:
numeric_cols = ['tenure','MonthlyCharges', 'TotalCharges']
inputs = [c + 'classVec' for c in categorical_cols] + numeric_cols
assembler = VectorAssembler(inputCols=inputs, outputCol='features')
stages+=[assembler]

In [387]:
# label_stringIdx = StringIndexer(inputCol='Churn', outputCol='label')
# stages += [label_stringIdx] 

In [388]:
print(stages)

[StringIndexer_9da55f0d9b59, OneHotEncoderEstimator_b1c7c72d966f, StringIndexer_74d49faad6d3, OneHotEncoderEstimator_ecf506e5ffa4, StringIndexer_1f15dd1a4612, OneHotEncoderEstimator_40407a0b4566, StringIndexer_29bea8e13a90, OneHotEncoderEstimator_7cc511d99591, StringIndexer_968a8cb3804e, OneHotEncoderEstimator_21b70b7e2b49, StringIndexer_5055b85e4e7a, OneHotEncoderEstimator_313cd9d4e916, StringIndexer_500643ddd6a3, OneHotEncoderEstimator_cc326893470a, StringIndexer_f632873858f2, OneHotEncoderEstimator_b339d18147cc, StringIndexer_dbe224106307, OneHotEncoderEstimator_c971b2c3bc94, StringIndexer_0aadad15f326, OneHotEncoderEstimator_bda7c3417ddd, StringIndexer_0b75e556edf1, OneHotEncoderEstimator_ff3b53a4c17a, StringIndexer_904b6f6baf9c, OneHotEncoderEstimator_1c9f171b61eb, StringIndexer_88abf0640c65, OneHotEncoderEstimator_c08182a74a4e, StringIndexer_d0479087bb00, OneHotEncoderEstimator_7526844f7365, StringIndexer_9c0d0fe0a2fd, OneHotEncoderEstimator_223eb6114be3, StringIndexer_79b7e210dd

In [389]:
pipeline = Pipeline(stages=stages)
churn_data = pipeline.fit(churn_data).transform(churn_data)
churn_data.select(["features","Churn"]).show(truncate = False)


+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                         |Churn|
+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[1,3,8,9,12,13,15,17,19,21,23,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                   |1    |
|(30,[0,1,2,3,4,5,8,10,11,14,15,17,19,25,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])                   |1    |
|[1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,53.85,108.1500015258789]        |0    |
|(30,[0,1,2,3,8,10,11,14,16,17,19,26,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,

##### model split

In [390]:
train, test = churn_data.randomSplit([0.8, 0.2], seed = 2)

In [391]:
train.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+------------+--------------+-------------------+---------------------+-------------+---------------+----------------+------------------+------------------+--------------------+-------------------+---------------------+---------------------+-----------------------+--------------------+----------------------+------------------+--------------------+----------------------+------------------------+-----------------+-------------------+-----------------+-------------------+---------------------+-----------------------+--------------+----------------+----------------------+------------------------+-------------------+---------------------+--------------------+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|Ph

In [392]:
train.select(['features', 'Churn'])

DataFrame[features: vector, Churn: int]

##### Decision tree classifier

In [404]:
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegressionModel, RandomForestClassificationModel
tree = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'Churn')
decision_model = tree.fit(train)

In [405]:
#prediction on test data
predictions = decision_model.transform(test)

In [406]:
predictions.select(['features','Churn','probability','prediction']).show(truncate = False)

+-------------------------------------------------------------------------------------------------------------------------------------------+-----+----------------------------------------+----------+
|features                                                                                                                                   |Churn|probability                             |prediction|
+-------------------------------------------------------------------------------------------------------------------------------------------+-----+----------------------------------------+----------+
|(30,[3,4,5,7,10,12,14,16,18,20,22,23,26,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,71.0,109.7,7904.25])                |1    |[0.0682261208576998,0.9317738791423001] |1.0       |
|(30,[1,4,6,7,10,12,14,16,18,20,22,23,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,72.0,116.8,8456.75])                       |1    |[0.0682261208576998,0.9317738791423001] |1.0       |


In [407]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [408]:
accuracy

0.7941176470588235

#### Logistic Regression

In [409]:
logit = LogisticRegression(featuresCol='features', labelCol='Churn')
logit_model = logit.fit(train)

In [410]:
predictions = logit_model.transform(test)

In [411]:
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.8122171945701357

In [412]:
rf_clf = RandomForestClassifier(featuresCol="features", labelCol='Churn')
rf_model = rf_clf.fit(train)

In [415]:
predictions = rf_model.transform(test)

In [416]:
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

0.8009049773755657