In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
pd.set_option("display.max_columns",None)

In [2]:
data=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn-Copy1.csv")

In [3]:
data.shape

(7043, 21)

In [4]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
data["PhoneService"].value_counts()

Yes    6361
No      682
Name: PhoneService, dtype: int64

In [6]:
data=data.drop(columns="customerID")

In [7]:
data["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [8]:
### Data is imbalanced, so we use imbalanced techniques

In [9]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [10]:
data["Churn"]=np.where(data["Churn"]=="Yes",1,0)

In [11]:
data["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [12]:
data["SeniorCitizen"]=np.where(data["SeniorCitizen"]==0,"No","Yes")

In [13]:
data["TotalCharges"].value_counts(dropna=False)

20.2       11
           11
19.75       9
20.05       8
19.9        8
           ..
4391.45     1
369.05      1
5979.7      1
7238.6      1
3270.25     1
Name: TotalCharges, Length: 6531, dtype: int64

In [14]:
data.loc[data["TotalCharges"]==" ",["TotalCharges"]] 

Unnamed: 0,TotalCharges
488,
753,
936,
1082,
1340,
3331,
3826,
4380,
5218,
6670,


In [15]:
data.loc[data["TotalCharges"]==" ",["MonthlyCharges"]]

Unnamed: 0,MonthlyCharges
488,52.55
753,20.25
936,80.85
1082,25.75
1340,56.05
3331,19.85
3826,25.35
4380,20.0
5218,19.7
6670,73.35


In [16]:
data.loc[data["TotalCharges"]==" ",["TotalCharges"]]=data["MonthlyCharges"]

In [17]:
data.iloc[6650]

gender                           Female
SeniorCitizen                        No
Partner                              No
Dependents                           No
tenure                                6
PhoneService                        Yes
MultipleLines                        No
InternetService                      No
OnlineSecurity      No internet service
OnlineBackup        No internet service
DeviceProtection    No internet service
TechSupport         No internet service
StreamingTV         No internet service
StreamingMovies     No internet service
Contract                 Month-to-month
PaperlessBilling                     No
PaymentMethod              Mailed check
MonthlyCharges                     20.2
TotalCharges                     123.65
Churn                                 0
Name: 6650, dtype: object

In [18]:
data["TotalCharges"]=pd.to_numeric(data["TotalCharges"])

In [19]:
non_num_feat=[i for i in data.columns if data[i].dtypes=="O"]
non_num_feat

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [20]:
data=pd.get_dummies(data,columns=non_num_feat,drop_first=True)

In [21]:
data.shape

(7043, 31)

In [22]:
data.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,34,56.95,1889.5,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,2,53.85,108.15,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,45,42.3,1840.75,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4,2,70.7,151.65,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [23]:
x=data.drop(columns=["Churn"])
y=data["Churn"]

In [24]:
from sklearn.model_selection import cross_val_score,RandomizedSearchCV,train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from xgboost import XGBRFClassifier,XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTEENN,SMOTETomek
from collections import Counter

In [25]:
rf=RandomForestClassifier()
xg=XGBClassifier()
xrf=XGBRFClassifier()
sv=SVC()
kn=KNeighborsClassifier()

In [26]:
trainx,testx,trainy,testy=train_test_split(x,y,test_size=0.25,random_state=5)

In [27]:
trainx.shape,testx.shape,trainy.shape,testy.shape

((5282, 30), (1761, 30), (5282,), (1761,))

## SMOTEENN

In [28]:
sm=SMOTEENN()

In [29]:
trainxsm,trainysm=sm.fit_sample(trainx,trainy)

In [30]:
testxsm,testysm=sm.fit_sample(testx,testy)

In [31]:
Counter(trainy),Counter(trainysm)

(Counter({0: 3886, 1: 1396}), Counter({0: 2041, 1: 2394}))

In [32]:
scorerf=cross_val_score(rf,trainxsm,trainysm,cv=10,n_jobs=-1)
scorerf.mean()*100

95.80699774266365

In [33]:
scorexrf=cross_val_score(xrf,trainxsm,trainysm,cv=10,n_jobs=-1)
scorexrf.mean()*100

94.364081914872

In [34]:
scorexg=cross_val_score(xg,trainxsm,trainysm,cv=10,n_jobs=-1)
scorexg.mean()*100

96.14549651231368

In [35]:
scoresv=cross_val_score(sv,trainxsm,trainysm,cv=10,n_jobs=-1)
scoresv.mean()*100

79.52712870884427

In [36]:
scorekn=cross_val_score(kn,trainxsm,trainysm,cv=10,n_jobs=-1)
scorekn.mean()*100

97.09113741280785

In [37]:
rf.fit(trainxsm,trainysm)

y1=rf.predict(testxsm)
print(accuracy_score(testysm,y1)*100)
print(confusion_matrix(testysm,y1))
print(classification_report(testysm,y1))

93.11145510835914
[[560  31]
 [ 58 643]]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       591
           1       0.95      0.92      0.94       701

    accuracy                           0.93      1292
   macro avg       0.93      0.93      0.93      1292
weighted avg       0.93      0.93      0.93      1292



In [38]:
xrf.fit(trainxsm,trainysm)

y2=xrf.predict(testxsm)
print(accuracy_score(testysm,y2)*100)
print(confusion_matrix(testysm,y2))
print(classification_report(testysm,y2))



93.88544891640866
[[561  30]
 [ 49 652]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       591
           1       0.96      0.93      0.94       701

    accuracy                           0.94      1292
   macro avg       0.94      0.94      0.94      1292
weighted avg       0.94      0.94      0.94      1292



In [39]:
xg.fit(trainxsm,trainysm)

y3=xg.predict(testxsm)
print(accuracy_score(testysm,y3)*100)
print(confusion_matrix(testysm,y3))
print(classification_report(testysm,y3))

93.343653250774
[[568  23]
 [ 63 638]]
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       591
           1       0.97      0.91      0.94       701

    accuracy                           0.93      1292
   macro avg       0.93      0.94      0.93      1292
weighted avg       0.94      0.93      0.93      1292



In [40]:
sv.fit(trainxsm,trainysm)

y4=sv.predict(testxsm)
print(accuracy_score(testysm,y4)*100)
print(confusion_matrix(testysm,y4))
print(classification_report(testysm,y4))

82.58513931888545
[[482 109]
 [116 585]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.81       591
           1       0.84      0.83      0.84       701

    accuracy                           0.83      1292
   macro avg       0.82      0.83      0.82      1292
weighted avg       0.83      0.83      0.83      1292



In [41]:
kn.fit(trainxsm,trainysm)

y5=kn.predict(testxsm)
print(accuracy_score(testysm,y5)*100)
print(confusion_matrix(testysm,y5))
print(classification_report(testysm,y5))

88.08049535603715
[[538  53]
 [101 600]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       591
           1       0.92      0.86      0.89       701

    accuracy                           0.88      1292
   macro avg       0.88      0.88      0.88      1292
weighted avg       0.88      0.88      0.88      1292



## SMOTETomek

In [42]:
sk=SMOTETomek()

In [43]:
trainxsk,trainysk=sk.fit_sample(trainx,trainy)
testxsk,testysk=sk.fit_sample(testx,testy)

In [44]:
Counter(trainy),Counter(trainysk)

(Counter({0: 3886, 1: 1396}), Counter({0: 3573, 1: 3573}))

In [45]:
rf1=RandomForestClassifier()
xg1=XGBClassifier()
xrf1=XGBRFClassifier()
sv1=SVC()
kn1=KNeighborsClassifier()

In [46]:
scorerf1=cross_val_score(rf1,trainxsk,trainysk,cv=10,n_jobs=-1)
scorerf1.mean()*100

85.54768760651115

In [47]:
scorexrf1=cross_val_score(xrf1,trainxsk,trainysk,cv=10,n_jobs=-1)
scorexrf1.mean()*100

82.59379835850424

In [48]:
scorexg1=cross_val_score(xg1,trainxsk,trainysk,cv=10,n_jobs=-1)
scorexg1.mean()*100

85.71581359816655

In [49]:
scoresv1=cross_val_score(sv1,trainxsk,trainysk,cv=10,n_jobs=-1)
scoresv1.mean()*100

66.56892127480363

In [50]:
scorekn1=cross_val_score(kn1,trainxsk,trainysk,cv=10,n_jobs=-1)
scorekn1.mean()*100

80.64750935339171

In [51]:
rf1.fit(trainxsk,trainysk)

y11=rf1.predict(testxsk)
print(accuracy_score(testysk,y11)*100)
print(confusion_matrix(testysk,y11))
print(classification_report(testysk,y11))

82.48686514886164
[[981 161]
 [239 903]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1142
           1       0.85      0.79      0.82      1142

    accuracy                           0.82      2284
   macro avg       0.83      0.82      0.82      2284
weighted avg       0.83      0.82      0.82      2284



In [52]:
xrf1.fit(trainxsk,trainysk)

y22=xrf1.predict(testxsk)
print(accuracy_score(testysk,y22)*100)
print(confusion_matrix(testysk,y22))
print(classification_report(testysk,y22))



82.61821366024519
[[903 239]
 [158 984]]
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1142
           1       0.80      0.86      0.83      1142

    accuracy                           0.83      2284
   macro avg       0.83      0.83      0.83      2284
weighted avg       0.83      0.83      0.83      2284



In [53]:
xg1.fit(trainxsk,trainysk)

y33=xg1.predict(testxsk)
print(accuracy_score(testysk,y33)*100)
print(confusion_matrix(testysk,y33))
print(classification_report(testysk,y33))

82.9246935201401
[[983 159]
 [231 911]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1142
           1       0.85      0.80      0.82      1142

    accuracy                           0.83      2284
   macro avg       0.83      0.83      0.83      2284
weighted avg       0.83      0.83      0.83      2284



In [54]:
sv1.fit(trainxsk,trainysk)

y44=sv1.predict(testxsk)
print(accuracy_score(testysk,y44)*100)
print(confusion_matrix(testysk,y44))
print(classification_report(testysk,y44))

68.73905429071804
[[818 324]
 [390 752]]
              precision    recall  f1-score   support

           0       0.68      0.72      0.70      1142
           1       0.70      0.66      0.68      1142

    accuracy                           0.69      2284
   macro avg       0.69      0.69      0.69      2284
weighted avg       0.69      0.69      0.69      2284



In [55]:
kn1.fit(trainxsk,trainysk)

y55=kn1.predict(testxsk)
print(accuracy_score(testysk,y55)*100)
print(confusion_matrix(testysk,y55))
print(classification_report(testysk,y55))

70.88441330998249
[[845 297]
 [368 774]]
              precision    recall  f1-score   support

           0       0.70      0.74      0.72      1142
           1       0.72      0.68      0.70      1142

    accuracy                           0.71      2284
   macro avg       0.71      0.71      0.71      2284
weighted avg       0.71      0.71      0.71      2284



### The XGBRFClassifier of SMOTEENN gave a better Prediction result, so we will use it

In [56]:
import pickle

In [57]:
file="customer_churn_prediction.pkl"

pickle.dump(rf,open(file,"wb"))

In [58]:
model=pickle.load(open(file,"rb"))

In [59]:
model=pickle.load(open("customer_churn_prediction.pkl","rb"))

In [60]:
data.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Male',
       'SeniorCitizen_Yes', 'Partner_Yes', 'Dependents_Yes',
       'PhoneService_Yes', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No internet service',
       'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')