In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [122]:
df = pd.read_csv('../dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [123]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [125]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [126]:
df.dropna(how='any', inplace=True)

In [127]:
df.shape

(7032, 21)

In [128]:
df.columns.to_list()

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [129]:
df['Churn'].value_counts() / len(df) * 100

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

In [130]:
x = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']

In [131]:
x.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [132]:
x = pd.get_dummies(x, columns=[
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
], dtype=int)

In [133]:
x.head(1)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,1,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0


In [134]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [135]:
len(x_train)

5274

In [136]:
len(x_test)

1758

In [137]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)

In [138]:
x_train_sc

array([[-0.4423203 ,  1.39556509,  1.54470712, ..., -0.53217636,
        -0.70318871, -0.54711499],
       [ 2.26080514, -0.59076662,  0.67256079, ..., -0.53217636,
         1.42209338, -0.54711499],
       [-0.4423203 ,  0.38213054, -0.28596957, ...,  1.87907632,
        -0.70318871, -0.54711499],
       ...,
       [-0.4423203 , -0.30700495,  1.33705323, ..., -0.53217636,
        -0.70318871, -0.54711499],
       [-0.4423203 ,  1.55771462,  0.73568757, ..., -0.53217636,
        -0.70318871, -0.54711499],
       [-0.4423203 , -0.34754233, -1.46045996, ..., -0.53217636,
        -0.70318871,  1.82776933]])

### KNN

In [139]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

model = KNeighborsClassifier()

model.fit(x_train_sc, y_train)

In [140]:
y_pred = model.predict(x_test_sc)
print(accuracy_score(y_test, y_pred))

0.7531285551763367


In [141]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.84      0.83      0.83      1300
         Yes       0.53      0.55      0.54       458

    accuracy                           0.75      1758
   macro avg       0.68      0.69      0.68      1758
weighted avg       0.76      0.75      0.75      1758



### Decision Tree Classifier

In [142]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()

model_dt.fit(x_train_sc, y_train)

In [143]:
y_pred_dt = model_dt.predict(x_test_sc)
print(accuracy_score(y_test, y_pred_dt))

0.7167235494880546


In [144]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

          No       0.82      0.79      0.80      1300
         Yes       0.46      0.51      0.49       458

    accuracy                           0.72      1758
   macro avg       0.64      0.65      0.65      1758
weighted avg       0.73      0.72      0.72      1758



### Random Forest

In [145]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=200)

model_rf.fit(x_train_sc, y_train)

In [146]:
y_pred_rf = model_rf.predict(x_test_sc)
print(accuracy_score(y_test, y_pred_rf))

0.7963594994311718


In [147]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

          No       0.83      0.90      0.87      1300
         Yes       0.64      0.49      0.56       458

    accuracy                           0.80      1758
   macro avg       0.74      0.70      0.71      1758
weighted avg       0.78      0.80      0.79      1758



### Naive Bayes

In [148]:
from sklearn.naive_bayes import BernoulliNB

model_nb = BernoulliNB()
model_nb.fit(x_train_sc, y_train)

y_pred_nb = model_dt.predict(x_test_sc)
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

          No       0.82      0.79      0.80      1300
         Yes       0.46      0.51      0.49       458

    accuracy                           0.72      1758
   macro avg       0.64      0.65      0.65      1758
weighted avg       0.73      0.72      0.72      1758



### SVM

In [149]:
from sklearn.svm import SVC

model_sv = SVC()
model_sv.fit(x_train_sc, y_train)

y_pred_sv = model_sv.predict(x_test_sc)
print(classification_report(y_test, y_pred_sv))

              precision    recall  f1-score   support

          No       0.84      0.90      0.87      1300
         Yes       0.65      0.51      0.57       458

    accuracy                           0.80      1758
   macro avg       0.75      0.71      0.72      1758
weighted avg       0.79      0.80      0.79      1758



### Logistic Regression

In [150]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(x_train_sc, y_train)

y_pred_lr = model_lr.predict(x_test_sc)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

          No       0.85      0.90      0.88      1300
         Yes       0.66      0.56      0.61       458

    accuracy                           0.81      1758
   macro avg       0.76      0.73      0.74      1758
weighted avg       0.80      0.81      0.81      1758

