In [1]:
import csv
import pandas as pd
import numpy as np

csvfile = 'WA_Fn-UseC_-Telco-Customer-Churn.csv' 
df = pd.read_csv(csvfile)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [2]:
#converting TotalCharges column from string to float
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

#dropping null entries and the customerID col
df = df.dropna()
df = df.drop(columns='customerID')

In [3]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
#creating dummy variables to resolve categorical features

categ_feats = list(df.columns.values)
list_to_remove = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
categ_feats = list(set(categ_feats).difference(set(list_to_remove)))
df = pd.get_dummies(df, columns=categ_feats)

In [5]:
pd.pandas.set_option('display.max_columns', None)
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,gender_Female,gender_Male,Dependents_No,Dependents_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,PaperlessBilling_No,PaperlessBilling_Yes,SeniorCitizen_0,SeniorCitizen_1,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,PhoneService_No,PhoneService_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Partner_No,Partner_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,No,1,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0
1,34,56.95,1889.5,No,0,0,1,0,1,0,1,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1
2,2,53.85,108.15,Yes,1,0,0,1,0,0,0,0,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1
3,45,42.3,1840.75,No,0,0,1,0,1,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0
4,2,70.7,151.65,Yes,1,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0


In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

#pulling put numerical features for normalisation
numric_feats = ['tenure', 'MonthlyCharges', 'TotalCharges']
df_numeric_feats = pd.DataFrame(df, columns=numric_feats)
df_categ_feats = df.drop(columns=numric_feats)

#normalising numeric features and converting them back to dataframe
min_max_scaler = preprocessing.MinMaxScaler()
normalised_numeric_feats = min_max_scaler.fit_transform(df_numeric_feats)
normalised_numeric_feats = pd.DataFrame(normalised_numeric_feats, 
                                        columns=numric_feats, index=df_categ_feats.index)

#creating new dataframe with categorical features and the normalised numeric features
df_numeric_norm = pd.concat([df_categ_feats, normalised_numeric_feats], axis=1)

In [7]:
# splitting normalized X data into train and test sets
X_normalized = df_numeric_norm.drop('Churn', axis=1).values
y = df_numeric_norm['Churn'].values
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=21)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Normalized KNN classifier: identifying best k value with GridSearchCV
param_grid = {'n_neighbors': np.arange(1, 30)}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)

print('k-NN best n_neighbors:', knn_cv.best_params_, '\n')

# predicted values
y_pred_knn_test = knn_cv.predict(X_test)

# predicted values
y_pred_knn_test = knn_cv.predict(X_test)

k-NN best n_neighbors: {'n_neighbors': 27} 



In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print('k-NN test set confusion matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_pred_knn_test), index=['actual: no churn', 'actual: churn'], columns=['pred: no churn', 'pred: churn']), '\n')

print('k-NN test set classification report:')
print(classification_report(y_test, y_pred_knn_test))

k-NN test set confusion matrix:
                  pred: no churn  pred: churn
actual: no churn             883          124
actual: churn                187          213 

k-NN test set classification report:
              precision    recall  f1-score   support

          No       0.83      0.88      0.85      1007
         Yes       0.63      0.53      0.58       400

    accuracy                           0.78      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.77      0.78      0.77      1407



In [10]:
# Logistic Regression Classifier (L1 regularisation)

from sklearn.linear_model import LogisticRegression

param_grid = {'C': np.arange(0.5,5,0.5)}

logreg_L1 = LogisticRegression(penalty='l1', solver='liblinear')
logreg_L1_cv = GridSearchCV(logreg_L1, param_grid, cv=5)
logreg_L1_cv.fit(X_train, y_train)

print('Lasso Reg best C value', logreg_L1_cv.best_params_, '\n')

y_pred_L1_test = logreg_L1_cv.predict(X_test)

print('Lasso Reg test set confusion matrix:')

print(pd.DataFrame(confusion_matrix(y_test, y_pred_L1_test), index=[
    'actual:no churn', 'actual: churn'], columns=['pred: no churn', 'pred:churn']), '\n')

print('Lasso Reg test set classification report:')
print(classification_report(y_test, y_pred_L1_test))

Lasso Reg best C value {'C': 1.0} 

Lasso Reg test set confusion matrix:
                 pred: no churn  pred:churn
actual:no churn             909          98
actual: churn               197         203 

Lasso Reg test set classification report:
              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1007
         Yes       0.67      0.51      0.58       400

    accuracy                           0.79      1407
   macro avg       0.75      0.71      0.72      1407
weighted avg       0.78      0.79      0.78      1407



In [11]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


param_grid_rf = {'n_estimators': np.arange(10, 2000, 10), 
                 'max_features': ['auto', 'sqrt'],
                 'max_depth': np.arange(10, 200, 10),
                 'criterion': ['gini', 'entropy'],
                 'bootstrap': [True, False]}

rf = RandomForestClassifier()
rf_random_grid = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf, cv=5)
rf_random_grid.fit(X_train, y_train)

y_pred_rf_test = rf_random_grid.predict(X_test)

print('Tuned Random Forest Params:', rf_random_grid.best_params_, '\n')
print('Tuned Random Forest score is {}.'.format(rf_random_grid.best_score_.round(3)), '\n')

print('Tuned Random Forest test set confusion matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_pred_rf_test), index=['actual: no churn', 'actual: churn'], columns=['pred: no churn', 'pred: churn']), '\n')
print('Tuned Random Forest test set classification report:')
print(classification_report(y_test, y_pred_rf_test), '\n')




Tuned Random Forest Params: {'n_estimators': 1470, 'max_features': 'sqrt', 'max_depth': 140, 'criterion': 'gini', 'bootstrap': True} 

Tuned Random Forest score is 0.787. 

Tuned Random Forest test set confusion matrix:
                  pred: no churn  pred: churn
actual: no churn             913           94
actual: churn                207          193 

Tuned Random Forest test set classification report:
              precision    recall  f1-score   support

          No       0.82      0.91      0.86      1007
         Yes       0.67      0.48      0.56       400

    accuracy                           0.79      1407
   macro avg       0.74      0.69      0.71      1407
weighted avg       0.77      0.79      0.77      1407
 



In [17]:
# using oversampling and undersampling to address data imbalance

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

classifiers_dict = {RandomForestClassifier:'rf', LogisticRegression:'lr', KNeighborsClassifier:'knn'}

# function that builds pipeline for each classifier using RandomOverSampler
def oversampling(classifiers_dict):
    
    for key, value in classifiers_dict.items():
        globals()['over_' + value + '_pipeline'] = make_pipeline_imb(RandomOverSampler(random_state=4), key())
        globals()['over_' + value + '_model'] = globals()['over_' + value + '_pipeline'].fit(X_train, y_train)

        globals()['y_pred_over_' + value + '_test'] = globals()['over_' + value + '_model'].predict(X_test)
        globals()['X_over_' + value + '_resample'], globals()['y_over_' + '_resample'] = RandomOverSampler().fit(X_train, y_train)

# function that builds pipeline for each classifier using RandomUnderSampler
def undersampling(classifiers_dict):
    
    for key, value in classifiers_dict.items():
        globals()['under_' + value + '_pipeline'] = make_pipeline_imb(RandomUnderSampler(random_state=4), key())
        globals()['under_' + value + '_model'] = globals()['under_' + value + '_pipeline'].fit(X_train, y_train)

        globals()['y_pred_under_' + value + '_test'] = globals()['under_' + value + '_model'].predict(X_test)
        globals()['X_under_' + value + '_resample'], globals()['y_under_' + '_resample'] = RandomUnderSampler().fit_sample(X_train, y_train)

oversampling(classifiers_dict)
undersampling(classifiers_dict)

TypeError: cannot unpack non-iterable RandomOverSampler object

In [18]:
# creating table of the classification reports for all algorithms

from sklearn.metrics import precision_recall_fscore_support as score

algorithm_dict = {'KNN':y_pred_knn_test, 
                  'Log Reg':y_pred_L1_test, 
                  'Random Forest':y_pred_rf_test,
                  }

nochurn_class_report_df = pd.DataFrame(index=algorithm_dict.keys(), columns=['Precision', 'Recall', 'F-score', 'Support'])
churn_class_report_df = pd.DataFrame(index=algorithm_dict.keys(), columns=['Precision', 'Recall', 'F-score', 'Support'])

for key, value in algorithm_dict.items():
    
    precision, recall, fscore, support = score(y_test, value)
    
    nochurn_class_report_df.loc[key, 'Precision'] = precision[0].round(4)
    nochurn_class_report_df.loc[key, 'Recall'] = recall[0].round(4)
    nochurn_class_report_df.loc[key, 'F-score'] = fscore[0].round(4)
    nochurn_class_report_df.loc[key, 'Support'] = support[0]
    
    churn_class_report_df.loc[key, 'Precision'] = precision[1].round(4)
    churn_class_report_df.loc[key, 'Recall'] = recall[1].round(4)
    churn_class_report_df.loc[key, 'F-score'] = fscore[1].round(4)
    churn_class_report_df.loc[key, 'Support'] = support[1]

print('Classification report results for non-churn customers:')
print(nochurn_class_report_df, '\n')
    
print('Classification report results for churn customers:')
print(churn_class_report_df, '\n')

Classification report results for non-churn customers:
              Precision  Recall F-score Support
KNN              0.8252  0.8769  0.8503    1007
Log Reg          0.8219  0.9027  0.8604    1007
Random Forest    0.8152  0.9067  0.8585    1007 

Classification report results for churn customers:
              Precision  Recall F-score Support
KNN               0.632  0.5325   0.578     400
Log Reg          0.6744  0.5075  0.5792     400
Random Forest    0.6725  0.4825  0.5619     400 



In [13]:
from imblearn.over_sampling import RandomOverSampler