In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix, f1_score, roc_curve, auc, classification_report, confusion_matrix, precision_recall_curve, recall_score, precision_score, accuracy_score, roc_auc_score, make_scorer

In [2]:
# Load the raw churn spreadsheet; the path is relative to the notebook's working directory.
churn_data = pd.read_excel('Churn_Data.xlsx')
# Preview the first rows to check which columns were read.
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,0,Yes


In [3]:
# One-hot encode the categorical features.
# Every object-dtype column is encoded except customerID, TotalCharges
# (cast to float in a later cell) and the Churn target. drop_first=True
# omits one dummy per feature.

cols = churn_data.columns.to_list()

object_cols = [
    col for col in cols
    if col not in ('customerID', 'TotalCharges', 'Churn') and churn_data[col].dtype == object
]

churn_data = pd.get_dummies(churn_data, columns=object_cols, drop_first=True)
churn_data.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn,gender_Male,Partner_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,0,0,No,0,1,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,0,34,56.95,1889.5,0,0,No,1,0,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,2,53.85,108.15,0,0,Yes,1,0,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,0,45,42.3,1840.75,0,3,No,1,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,0,2,70.7,151.65,0,0,Yes,0,0,...,0,0,0,0,0,0,1,0,1,0


In [4]:
# Row / column count of the encoded frame.
churn_data.shape

(7043, 34)

In [5]:
# Handling missing data
#
# Missing TotalCharges values are stored as blank strings. Drop every row whose
# TotalCharges is empty or whitespace-only (not only a single space), so the
# float cast in the next cell cannot fail on a blank entry. A filtered copy is
# assigned instead of dropping in place; re-running the cell is a no-op.

blank_total_charges = churn_data['TotalCharges'].astype(str).str.strip() == ''
churn_data = churn_data.loc[~blank_total_charges].copy()
churn_data.shape

(7032, 34)

In [6]:
# The rows with a blank TotalCharges were removed in the previous cell, so the column can be cast to float.
churn_data['TotalCharges'] = churn_data['TotalCharges'].astype(float)

In [7]:
# Transforming the target variable from No/Yes to 0/1
#
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on the selected column is deprecated in recent pandas and leaves churn_data
# unchanged when Copy-on-Write is enabled. The explicit int cast guarantees a
# numeric target and fails loudly if any value other than No/Yes (or 0/1 on a
# re-run) is present.

churn_data['Churn'] = churn_data['Churn'].replace({'No': 0, 'Yes': 1}).astype(int)

In [8]:
# Target vector, then the feature matrix: everything except the customerID and Churn columns.
y = churn_data['Churn']
X = churn_data.drop(columns=['customerID', 'Churn'])

In [9]:
# Standardising the numeric data
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so test-set statistics influence the scaled training
# features. Consider fitting it on the training rows only.

numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'numAdminTickets', 'numTechTickets']

# Fit once, then transform the same columns.
scaler = StandardScaler().fit(X[numeric_columns])
X[numeric_columns] = scaler.transform(X[numeric_columns])

X.shape

(7032, 32)

In [10]:
# Train/Test split with stratify on the target variable to handle the target class imbalance
# 20% of the rows are held out; random_state fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Print each shape with its own statement. The previous single line built a
# tuple of print() calls, so the cell also displayed (None, None, None, None).
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

(5625, 32)
(1407, 32)
(5625,)
(1407,)


(None, None, None, None)

In [11]:
# Function to determine the optimal threshold for individual model

def find_threshold(classifier, test_data, test_labels):
    """Find the probability cut-off that maximises the F1 score on the given data.

    The candidate cut-offs are the thresholds returned by ``roc_curve`` for the
    positive-class probabilities. For each candidate, samples whose probability
    is strictly greater than the candidate are labelled 1 and the F1 score
    against ``test_labels`` is computed. The best candidate and its F1 score
    are printed.

    Args:
        classifier: Fitted estimator exposing ``predict_proba``; column 1 of
            its output is used as the positive-class probability.
        test_data: Features passed to ``classifier.predict_proba``.
        test_labels: True binary labels aligned with ``test_data``.

    Returns:
        The selected cut-off (an element of the ``roc_curve`` thresholds).

    NOTE(review): the cut-off is tuned on whatever data is passed in. Metrics
    reported afterwards on that same data are optimistically biased; a
    separate validation split would avoid this.
    """
    # Query the model once; the probabilities feed both roc_curve and the sweep.
    positive_proba = classifier.predict_proba(test_data)[:, 1]

    _, _, thresholds = roc_curve(test_labels, positive_proba)

    # Strict ">" matches how the notebook applies the returned cut-off.
    f1_scores = [
        f1_score(test_labels, (positive_proba > threshold).astype(int))
        for threshold in thresholds
    ]

    best_index = int(np.argmax(f1_scores))
    optimal_threshold = thresholds[best_index]

    # round(float(...)) works for NumPy scalars and plain Python floats alike.
    print(f"The optimum threshold is {round(float(optimal_threshold), 3)} with an F1 score of {round(float(f1_scores[best_index]), 3)}")
    return optimal_threshold

In [12]:
# Function to aggregate the relevant metrics for individual model

def calculate_metrics(model_name, y_true, predictions, y_score=None):
    """Build, print and return a one-row metrics table for a classifier.

    Args:
        model_name: Label stored in the ``Model`` column.
        y_true: True binary labels.
        predictions: Hard 0/1 predictions, used for recall, precision and F1.
        y_score: Optional continuous scores (e.g. positive-class
            probabilities) used for the ROC-AUC column. When omitted, ROC-AUC
            is computed from ``predictions`` as before; on hard labels it only
            reflects a single operating point, not the ranking quality.

    Returns:
        pandas.DataFrame with one row and the columns ``Model``, ``Recall``,
        ``Precision``, ``F1 Score`` and ``ROC-AUC Score``.
    """
    # Fall back to the hard labels so existing three-argument calls are unchanged.
    auc_input = predictions if y_score is None else y_score

    classifier_metrics = pd.DataFrame({
        'Model': [model_name],
        'Recall': [recall_score(y_true, predictions)],
        'Precision': [precision_score(y_true, predictions)],
        'F1 Score': [f1_score(y_true, predictions)],
        'ROC-AUC Score': [roc_auc_score(y_true, auc_input)],
    })
    print(classifier_metrics)
    return classifier_metrics

In [13]:
# Baseline logistic regression: only random_state is set, everything else is left at its default.

clf = LogisticRegression(random_state=42).fit(X_train, y_train)

# Hard 0/1 predictions on the held-out rows.
y_pred = clf.predict(X_test)
clf_metrics = calculate_metrics('Logistic Regression', y_test, y_pred)

                 Model    Recall  Precision  F1 Score  ROC-AUC Score
0  Logistic Regression  0.713904   0.723577  0.718708       0.807581


In [14]:
# Determining the optimal threshold for the Vanilla Logistic Regression
# find_threshold prints the chosen cut-off with its F1 score and returns the cut-off.
# NOTE(review): the cut-off is selected on X_test / y_test, the same rows the metrics are reported on.

optimal_threshold = find_threshold(clf, X_test, y_test)

The optimum threshold is 0.448 with an F1 score of 0.726


In [15]:
# Re-label the test rows with the tuned cut-off: churn (1) when the
# positive-class probability is strictly above optimal_threshold.
y_proba = clf.predict_proba(X_test)
above_cutoff = y_proba[:, 1] > optimal_threshold
y_pred = above_cutoff.astype(int)

clf_thresh_metrics = calculate_metrics('Optimal Threshold Logistic Regression', y_test, y_pred)

                                   Model    Recall  Precision  F1 Score  \
0  Optimal Threshold Logistic Regression  0.751337     0.7025  0.726098   

   ROC-AUC Score  
0       0.818069  


In [16]:
# Stack the default-threshold and tuned-threshold rows for the baseline logistic regression.
pd.concat([clf_metrics, clf_thresh_metrics])

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Logistic Regression,0.713904,0.723577,0.718708,0.807581
0,Optimal Threshold Logistic Regression,0.751337,0.7025,0.726098,0.818069


In [17]:
# Hyperparameter tuning the Vanilla Logistic Regression
# 8-fold cross-validated grid search over C, scored by F1.
# NOTE(review): this rebinds `clf` to a new, unfitted estimator; re-running the
# earlier cells that call clf.predict_proba after this cell would fail.

grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

clf = LogisticRegression(max_iter=1000, random_state=42)
clf_grid = GridSearchCV(estimator=clf, param_grid=grid, scoring='f1', cv=8).fit(X_train, y_train)

best_model = clf_grid.best_estimator_
best_params = clf_grid.best_params_

print(best_params)

{'C': 100}


In [18]:
# Refit the logistic regression with the configuration chosen by the grid search.
# The hyperparameters are read from clf_grid.best_params_ instead of being
# hard-coded, and max_iter=1000 is carried over from the searched estimator.
# Without it the solver falls back to its default iteration cap, so the
# evaluated model may differ from the one cross-validation selected.
clf_100 = LogisticRegression(max_iter=1000, random_state=42, **clf_grid.best_params_)

clf_100.fit(X_train, y_train)
y_pred = clf_100.predict(X_test)

clf_100_metrics = calculate_metrics('Tuned Logistic Regression', y_test, y_pred)

                       Model    Recall  Precision  F1 Score  ROC-AUC Score
0  Tuned Logistic Regression  0.700535   0.715847  0.708108       0.799929


In [19]:
# Determining the optimal threshold for the tuned Logistic Regression
# Overwrites optimal_threshold from the baseline model with the cut-off for clf_100.
# NOTE(review): the cut-off is selected on X_test / y_test, the same rows the metrics are reported on.

optimal_threshold = find_threshold(clf_100, X_test, y_test)

The optimum threshold is 0.336 with an F1 score of 0.721


In [20]:
# Re-label the test rows for the tuned logistic regression using its own cut-off
# (strictly greater than optimal_threshold means churn).
y_proba = clf_100.predict_proba(X_test)
above_cutoff = y_proba[:, 1] > optimal_threshold
y_pred = above_cutoff.astype(int)

clf_100_thresh_metrics = calculate_metrics('Tuned Threshold Logistic Regression', y_test, y_pred)

                                 Model    Recall  Precision  F1 Score  \
0  Tuned Threshold Logistic Regression  0.828877    0.63786   0.72093   

   ROC-AUC Score  
0        0.82925  


In [21]:
# Summary table: one row per logistic-regression variant
# (baseline, baseline + tuned cut-off, tuned C, tuned C + tuned cut-off).

logistic_variants = [clf_metrics, clf_thresh_metrics, clf_100_metrics, clf_100_thresh_metrics]
logistic = pd.concat(logistic_variants)
logistic

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Logistic Regression,0.713904,0.723577,0.718708,0.807581
0,Optimal Threshold Logistic Regression,0.751337,0.7025,0.726098,0.818069
0,Tuned Logistic Regression,0.700535,0.715847,0.708108,0.799929
0,Tuned Threshold Logistic Regression,0.828877,0.63786,0.72093,0.82925


In [22]:
# Baseline SVM classifier. probability=True enables predict_proba, which the
# threshold search in the following cells needs.

clf_svc = SVC(probability=True, random_state=42).fit(X_train, y_train)

# Hard 0/1 predictions on the held-out rows.
y_pred = clf_svc.predict(X_test)
clf_svc_metrics = calculate_metrics('SVM Classifier', y_test, y_pred)

            Model    Recall  Precision  F1 Score  ROC-AUC Score
0  SVM Classifier  0.665775   0.736686  0.699438       0.789809


In [23]:
# Determining the optimal threshold for the Vanilla SVM Classifier
# Overwrites optimal_threshold with the cut-off for clf_svc.
# NOTE(review): the cut-off is selected on X_test / y_test, the same rows the metrics are reported on.

optimal_threshold = find_threshold(clf_svc, X_test, y_test)

The optimum threshold is 0.249 with an F1 score of 0.742


In [24]:
# Re-label the test rows for the baseline SVM using its tuned cut-off
# (strictly greater than optimal_threshold means churn).
y_proba = clf_svc.predict_proba(X_test)
above_cutoff = y_proba[:, 1] > optimal_threshold
y_pred = above_cutoff.astype(int)

clf_svc_thresh_metrics = calculate_metrics('Optimal Threshold SVM Classifier', y_test, y_pred)

                              Model    Recall  Precision  F1 Score  \
0  Optimal Threshold SVM Classifier  0.804813   0.688787  0.742293   

   ROC-AUC Score  
0       0.836579  


In [25]:
# Hyperparameter tuning the Vanilla SVM Classifier
# 8-fold cross-validated grid search over C and gamma, scored by F1.
# NOTE(review): this rebinds `clf_svc` to a new, unfitted estimator; re-running
# the earlier cells that call clf_svc.predict_proba after this cell would fail.

grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

clf_svc = SVC(probability=True, max_iter=-1, random_state=42)
clf_svc_grid = GridSearchCV(estimator=clf_svc, param_grid=grid, scoring='f1', cv=8).fit(X_train, y_train)

best_model = clf_svc_grid.best_estimator_
best_params = clf_svc_grid.best_params_

print(best_params)

{'C': 10, 'gamma': 0.01}


In [26]:
# Refit an SVM with the C / gamma values printed by the grid search above.
clf_svc_10_001 = SVC(C=10, gamma=0.01, probability=True, random_state=42).fit(X_train, y_train)

# Hard 0/1 predictions on the held-out rows.
y_pred = clf_svc_10_001.predict(X_test)
clf_svc_10_001_metrics = calculate_metrics('Tuned SVM Classifier', y_test, y_pred)

                  Model    Recall  Precision  F1 Score  ROC-AUC Score
0  Tuned SVM Classifier  0.679144   0.747059  0.711485       0.797946


In [27]:
# Determining the optimal threshold for the tuned SVM Classifier
# Overwrites optimal_threshold with the cut-off for clf_svc_10_001.
# NOTE(review): the cut-off is selected on X_test / y_test, the same rows the metrics are reported on.

optimal_threshold = find_threshold(clf_svc_10_001, X_test, y_test)

The optimum threshold is 0.315 with an F1 score of 0.744


In [28]:
# Re-label the test rows for the tuned SVM using its own cut-off
# (strictly greater than optimal_threshold means churn).
y_proba = clf_svc_10_001.predict_proba(X_test)
above_cutoff = y_proba[:, 1] > optimal_threshold
y_pred = above_cutoff.astype(int)

clf_svc_10_001_thresh_metrics = calculate_metrics('Tuned Threshold SVM Classifier', y_test, y_pred)

                            Model    Recall  Precision  F1 Score  \
0  Tuned Threshold SVM Classifier  0.802139   0.694444  0.744417   

   ROC-AUC Score  
0       0.837178  


In [29]:
# Summary table: one row per SVM variant
# (baseline, baseline + tuned cut-off, tuned C/gamma, tuned C/gamma + tuned cut-off).

svm_variants = [clf_svc_metrics, clf_svc_thresh_metrics, clf_svc_10_001_metrics, clf_svc_10_001_thresh_metrics]
supvecclass = pd.concat(svm_variants)
supvecclass

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,SVM Classifier,0.665775,0.736686,0.699438,0.789809
0,Optimal Threshold SVM Classifier,0.804813,0.688787,0.742293,0.836579
0,Tuned SVM Classifier,0.679144,0.747059,0.711485,0.797946
0,Tuned Threshold SVM Classifier,0.802139,0.694444,0.744417,0.837178


In [33]:
# Combined leaderboard of all logistic-regression and SVM variants, best F1 first.

logreg_svc = pd.concat((logistic, supvecclass))
logreg_svc.sort_values(by='F1 Score', ascending=False)

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Tuned Threshold SVM Classifier,0.802139,0.694444,0.744417,0.837178
0,Optimal Threshold SVM Classifier,0.804813,0.688787,0.742293,0.836579
0,Optimal Threshold Logistic Regression,0.751337,0.7025,0.726098,0.818069
0,Tuned Threshold Logistic Regression,0.828877,0.63786,0.72093,0.82925
0,Logistic Regression,0.713904,0.723577,0.718708,0.807581
0,Tuned SVM Classifier,0.679144,0.747059,0.711485,0.797946
0,Tuned Logistic Regression,0.700535,0.715847,0.708108,0.799929
0,SVM Classifier,0.665775,0.736686,0.699438,0.789809
