In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_curve, recall_score, precision_score, roc_auc_score

In [3]:
churn_data = pd.read_csv('data/Churn_data.csv')
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,-1.280248,0,1,-1.161694,-0.994194,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5575-GNVDE,1,0,0,0,0.064303,1,0,-0.260878,-0.17374,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,3668-QPYBK,1,0,0,0,-1.239504,1,1,-0.363923,-0.959649,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7795-CFOCW,1,0,0,0,0.512486,0,0,-0.74785,-0.195248,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,9237-HQITU,0,0,0,0,-1.239504,1,1,0.196178,-0.940457,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
X = churn_data.drop(['customerID', 'Churn'], axis = 1)
y = churn_data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=42)

In [6]:
# Function to find the optimal threshold value for a given classifier using test data and labels
def find_threshold(classifier, test_data, test_labels):
    y_proba_thresh = classifier.predict_proba(test_data)[::,1]
    fpr, tpr, thresholds = roc_curve(test_labels, y_proba_thresh)
    y_proba = classifier.predict_proba(test_data)
    y_pred = classifier.predict(test_data)
    f1_scores = []
    for threshold in thresholds:
        y_pred = (y_proba[:, 1] > threshold).astype(int)
        f1_scores.append(f1_score(test_labels, y_pred))
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    print(f"The optimum threshold is {optimal_threshold.round(3)} with an F1 score of {max(f1_scores).round(3)}")
    return optimal_threshold


# Function to calculate various classification metrics for a given model and its predictions
def calculate_metrics(model_name, y_true, predictions):
    rscore = recall_score(y_true, predictions)
    pscore = precision_score(y_true, predictions)
    fscore = f1_score(y_true, predictions)
    rascore = roc_auc_score(y_true, predictions)
    results = [(rscore, pscore, fscore, rascore)]
    classifier_metrics = pd.DataFrame(data = results, columns=['Recall','Precision','F1 Score', 'ROC-AUC Score'])
    classifier_metrics.insert(0, 'Model', model_name)
    # print(classifier_metrics)
    return classifier_metrics


#### 1. Logistic Regeression

In [8]:
clf = LogisticRegression(random_state = 42)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

clf_metrics = calculate_metrics('Logistic Regression', y_test, y_pred)
optimal_threshold = find_threshold(clf, X_test, y_test)

The optimum threshold is 0.446 with an F1 score of 0.724


In [9]:
y_proba = clf.predict_proba(X_test)
y_pred = (y_proba[:, 1] > optimal_threshold).astype(int)

clf_thresh_metrics = calculate_metrics('Optimal Threshold Logistic Regression', y_test, y_pred)
pd.concat([clf_metrics, clf_thresh_metrics])


Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Logistic Regression,0.71123,0.722826,0.716981,0.806244
0,Optimal Threshold Logistic Regression,0.751337,0.699005,0.724227,0.817101


In [10]:
grid = {'C': [0.001,0.01,0.1,1,10,100,1000]}

clf = LogisticRegression(max_iter = 1000, random_state = 42)

clf_grid = GridSearchCV(clf, grid, cv=8, scoring = 'f1')
clf_grid.fit(X_train, y_train)

best_params = clf_grid.best_params_
best_model = clf_grid.best_estimator_

print(best_params)

{'C': 100}


In [14]:
clf_100 = LogisticRegression(C = 100, random_state = 42)

clf_100.fit(X_train, y_train)
y_pred = clf_100.predict(X_test)

clf_100_metrics = calculate_metrics('Tuned Logistic Regression', y_test, y_pred)
optimal_threshold = find_threshold(clf_100, X_test, y_test)

y_proba = clf_100.predict_proba(X_test)
y_pred = (y_proba[:, 1] > optimal_threshold).astype(int)

clf_100_thresh_metrics = calculate_metrics('Tuned Threshold Logistic Regression', y_test, y_pred)

The optimum threshold is 0.336 with an F1 score of 0.721


In [15]:
logistic = pd.concat([clf_metrics, clf_thresh_metrics, clf_100_metrics, clf_100_thresh_metrics])
logistic

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Logistic Regression,0.71123,0.722826,0.716981,0.806244
0,Optimal Threshold Logistic Regression,0.751337,0.699005,0.724227,0.817101
0,Tuned Logistic Regression,0.700535,0.715847,0.708108,0.799929
0,Tuned Threshold Logistic Regression,0.828877,0.63786,0.72093,0.82925


#### 2. SVM

In [17]:
clf_svc = SVC(probability = True, random_state = 42)

clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)

clf_svc_metrics = calculate_metrics('SVM Classifier', y_test, y_pred)
optimal_threshold = find_threshold(clf_svc, X_test, y_test)

The optimum threshold is 0.272 with an F1 score of 0.74


In [18]:
y_proba = clf_svc.predict_proba(X_test)
y_pred = (y_proba[:, 1] > optimal_threshold).astype(int)

clf_svc_thresh_metrics = calculate_metrics('Optimal Threshold SVM Classifier', y_test, y_pred)

In [20]:
grid = {'C': [0.001,0.01,0.1,1,10,100,1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

clf_svc = SVC(probability = True, max_iter = -1, random_state = 42)

clf_svc_grid = GridSearchCV(clf_svc, grid, cv=8, scoring = 'f1', n_jobs= -1)
clf_svc_grid.fit(X_train, y_train)

best_params = clf_svc_grid.best_params_
best_model = clf_svc_grid.best_estimator_

print(best_params)

{'C': 1, 'gamma': 0.1}


In [22]:
clf_svc_10_001 = SVC(C = 10, gamma = 0.01, probability = True, random_state = 42)

clf_svc_10_001.fit(X_train, y_train)
y_pred = clf_svc_10_001.predict(X_test)

clf_svc_10_001_metrics = calculate_metrics('Tuned SVM Classifier', y_test, y_pred)
optimal_threshold = find_threshold(clf_svc_10_001, X_test, y_test)

The optimum threshold is 0.313 with an F1 score of 0.741


In [23]:
y_proba = clf_svc_10_001.predict_proba(X_test)
y_pred = (y_proba[:, 1] > optimal_threshold).astype(int)

clf_svc_10_001_thresh_metrics = calculate_metrics('Tuned Threshold SVM Classifier', y_test, y_pred)

supvecclass = pd.concat([clf_svc_metrics, clf_svc_thresh_metrics, clf_svc_10_001_metrics, clf_svc_10_001_thresh_metrics])
supvecclass

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,SVM Classifier,0.671123,0.733918,0.701117,0.791515
0,Optimal Threshold SVM Classifier,0.799465,0.68894,0.740099,0.834389
0,Tuned SVM Classifier,0.681818,0.732759,0.706371,0.795895
0,Tuned Threshold SVM Classifier,0.794118,0.693925,0.740648,0.833651


### All the models arranged on F1 Score

In [24]:
logreg_svc = pd.concat([logistic, supvecclass])
logreg_svc.sort_values('F1 Score', ascending = False)

Unnamed: 0,Model,Recall,Precision,F1 Score,ROC-AUC Score
0,Tuned Threshold SVM Classifier,0.794118,0.693925,0.740648,0.833651
0,Optimal Threshold SVM Classifier,0.799465,0.68894,0.740099,0.834389
0,Optimal Threshold Logistic Regression,0.751337,0.699005,0.724227,0.817101
0,Tuned Threshold Logistic Regression,0.828877,0.63786,0.72093,0.82925
0,Logistic Regression,0.71123,0.722826,0.716981,0.806244
0,Tuned Logistic Regression,0.700535,0.715847,0.708108,0.799929
0,Tuned SVM Classifier,0.681818,0.732759,0.706371,0.795895
0,SVM Classifier,0.671123,0.733918,0.701117,0.791515
