In [1]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


# Loading Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_curve, classification_report

# Reading the data

In [3]:
churn_data = pd.read_csv('telecom_churn.csv')
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [4]:
churn_data['Churn'].value_counts(normalize = True)

0    0.855086
1    0.144914
Name: Churn, dtype: float64

# Defining X and Y

In [5]:
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'DayMins', 'MonthlyCharge']]
Y = churn_data['Churn']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# Over-Sampling

In [6]:
Y_train.value_counts(normalize = True)

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [7]:
X_over, Y_over = RandomOverSampler().fit_resample(X_train, Y_train)

In [8]:
Y_over.value_counts(normalize = True)

1    0.5
0    0.5
Name: Churn, dtype: float64

# ExtraTrees

In [9]:
ET_md = ExtraTreesClassifier(n_estimators = 500, 
                             max_depth = 3).fit(X_over, Y_over)

ET_pred = ET_md.predict_proba(X_test)[:, 1]

## Computing the ROC-curve
fpr, tpr, thresholds = roc_curve(Y_test, ET_pred)

## Finding the optimal threshold
ET_cutoff = pd.DataFrame({'False_Positive': fpr,
                          'True_Positive': tpr,
                          'Cutoff': thresholds})

ET_cutoff['True_Positive_minus_1'] = ET_cutoff['True_Positive'] - 1
ET_cutoff['Distance_to_perfect_model'] = np.sqrt(ET_cutoff['False_Positive']**2 + ET_cutoff['True_Positive_minus_1']**2)

## Sorting based on distance to perfect model 
ET_cutoff = ET_cutoff.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)
ET_cutoff.head()

Unnamed: 0,False_Positive,True_Positive,Cutoff,True_Positive_minus_1,Distance_to_perfect_model
0,0.194737,0.845361,0.442571,-0.154639,0.248668
1,0.205263,0.85567,0.439081,-0.14433,0.250926
2,0.205263,0.845361,0.439378,-0.154639,0.256995
3,0.17193,0.804124,0.450035,-0.195876,0.260629
4,0.185965,0.814433,0.446378,-0.185567,0.262713


In [11]:
## Changing likelihoods to labels
ET_label = np.where(ET_pred < 0.442571, 0, 1)

print(classification_report(Y_test, ET_label))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       570
           1       0.42      0.85      0.57        97

    accuracy                           0.81       667
   macro avg       0.70      0.83      0.72       667
weighted avg       0.89      0.81      0.83       667



# HistGradient

In [12]:
HG_md = HistGradientBoostingClassifier(max_iter = 500, 
                                       max_depth = 3, 
                                       learning_rate = 0.01).fit(X_over, Y_over)

HG_pred = HG_md.predict_proba(X_test)[:, 1]

## Computing the ROC-curve
fpr, tpr, thresholds = roc_curve(Y_test, HG_pred)

## Finding the optimal threshold
HG_cutoff = pd.DataFrame({'False_Positive': fpr,
                          'True_Positive': tpr,
                          'Cutoff': thresholds})

HG_cutoff['True_Positive_minus_1'] = HG_cutoff['True_Positive'] - 1
HG_cutoff['Distance_to_perfect_model'] = np.sqrt(HG_cutoff['False_Positive']**2 + HG_cutoff['True_Positive_minus_1']**2)

## Sorting based on distance to perfect model 
HG_cutoff = HG_cutoff.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)
HG_cutoff.head()

Unnamed: 0,False_Positive,True_Positive,Cutoff,True_Positive_minus_1,Distance_to_perfect_model
0,0.14386,0.876289,0.239967,-0.123711,0.189737
1,0.136842,0.865979,0.296713,-0.134021,0.191539
2,0.103509,0.835052,0.470932,-0.164948,0.194736
3,0.14386,0.865979,0.240089,-0.134021,0.196614
4,0.135088,0.85567,0.332338,-0.14433,0.197686


In [13]:
## Changing likelihoods to labels
HG_label = np.where(HG_pred < 0.239967, 0, 1)

print(classification_report(Y_test, HG_label))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91       570
           1       0.51      0.87      0.64        97

    accuracy                           0.86       667
   macro avg       0.74      0.86      0.78       667
weighted avg       0.91      0.86      0.87       667



In [None]:
# Based on my results, I would use HistGradientBoosting to predict churn.