In [1]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     |████████████████████████████████| 226 kB 18.7 MB/s            
[?25h  Downloading imbalanced_learn-0.10.0-py3-none-any.whl (225 kB)
     |████████████████████████████████| 225 kB 49.2 MB/s            
[?25h  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 43.4 MB/s            
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 90.2 MB/s            
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
     |████████████████████████████████| 189 kB 71.5 MB/s            
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Matplotlib is building the font cache; this may take a moment.


Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [3]:
## Defining input and target variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'DayMins', 'MonthlyCharge']]
Y = churn_data['Churn']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [4]:
## Running over-sampling
X_over, Y_over = RandomOverSampler().fit_resample(X_train, Y_train)

# Random Forest

In [5]:
## Building the model 
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_over, Y_over)

## Predicting on the test dataset
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## ROC
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

## Finding the optimal cutoff from ROC
RF_cutoff = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'cutoff': threshold})

## Computing distane to perfect model 
RF_cutoff['Distance'] = np.sqrt(RF_cutoff['FPR']**2 + (1 - RF_cutoff['TPR'])**2)

## Sorting based on distance 
RF_cutoff = RF_cutoff.sort_values(by = 'Distance').reset_index(drop = True)

## Changing likelihoods to labels
RF_pred_label = np.where(RF_pred < RF_cutoff['cutoff'][0], 0, 1)

## Classification report
print(classification_report(Y_test, RF_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       570
           1       0.43      0.86      0.58        97

    accuracy                           0.82       667
   macro avg       0.70      0.83      0.73       667
weighted avg       0.89      0.82      0.84       667



# AdaBoost

In [6]:
## Building the model 
Ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_over, Y_over)

## Predicting on the test dataset
Ada_pred = Ada_md.predict_proba(X_test)[:, 1]

## ROC
fpr, tpr, threshold = roc_curve(Y_test, Ada_pred)

## Finding the optimal cutoff from ROC
Ada_cutoff = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'cutoff': threshold})

## Computing distane to perfect model 
Ada_cutoff['Distance'] = np.sqrt(Ada_cutoff['FPR']**2 + (1 - Ada_cutoff['TPR'])**2)

## Sorting based on distance 
Ada_cutoff = Ada_cutoff.sort_values(by = 'Distance').reset_index(drop = True)

## Changing likelihoods to labels
Ada_pred_label = np.where(Ada_pred < Ada_cutoff['cutoff'][0], 0, 1)

## Classification report
print(classification_report(Y_test, Ada_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       570
           1       0.48      0.86      0.61        97

    accuracy                           0.84       667
   macro avg       0.72      0.85      0.76       667
weighted avg       0.90      0.84      0.86       667



In [None]:
## Based on my results, I would use the adaboost model to predict churn.