In [1]:
pip install imblearn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.over_sampling import RandomOverSampler

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'telecom_churn.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [None]:
#Defining input and target variables
x= churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
y= churn_data['Churn']

#Splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify=y)

NameError: name 'churn_data' is not defined

In [7]:
#Runnign over-sampling
x_over, y_over= RandomOverSampler().fit_resample(x_train, y_train)

Random Forest 

In [11]:
#Building RF model
rf_md= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(x_over, y_over)

#Prediction on the test
rf_pred= rf_md.predict_proba(x_test)[:,1]

#ROC_AUC
fpr, tpr, threshold= roc_curve(y_test, rf_pred)

#Finding optimal cutoff from ROC
rf_cutoffs= pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})

#Computing distance to perfect model
rf_cutoffs['Euclidean_dist'] = np.sqrt(rf_cutoffs['fpr']**2 + (1 - rf_cutoffs['tpr'])**2)

#Sorting based on distance
rf_cutoffs= rf_cutoffs.sort_values(by= 'Euclidean_dist').reset_index(drop= True)

#Changing likelyhoods to lables
rf_pred_label = np.where(rf_pred < rf_cutoffs['threshold'][0], 0, 1)

#Classification report
print(classification_report(y_test, rf_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.82      0.89       570
           1       0.45      0.87      0.60        97

    accuracy                           0.83       667
   macro avg       0.71      0.84      0.74       667
weighted avg       0.90      0.83      0.85       667



Ada Boost

In [16]:
#Building RF model
ada_md= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth=3), n_estimators=500, learning_rate=.01).fit(x_over, y_over)

#Prediction on the test
ada_pred= ada_md.predict_proba(x_test)[:,1]

#ROC_AUC
fpr, tpr, threshold= roc_curve(y_test, ada_pred)

#Finding optimal cutoff from ROC
ada_cutoffs= pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})

#Computing distance to perfect model
ada_cutoffs['Euclidean_dist'] = np.sqrt(ada_cutoffs['fpr']**2 + (1 - ada_cutoffs['tpr'])**2)

#Sorting based on distance
ada_cutoffs= ada_cutoffs.sort_values(by= 'Euclidean_dist').reset_index(drop= True)

#Changing likelyhoods to lables
ada_pred_label = np.where(ada_pred < ada_cutoffs['threshold'][0], 0, 1)

#Classification report
print(classification_report(y_test, ada_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91       570
           1       0.52      0.86      0.64        97

    accuracy                           0.86       667
   macro avg       0.74      0.86      0.78       667
weighted avg       0.91      0.86      0.88       667



In [None]:
#Based on the two models above and their reports, the model I would use to predict customer churn is the AdaBoost model