In [29]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

s3= boto3.resource('s3')
bucket_name= 'morgan-gant-data448-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'telecom_churn.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datefile
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [30]:
churn_data['Churn'].value_counts() / churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [31]:
#Defining input and target variables
x= churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
y= churn_data['Churn']

#Splitting the data
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= .2, stratify=y)

In [32]:
y_train.value_counts()/ y_train.shape[0]

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [33]:
y_test.value_counts()/ y_test.shape[0]

0    0.854573
1    0.145427
Name: Churn, dtype: float64

Random Forest

In [34]:
rf_md= RandomForestClassifier(n_estimators= 500, max_depth=3).fit(x_train, y_train)

#Prediction on the test
rf_pred= rf_md.predict_proba(x_test)[:,1]

# ROC_AUC
fpr, tpr, threshold= roc_curve(y_test, rf_pred)

In [40]:
cutoffs= pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.649300
1,0.000000,0.010309,0.649300
2,0.000000,0.206186,0.557513
3,0.001754,0.206186,0.553102
4,0.001754,0.298969,0.508787
...,...,...,...
138,0.982456,1.000000,0.052436
139,0.984211,1.000000,0.052429
140,0.991228,1.000000,0.052386
141,0.998246,1.000000,0.052293


In [41]:
cutoffs= cutoffs.drop(cutoffs.index[0], axis=0)
cutoffs

Unnamed: 0,fpr,tpr,threshold
1,0.000000,0.010309,0.649300
2,0.000000,0.206186,0.557513
3,0.001754,0.206186,0.553102
4,0.001754,0.298969,0.508787
5,0.003509,0.298969,0.505871
...,...,...,...
138,0.982456,1.000000,0.052436
139,0.984211,1.000000,0.052429
140,0.991228,1.000000,0.052386
141,0.998246,1.000000,0.052293


In [42]:
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['fpr']**2 + (1 - cutoffs['tpr'])**2)
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
1,0.000000,0.010309,0.649300,0.989691
2,0.000000,0.206186,0.557513,0.793814
3,0.001754,0.206186,0.553102,0.793816
4,0.001754,0.298969,0.508787,0.701033
5,0.003509,0.298969,0.505871,0.701040
...,...,...,...,...
138,0.982456,1.000000,0.052436,0.982456
139,0.984211,1.000000,0.052429,0.984211
140,0.991228,1.000000,0.052386,0.991228
141,0.998246,1.000000,0.052293,0.998246


In [43]:
cutoffs= cutoffs.sort_values(by= 'Euclidean_dist')
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
64,0.166667,0.793814,0.133646,0.265123
68,0.189474,0.814433,0.127062,0.265208
70,0.201754,0.824742,0.106565,0.267245
66,0.182456,0.804124,0.129968,0.267690
62,0.163158,0.783505,0.135129,0.271091
...,...,...,...,...
139,0.984211,1.000000,0.052429,0.984211
1,0.000000,0.010309,0.649300,0.989691
140,0.991228,1.000000,0.052386,0.991228
141,0.998246,1.000000,0.052293,0.998246


In [44]:
#Chaning the likelyhoods to labels
rf_labels= np.where(rf_pred < .138858, 0, 1)

print(classification_report(y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.96      0.84      0.89       570
           1       0.45      0.77      0.57        97

    accuracy                           0.83       667
   macro avg       0.70      0.81      0.73       667
weighted avg       0.88      0.83      0.85       667

