In [1]:
import boto3

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## Where the churn dataset lives in S3
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/telecom_churn.csv'

## Connecting to the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Requesting the object and pulling the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the csv content into a data-frame and previewing the first rows
churn_data = pd.read_csv(file_content_stream)
churn_data.head()



Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [3]:
## Relative frequency of each Churn label (counts divided by the number of rows)
churn_data['Churn'].value_counts().div(len(churn_data))

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [4]:
## Defining the input and target variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

## Splitting the data: 80% train / 20% test.
## stratify = Y keeps the churn rate (roughly) the same in both parts, and
## random_state pins the shuffle so that Restart & Run All reproduces the same
## split (and therefore the same ROC table and cutoff further down).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, stratify = Y, random_state = 42
)

In [6]:
## Share of each class in the training target (sanity check of the stratified split)
Y_train.value_counts().div(len(Y_train))

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [7]:
## Share of each class in the testing target (sanity check of the stratified split)
Y_test.value_counts().div(len(Y_test))

0    0.854573
1    0.145427
Name: Churn, dtype: float64

# Random Forest

In [8]:
## Fitting a random forest of 500 shallow trees (depth 3) on the training data.
## random_state fixes the bootstrap samples / feature sub-sampling so the fitted
## model, and every number derived from it, is reproducible across runs.
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train, Y_train)

## Predicting on test: keeping only the second probability column (label 1, i.e. churn)
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## ROC curve: false-positive rate and true-positive rate at each candidate threshold
## (this is the curve itself, not the area under it)
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

In [9]:
## Gathering every ROC operating point and its threshold in a single table
cutoffs = pd.DataFrame(dict(fpr = fpr, tpr = tpr, threshold = threshold))
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.655147
1,0.000000,0.010309,0.655147
2,0.000000,0.134021,0.581083
3,0.001754,0.134021,0.578422
4,0.001754,0.226804,0.543657
...,...,...,...
136,0.987719,0.989691,0.052276
137,0.994737,0.989691,0.052134
138,0.994737,1.000000,0.052088
139,0.998246,1.000000,0.052057


In [10]:
## Removing the artificial first ROC point (fpr = tpr = 0, threshold above every
## predicted score), which is not a usable cutoff.
## It is dropped by its index label (0) rather than by position so this cell is
## idempotent: re-running it, even after cutoffs has been sorted, never discards
## a genuine threshold (errors = 'ignore' makes the repeat run a no-op).
cutoffs = cutoffs.drop(index = 0, errors = 'ignore')
cutoffs

Unnamed: 0,fpr,tpr,threshold
1,0.000000,0.010309,0.655147
2,0.000000,0.134021,0.581083
3,0.001754,0.134021,0.578422
4,0.001754,0.226804,0.543657
5,0.003509,0.226804,0.539609
...,...,...,...
136,0.987719,0.989691,0.052276
137,0.994737,0.989691,0.052134
138,0.994737,1.000000,0.052088
139,0.998246,1.000000,0.052057


In [11]:
## Distance from each ROC point (fpr, tpr) to the perfect-classifier corner (0, 1)
fpr_gap = cutoffs['fpr']
tpr_gap = 1 - cutoffs['tpr']
cutoffs['Euclidean_dist'] = np.sqrt(fpr_gap.pow(2) + tpr_gap.pow(2))
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
1,0.000000,0.010309,0.655147,0.989691
2,0.000000,0.134021,0.581083,0.865979
3,0.001754,0.134021,0.578422,0.865981
4,0.001754,0.226804,0.543657,0.773198
5,0.003509,0.226804,0.539609,0.773204
...,...,...,...,...
136,0.987719,0.989691,0.052276,0.987773
137,0.994737,0.989691,0.052134,0.994790
138,0.994737,1.000000,0.052088,0.994737
139,0.998246,1.000000,0.052057,0.998246


In [12]:
## Ordering the candidate cutoffs so the one closest to the (0, 1) corner comes first
cutoffs = cutoffs.sort_values('Euclidean_dist', ascending = True)
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
72,0.187719,0.865979,0.138858,0.230651
68,0.173684,0.845361,0.156358,0.232550
66,0.154386,0.824742,0.181460,0.233560
70,0.184211,0.855670,0.143144,0.234018
71,0.187719,0.855670,0.140836,0.236790
...,...,...,...,...
1,0.000000,0.010309,0.655147,0.989691
138,0.994737,1.000000,0.052088,0.994737
137,0.994737,0.989691,0.052134,0.994790
139,0.998246,1.000000,0.052057,0.998246


In [13]:
## Picking the cutoff whose ROC point is closest to the (0, 1) corner.
## It is read from the table instead of being typed in by hand: a copied value
## goes stale whenever the split/model changes, and the displayed number is
## rounded, which can flip the observation sitting exactly on the boundary.
## idxmin works whether or not cutoffs has already been sorted.
opt_cutoff = cutoffs.loc[cutoffs['Euclidean_dist'].idxmin(), 'threshold']

## Changing likelihoods to labels (score >= cutoff -> 1, matching how roc_curve
## counts positives at a given threshold)
RF_labels = np.where(RF_pred < opt_cutoff, 0, 1)

## NOTE(review): the cutoff is chosen on the test set and evaluated on the same
## test set, so this report is optimistic - consider tuning it on a validation split.
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       570
           1       0.44      0.86      0.58        97

    accuracy                           0.82       667
   macro avg       0.70      0.83      0.73       667
weighted avg       0.89      0.82      0.84       667

