In [21]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, classification_report

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [22]:
## Report proportion of 0s and 1s
churn_data['Churn'].value_counts() / churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [23]:
pd.crosstab(churn_data['ContractRenewal'], churn_data['Churn'])

Churn,0,1
ContractRenewal,Unnamed: 1_level_1,Unnamed: 2_level_1
0,186,137
1,2664,346


In [24]:
## Defining input and target variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'DayMins', 'MonthlyCharge']]
Y = churn_data['Churn']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Standardizing the input data 
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [29]:
## Building the random forest model 
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

## Predicting on the test 
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## Computing the ROC
fpr, tpr, thresholds = roc_curve(Y_test, RF_pred)

## Finding the optimal cutoff
RF_cutoff = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': thresholds})
RF_cutoff['True_Positive_minus_1'] = RF_cutoff['True_Positive'] - 1
RF_cutoff['Distance_to_perfect_model'] = np.sqrt(RF_cutoff['False_Positive']**2 + RF_cutoff['True_Positive_minus_1']**2)

## Sorting
RF_cutoff = RF_cutoff.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)

## Extracting optimal cutoff
RF_cutoff_val = RF_cutoff['Cutoff'][0]

## Changing likelihoods to labels 
RF_pred_label = np.where(RF_pred < RF_cutoff_val, 0, 1)

## Classification report 
print(classification_report(Y_test, RF_pred_label))

              precision    recall  f1-score   support

           0       0.95      0.78      0.86       570
           1       0.37      0.77      0.50        97

    accuracy                           0.78       667
   macro avg       0.66      0.78      0.68       667
weighted avg       0.87      0.78      0.80       667



array([0.        , 0.01030928, 0.04123711, 0.06185567, 0.11340206,
       0.11340206, 0.12371134, 0.12371134, 0.1443299 , 0.1443299 ,
       0.15463918, 0.15463918, 0.17525773, 0.17525773, 0.18556701,
       0.18556701, 0.19587629, 0.19587629, 0.21649485, 0.21649485,
       0.22680412, 0.22680412, 0.24742268, 0.24742268, 0.26804124,
       0.26804124, 0.27835052, 0.27835052, 0.28865979, 0.28865979,
       0.30927835, 0.30927835, 0.43298969, 0.43298969, 0.45360825,
       0.45360825, 0.46391753, 0.46391753, 0.48453608, 0.48453608,
       0.49484536, 0.49484536, 0.50515464, 0.50515464, 0.51546392,
       0.51546392, 0.5257732 , 0.5257732 , 0.55670103, 0.55670103,
       0.56701031, 0.56701031, 0.57731959, 0.57731959, 0.58762887,
       0.58762887, 0.59793814, 0.59793814, 0.60824742, 0.60824742,
       0.6185567 , 0.6185567 , 0.63917526, 0.63917526, 0.64948454,
       0.64948454, 0.65979381, 0.65979381, 0.67010309, 0.67010309,
       0.68041237, 0.68041237, 0.69072165, 0.69072165, 0.70103