In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score

## Connecting to the S3 bucket that holds the data
bucket_name = 'data-445'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Keys of the two csv files (file 1 becomes telecom_train, file 2 becomes telecom_test)
file_key_1 = 'Demos/churn-bigml-80.csv'
file_key_2 = 'Demos/churn-bigml-20.csv'

## Fetching each object and keeping a handle on its body stream
bucket_object_1 = bucket.Object(file_key_1)
bucket_object_2 = bucket.Object(file_key_2)

file_object_1 = bucket_object_1.get()
file_object_2 = bucket_object_2.get()

file_content_stream_1 = file_object_1.get('Body')
file_content_stream_2 = file_object_2.get('Body')

## Reading the csv files
telecom_train = pd.read_csv(file_content_stream_1)
telecom_test = pd.read_csv(file_content_stream_2)

## Applying the same recoding and feature engineering to both data-frames
for frame in (telecom_train, telecom_test):

    ## Changing logical value to numbers (False -> 0, anything else -> 1)
    frame['Churn_numb'] = np.where(frame['Churn'] == False, 0, 1)

    ## Recoding the plan indicators ('No' -> 0, anything else -> 1)
    for plan in ('International_plan', 'Voice_mail_plan'):
        frame[plan] = np.where(frame[plan] == 'No', 0, 1)

    ## Adding up the day, evening, night and international charges
    frame['total_charge'] = frame['Total_day_charge'] + frame['Total_eve_charge'] + frame['Total_night_charge'] + frame['Total_intl_charge']

telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,...,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,Churn_numb,total_charge
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,...,244.7,91,11.01,10.0,3,2.7,1,False,0,75.56
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,...,254.4,103,11.45,13.7,3,3.7,1,False,0,59.24
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,...,162.6,104,7.32,12.2,5,3.29,0,False,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,...,196.9,89,8.86,6.6,7,1.78,2,False,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,...,186.9,121,8.41,10.1,3,2.73,3,False,0,52.09


In [3]:
## Selecting variables of interest (same columns, in the same order, for both data-frames)
selected_columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 'Churn_numb']

telecom_train = telecom_train[selected_columns]
telecom_test = telecom_test[selected_columns]

telecom_train.head()

Unnamed: 0,Account_length,International_plan,Voice_mail_plan,total_charge,Customer_service_calls,Churn_numb
0,128,0,1,75.56,1,0
1,107,0,1,59.24,1,0
2,137,0,0,62.29,0,0
3,84,1,0,66.8,2,0
4,75,1,0,52.09,3,0


In [None]:
## Defining the input and target variables
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
Y = telecom_train['Churn_numb']

## Number of random train/validation splits over which the importances are collected
n_rounds = 100

## Defining data-frames to store results (one row per split, one column per input variable)
RF_importances = pd.DataFrame(np.nan, index = range(n_rounds), columns = X.columns)
Ada_importances = pd.DataFrame(np.nan, index = range(n_rounds), columns = X.columns)
GB_importances = pd.DataFrame(np.nan, index = range(n_rounds), columns = X.columns)

for i in range(0, n_rounds):

    ## Splitting the data 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ###################
    ## Random Forest ##
    ###################
    
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)
    RF_importances.loc[i] = RF.feature_importances_
    
    ##############
    ## AdaBoost ##
    ##############
    
    ## The keyword for the weak learner was renamed from base_estimator to estimator
    ## in scikit-learn 1.2 and the old name was later removed, so the new name is
    ## tried first and the old one is used only when the installed release rejects it
    Ada_tree = DecisionTreeClassifier(max_depth = 3)
    try:
        Ada = AdaBoostClassifier(n_estimators = 500, estimator = Ada_tree, learning_rate = 0.01)
    except TypeError:
        Ada = AdaBoostClassifier(n_estimators = 500, base_estimator = Ada_tree, learning_rate = 0.01)
    Ada = Ada.fit(X_train, Y_train)
    Ada_importances.loc[i] = Ada.feature_importances_
    
    #######################
    ## Gradient Boosting ##
    #######################
    
    GB = GradientBoostingClassifier(n_estimators = 500, max_depth = 3, learning_rate = 0.01).fit(X_train, Y_train)
    GB_importances.loc[i] = GB.feature_importances_
    
## Combining the importances of the three models (rows stacked) and averaging per variable
Feature_Importances = pd.concat([RF_importances, Ada_importances, GB_importances], axis = 0)
Feature_Importances.mean()

In [None]:
## Stacking the three per-model importance tables row-wise and averaging each variable
importance_frames = [RF_importances, Ada_importances, GB_importances]
Feature_Importances = pd.concat(importance_frames, axis = 0)
Feature_Importances.mean()

In [18]:
## Re-defining the input and target variables (Voice_mail_plan is no longer among the inputs)
input_columns = ['Account_length', 'International_plan', 'total_charge', 'Customer_service_calls']
X = telecom_train[input_columns]
Y = telecom_train['Churn_numb']

def expand_grid(model):
    """Build the grid of hyper-parameter combinations for one model family.

    Parameters
    ----------
    model : str
        'RF', 'Ada' or 'GB'.

    Returns
    -------
    pandas.DataFrame
        One row per combination (cartesian product of the candidate values).
        Columns are n_tree and depth, plus learning_rate for 'Ada' and 'GB',
        followed by accuracy and recall columns filled with NaN.

    Raises
    ------
    ValueError
        If model is not one of the supported names (instead of silently
        returning None).
    """
    
    ## Candidate values shared by every model
    dictionary = {'n_tree': [100, 500, 1000, 1500, 2000],
                  'depth': [3, 5, 7]}
    
    if model in ('Ada', 'GB'):
        
        ## The boosting models are additionally tuned over the learning rate
        dictionary['learning_rate'] = [0.1, 0.01, 0.001]
        
    elif (model != 'RF'):
        
        raise ValueError("model must be one of 'RF', 'Ada' or 'GB'; got " + repr(model))
    
    ## One row per combination, in the order produced by itertools.product
    param = pd.DataFrame(list(product(*dictionary.values())), columns = list(dictionary.keys()))
    
    ## Placeholders for the scores
    param['accuracy'] = np.nan
    param['recall'] = np.nan
    
    return param
        

def _build_classifier(model, n_tree, depth, learning_rate = None):
    """Return an unfitted classifier of the requested family ('RF', 'Ada' or 'GB')."""
    
    if (model == 'RF'):
        
        return RandomForestClassifier(n_estimators = n_tree, max_depth = depth)
    
    if (model == 'GB'):
        
        return GradientBoostingClassifier(n_estimators = n_tree, max_depth = depth, learning_rate = learning_rate)
    
    ## AdaBoost: the keyword for the weak learner was renamed from base_estimator to
    ## estimator in scikit-learn 1.2 and the old name was later removed, so the new
    ## name is tried first and the old one only when the installed release rejects it
    tree = DecisionTreeClassifier(max_depth = depth)
    
    try:
        return AdaBoostClassifier(estimator = tree, n_estimators = n_tree, learning_rate = learning_rate)
    except TypeError:
        return AdaBoostClassifier(base_estimator = tree, n_estimators = n_tree, learning_rate = learning_rate)


def one_round(X_train, X_test, Y_train, Y_test, model, threshold = 0.1):
    """Fit one model per hyper-parameter combination and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Inputs and target used to fit every model.
    X_test, Y_test
        Inputs and target used to compute the scores.
    model : str
        'RF', 'Ada' or 'GB'; selects both the grid (via expand_grid) and the
        classifier family.
    threshold : float, default 0.1
        Cut-off applied to the second column of predict_proba: the predicted
        label is 0 when that probability is below the cut-off and 1 otherwise.
        (With a 0/1 target that column is presumably the probability of class 1.)

    Returns
    -------
    pandas.DataFrame
        The grid returned by expand_grid with its accuracy and recall columns
        filled in, one row per hyper-parameter combination.

    Raises
    ------
    ValueError
        If model is not one of the supported names (instead of silently
        returning None).
    """
    
    if model not in ('RF', 'Ada', 'GB'):
        
        raise ValueError("model must be one of 'RF', 'Ada' or 'GB'; got " + repr(model))
    
    ## Defining the grid of hyper-parameters
    param = expand_grid(model)
    
    ## Only the boosting grids carry a learning rate
    has_learning_rate = 'learning_rate' in param.columns
    
    for i in range(0, param.shape[0]):
        
        learning_rate = param['learning_rate'][i] if has_learning_rate else None
        
        ## Building the model
        fitted = _build_classifier(model, param['n_tree'][i], param['depth'][i], learning_rate).fit(X_train, Y_train)
        
        ## Predicting on test
        pred = fitted.predict_proba(X_test)[:, 1]
        pred = np.where(pred < threshold, 0, 1)
        
        ## Computing & storing accuracy and recall 
        param.loc[i, 'accuracy'] = accuracy_score(Y_test, pred)
        param.loc[i, 'recall'] = recall_score(Y_test, pred)
        
    return param
    
    
def multiple_rounds(X, Y, numb_rounds, model):
    """Repeat one_round over several random splits and average the scores.

    Parameters
    ----------
    X, Y
        Inputs and target; each round draws a new stratified 80/20 split of them.
    numb_rounds : int
        Number of random splits to evaluate; must be at least 1.
    model : str
        'RF', 'Ada' or 'GB'; passed on to one_round.

    Returns
    -------
    pandas.DataFrame
        Indexed by hyper-parameter combination (n_tree and depth, plus
        learning_rate for 'Ada' and 'GB'), holding the mean accuracy and mean
        recall over all rounds, sorted by accuracy and then recall, both in
        descending order.

    Raises
    ------
    ValueError
        If model is not a supported name or numb_rounds is smaller than 1.
    """
    
    ## Checking the arguments up front: without this, both cases only surface later
    ## as an unrelated error from pd.concat on an empty list
    if model not in ('RF', 'Ada', 'GB'):
        
        raise ValueError("model must be one of 'RF', 'Ada' or 'GB'; got " + repr(model))
    
    if (numb_rounds < 1):
        
        raise ValueError("numb_rounds must be at least 1; got " + repr(numb_rounds))
    
    ## Defining list to store results
    results = list()
    
    for i in range(0, numb_rounds):
        
        ## Splitting data 
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
        
        ## Storing results
        results.append(one_round(X_train, X_test, Y_train, Y_test, model))
    
    ## Putting all the results together 
    results = pd.concat(results)
    
    ## The random forest grid has no learning rate to group by
    group_keys = ['n_tree', 'depth'] if (model == 'RF') else ['n_tree', 'depth', 'learning_rate']
    
    results = results.groupby(group_keys).agg({'accuracy': 'mean', 'recall': 'mean'})
    results = results.sort_values(by = ['accuracy', 'recall'], ascending = False)
    
    return results