In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import warnings
warnings.simplefilter(action= 'ignore', category='SettingWithCopyWarning')


from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score, accuracy_score

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,  GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from itertools import product



s3= boto3.resource('s3')
bucket_name= 'morgangant-bata-445-bucket'
bucket= s3.Bucket(bucket_name)

file_key1= 'churn-bigml-80.csv'
file_key2= 'churn-bigml-20.csv'


bucket_object1= bucket.Object(file_key1)
file_object1= bucket_object1.get()
file_content_stream1 = file_object1.get('Body')

bucket_object2= bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

#reading the datefile
telecom_train = pd.read_csv(file_content_stream1)
telecom_test= pd.read_csv(file_content_stream2)

In [2]:
telecom_train= pd.DataFrame(telecom_train)
telecom_test= pd.DataFrame(telecom_test)

In [3]:
#Creating churn_numb true/false to 1/0
telecom_train= telecom_train.assign(churn_numb= telecom_train['Churn'].astype(int))

#Changing International_plan yes/no to 1/0
telecom_train['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Changing Voice_mail_plan yes/no to 1/0
telecom_train['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Creating variable Total_charge
telecom_train= telecom_train.assign(total_charge= telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge']+ telecom_train['Total_intl_charge'])
telecom_test= telecom_test.assign(total_charge= telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge']+ telecom_test['Total_intl_charge'])

In [4]:
#Setting x and y variables
x = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
y = telecom_train['churn_numb']

In [None]:
rf_results= list()
ada_results= list()
gb_results= list()

for i in range(0,1000):
    #Splitting the Data
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, stratify= y)
    
    #Buidling the model
    rf_md= RandomForestClassifier(n_estimators= 500, max_depth= 3).fit(x_train, y_train)
    #Extracting the feature importances
    rf_results.append(rf_md.feature_importances_)
    
    #Building the model
    ada_md= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= 3), n_estimators= 500, learning_rate=.01).fit(x_train, y_train)
    #Extracting the feature importances
    ada_results.append(ada_md.feature_importances_)
    
    
    #Building the model
    gb_md= GradientBoostingClassifier(max_depth= 3, n_estimators= 500, learning_rate=.01).fit(x_train, y_train)
    #Extracting the feature importances
    gb_results.append(gb_md.feature_importances_)
    

In [26]:
a = pd.DataFrame(rf_results)
a.columns= ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
a.apply(np.mean, axis = 0)

NameError: name 'rf_results' is not defined

In [None]:
b = pd.DataFrame(ada_results)
b.columns= ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
b.apply(np.mean, axis = 0)

In [None]:
c = pd.DataFrame(gb_results)
c.columns= ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
c.apply(np.mean, axis = 0)

In [None]:
#Top 4 average importances: international_plan, voice_mail_plan, total_charge, customer_service_calls
    #I dropped Account_length since it was insignificant in 2/3 models

In [5]:
x= x.drop(columns= 'Account_length')

In [6]:
def expand_grid(dictionary1):
    return pd.DataFrame([row for row in product(*dictionary1.values())], 
                        columns = dictionary1.keys())
dictionary1 = {'n_tree': [100, 500, 1000, 1500, 2000], 
                'depth': [3, 5, 7]}
parameters1= expand_grid(dictionary1)
parameters1= expand_grid(dictionary1)
parameters1['recall']= np.nan
parameters1['accuracy']= np.nan

In [36]:
parameters1

Unnamed: 0,n_tree,depth,recall,accuracy
0,100,3,,
1,100,5,,
2,100,7,,
3,500,3,,
4,500,5,,
5,500,7,,
6,1000,3,,
7,1000,5,,
8,1000,7,,
9,1500,3,,


In [7]:
def expand_grid(dictionary2):
    return pd.DataFrame([row for row in product(*dictionary2.values())], 
                        columns = dictionary2.keys())
dictionary2 = {'n_tree': [100, 500, 1000, 1500, 2000], 
                'depth': [3, 5, 7], 
                'learning_rate': [0.1, 0.01, 0.001]}
parameters2 = expand_grid(dictionary2)
parameters2['recall']= np.nan
parameters2['accuracy']= np.nan

In [90]:
parameters2

Unnamed: 0,n_tree,depth,learning_rate,recall,accuracy
0,100,3,0.1,,
1,100,3,0.01,,
2,100,3,0.001,,
3,100,5,0.1,,
4,100,5,0.01,,
5,100,5,0.001,,
6,100,7,0.1,,
7,100,7,0.01,,
8,100,7,0.001,,
9,500,3,0.1,,


In [8]:
def expand_grid(dictionary3):
    return pd.DataFrame([row for row in product(*dictionary3.values())],
                        columns = dictionary3.keys())
dictionary3 = {'n_tree': [100, 500, 1000, 1500, 2000],
                'depth': [3, 5, 7], 
                'learning_rate': [0.1, 0.01, 0.001]}
parameters3 = expand_grid(dictionary3)
parameters3['recall']= np.nan
parameters3['accuracy']= np.nan

In [21]:
parameters3

Unnamed: 0,n_tree,depth,learning_rate,recall,accuracy
0,100,3,0.1,,
1,100,3,0.01,,
2,100,3,0.001,,
3,100,5,0.1,,
4,100,5,0.01,,
5,100,5,0.001,,
6,100,7,0.1,,
7,100,7,0.01,,
8,100,7,0.001,,
9,500,3,0.1,,


In [None]:
rf_recall= pd.DataFrame()
rf_accuracy= pd.DataFrame()

ada_recall= pd.DataFrame()
ada_accuracy= pd.DataFrame()

gb_recall= pd.DataFrame()
gb_accuracy= pd.DataFrame()
#Running loops to find best model

for i in range (0,100):
    print([i])
    #Splitting the Data
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, stratify= y)
    

    ##### Random Forest
    for k in range(len(parameters1)):
        print([k])
        #Buidling the model
        rf_md1= RandomForestClassifier(n_estimators= parameters1['n_tree'][k], max_depth=parameters1['depth'][k]).fit(x_train, y_train)
        #Predicting on the model
        rf_pred1= rf_md1.predict_proba(x_test)[:,1]
        #changing likelihoods to labels 
        rf_labels1= np.where(rf_pred1 < .1, 0, 1)
        #parameters1.loc[k,'recall']= recall_score(y_test, rf_labels1)
        #parameters1.loc[k,'accuracy']= accuracy_score(y_test, rf_labels1)
        rf_recall.loc[i,k]= recall_score(y_test, rf_labels1)
        rf_accuracy.loc[i,k]= accuracy_score(y_test, rf_labels1)

        
    #### Ada Boost    
    for m in range(len(parameters2)):
        print([m])
        #Building the model
        ada_md1= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= parameters2['n_tree'][m]), n_estimators= parameters2['depth'][m], learning_rate= parameters2['learning_rate'][m]).fit(x_train, y_train)
        #Predicting on the model
        ada_pred1= ada_md1.predict_proba(x_test)[:,1]
        #changing likelihoods to labels 
        ada_labels1= np.where(ada_pred1 < .1, 0, 1)
        #parameters2.loc[m, 'recall']= recall_score(y_test, ada_labels1)
        #parameters2.loc[m, 'accuracy']= accuracy_score(y_test, ada_labels1)
        ada_recall.loc[i,m]= recall_score(y_test, ada_labels1)
        ada_accuracy.loc[i,m]= accuracy_score(y_test, ada_labels1)

    #### Gradient Boost
    for n in range(len(parameters3)):
        print([n])
        #Building the model
        gb_md1= GradientBoostingClassifier(max_depth= parameters3['depth'][n], n_estimators= parameters3['n_tree'][n], learning_rate= parameters3['learning_rate'][n]).fit(x_train, y_train)
        #Predicting on the model
        gb_pred1= gb_md1.predict_proba(x_test)[:,1]
        #changing likelihoods to labels 
        gb_labels1= np.where(gb_pred1 < .1, 0, 1)
        #parameters3.loc[n, 'recall']= recall_score(y_test, gb_labels)
        #parameters3.loc[n, 'accuracy'] = accuracy_score(y_test, gb_labels)
        gb_recall.loc[i,n]= recall_score(y_test, gb_labels1)
        gb_accuracy.loc[i,n]= accuracy_score(y_test, gb_labels1)

[0]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[1]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[2

In [14]:
rf_recall.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.865897,0.866026,0.863718,0.866154,0.866795,0.864872,0.86641,0.866667,0.864359,0.866282,0.866923,0.864615,0.866154,0.867051,0.864359
std,0.034881,0.034974,0.035365,0.034877,0.034761,0.035024,0.035154,0.034814,0.03539,0.035158,0.03466,0.034829,0.035208,0.034558,0.034679
min,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641,0.75641
25%,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154
50%,0.871795,0.871795,0.858974,0.871795,0.871795,0.865385,0.871795,0.871795,0.858974,0.871795,0.871795,0.871795,0.871795,0.871795,0.858974
75%,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615,0.884615
max,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718,0.948718


In [16]:
rf_accuracy.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.872416,0.906873,0.911479,0.872472,0.907397,0.91294,0.872528,0.90779,0.912921,0.872472,0.907191,0.912846,0.872509,0.907772,0.91294
std,0.014036,0.013255,0.010397,0.013995,0.013352,0.010341,0.014055,0.013231,0.010807,0.014038,0.013759,0.010591,0.014013,0.013328,0.010738
min,0.837079,0.872659,0.889513,0.837079,0.882022,0.889513,0.837079,0.88015,0.891386,0.837079,0.876404,0.891386,0.837079,0.878277,0.891386
25%,0.865169,0.898876,0.904494,0.865169,0.898876,0.906367,0.865169,0.900749,0.904494,0.865169,0.898876,0.904494,0.865169,0.900749,0.904494
50%,0.872659,0.906367,0.910112,0.872659,0.90824,0.913858,0.872659,0.90824,0.913858,0.872659,0.90824,0.913858,0.872659,0.906367,0.913858
75%,0.882022,0.91573,0.918071,0.882022,0.91573,0.919944,0.882022,0.918071,0.921348,0.882022,0.916199,0.919476,0.882022,0.917603,0.919476
max,0.910112,0.940075,0.940075,0.910112,0.940075,0.93633,0.910112,0.941948,0.941948,0.910112,0.94382,0.938202,0.910112,0.941948,0.940075


In [17]:
ada_recall.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.711667,0.713846,0.712692,0.710769,0.712436,0.712821,0.709744,0.712692,0.712308,0.711923,0.713718,0.713205,0.710256,0.713974,0.713333,0.709487,0.712308,0.713462,0.71141,0.713205,0.712692,0.711154,0.713718,0.712692,0.709615,0.713718,0.712436,0.712051,0.712821,0.712949,0.711026,0.713846,0.711795,0.710641,0.712436,0.713205,0.711795,0.713462,0.713077,0.711538,0.713333,0.712821,0.708974,0.712564,0.712692
std,0.047395,0.046987,0.046782,0.046931,0.047176,0.047378,0.043905,0.049034,0.047808,0.04802,0.047028,0.047854,0.047762,0.048132,0.047536,0.043969,0.047773,0.047879,0.048022,0.047611,0.047591,0.046652,0.047485,0.047904,0.043899,0.048145,0.048082,0.047393,0.047167,0.047549,0.047358,0.047233,0.048159,0.044682,0.048768,0.047576,0.04757,0.047039,0.046806,0.046902,0.048092,0.048039,0.045428,0.048252,0.048111
min,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.576923,0.589744,0.576923,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.576923,0.589744,0.589744,0.589744,0.576923,0.589744,0.589744,0.589744,0.576923,0.589744,0.576923,0.589744,0.589744,0.589744,0.576923,0.589744,0.576923,0.589744,0.589744,0.589744,0.589744,0.589744,0.589744,0.576923,0.589744,0.576923,0.589744
25%,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.692308,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.692308,0.679487,0.679487,0.679487,0.679487,0.679487,0.692308,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487,0.679487
50%,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.705128,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.705128,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.705128,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949,0.717949
75%,0.730769,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.730769,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359,0.74359
max,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692,0.807692


In [18]:
ada_accuracy.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.912772,0.91161,0.91161,0.913614,0.911667,0.911536,0.913652,0.912191,0.911255,0.912472,0.911779,0.911536,0.913689,0.91191,0.911948,0.913596,0.912322,0.91191,0.91264,0.911554,0.911423,0.913764,0.911929,0.911742,0.913521,0.912547,0.911704,0.912772,0.911461,0.911498,0.913989,0.912135,0.911404,0.91367,0.912191,0.911423,0.91279,0.911667,0.911423,0.913933,0.911948,0.911536,0.913315,0.912397,0.911592
std,0.007655,0.007599,0.008016,0.008566,0.007918,0.00786,0.008706,0.008082,0.008112,0.007795,0.007626,0.0082,0.008572,0.008025,0.00776,0.008844,0.007785,0.008234,0.007902,0.007495,0.008124,0.008456,0.008,0.007935,0.008486,0.007876,0.00796,0.007946,0.007604,0.007777,0.008384,0.007899,0.00805,0.008206,0.007967,0.008093,0.008105,0.007519,0.008054,0.008559,0.007846,0.008003,0.008777,0.007735,0.007946
min,0.895131,0.88764,0.88764,0.88764,0.891386,0.88764,0.893258,0.891386,0.889513,0.895131,0.891386,0.891386,0.88764,0.891386,0.891386,0.895131,0.895131,0.88764,0.893258,0.891386,0.891386,0.88764,0.891386,0.891386,0.893258,0.895131,0.891386,0.893258,0.891386,0.891386,0.889513,0.891386,0.88764,0.897004,0.891386,0.891386,0.891386,0.891386,0.891386,0.88764,0.891386,0.88764,0.893258,0.891386,0.891386
25%,0.90824,0.907772,0.906367,0.90824,0.906367,0.906367,0.906367,0.907772,0.906367,0.906367,0.906367,0.906367,0.90824,0.906367,0.906367,0.906367,0.906367,0.906367,0.90824,0.906367,0.906367,0.90824,0.906367,0.906367,0.90824,0.90824,0.906367,0.90824,0.906367,0.906367,0.90824,0.906367,0.906367,0.906367,0.906367,0.905899,0.90824,0.906367,0.906367,0.90824,0.906367,0.906367,0.906367,0.907772,0.906367
50%,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.912921,0.913858,0.911985,0.912921,0.913858,0.911049,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.912921,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.913858,0.911985,0.911985,0.91573,0.911985,0.911985,0.913858,0.911985,0.911985
75%,0.917603,0.917603,0.917603,0.919476,0.917603,0.91573,0.919476,0.917603,0.916199,0.917603,0.917603,0.917603,0.919476,0.918071,0.917603,0.919476,0.917603,0.917603,0.917603,0.917603,0.917603,0.919476,0.917603,0.917603,0.919476,0.918071,0.917603,0.918071,0.916199,0.917603,0.919476,0.917603,0.917603,0.919476,0.918071,0.91573,0.917603,0.917603,0.917603,0.919476,0.917603,0.91573,0.919476,0.917603,0.917603
max,0.932584,0.928839,0.930712,0.930712,0.926966,0.930712,0.934457,0.930712,0.928839,0.932584,0.928839,0.930712,0.930712,0.928839,0.930712,0.934457,0.930712,0.930712,0.934457,0.928839,0.930712,0.930712,0.930712,0.930712,0.934457,0.932584,0.928839,0.934457,0.928839,0.926966,0.932584,0.928839,0.932584,0.934457,0.930712,0.930712,0.932584,0.926966,0.928839,0.930712,0.928839,0.930712,0.93633,0.930712,0.928839


In [20]:
gb_recall.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.852692,0.850513,1.0,0.82,0.849872,1.0,0.766154,0.841923,1.0,0.797308,0.855385,0.856154,0.724744,0.841795,0.868846,0.713974,0.823077,0.873333,0.757564,0.853718,0.850513,0.713077,0.823077,0.849872,0.707821,0.772564,0.841795,0.740385,0.851538,0.850513,0.709103,0.794359,0.849359,0.704872,0.744487,0.840897,0.729487,0.847308,0.850513,0.708333,0.770513,0.848974,0.703846,0.731667,0.839487
std,0.037233,0.037884,0.0,0.040743,0.038839,0.0,0.045268,0.038743,0.0,0.044175,0.03708,0.03768,0.047847,0.038146,0.045074,0.046483,0.040583,0.047071,0.047171,0.036585,0.037884,0.046912,0.042696,0.038839,0.04686,0.045359,0.038879,0.047775,0.037264,0.037884,0.047402,0.04526,0.038196,0.046457,0.046574,0.038184,0.048538,0.039214,0.037884,0.047705,0.048091,0.038508,0.047255,0.046929,0.037852
min,0.75641,0.730769,1.0,0.730769,0.730769,1.0,0.615385,0.730769,1.0,0.679487,0.74359,0.74359,0.589744,0.730769,0.730769,0.589744,0.692308,0.717949,0.641026,0.74359,0.730769,0.589744,0.705128,0.730769,0.589744,0.628205,0.730769,0.602564,0.75641,0.730769,0.576923,0.666667,0.730769,0.576923,0.615385,0.730769,0.602564,0.74359,0.730769,0.576923,0.615385,0.730769,0.589744,0.602564,0.730769
25%,0.833333,0.833333,1.0,0.794872,0.820513,1.0,0.74359,0.820513,1.0,0.769231,0.833333,0.833333,0.705128,0.820513,0.833333,0.692308,0.807692,0.846154,0.730769,0.833333,0.833333,0.679487,0.794872,0.820513,0.692308,0.74359,0.820513,0.717949,0.833333,0.833333,0.679487,0.766026,0.820513,0.679487,0.717949,0.820513,0.705128,0.820513,0.833333,0.679487,0.740385,0.820513,0.679487,0.705128,0.820513
50%,0.846154,0.846154,1.0,0.820513,0.846154,1.0,0.769231,0.846154,1.0,0.794872,0.858974,0.858974,0.730769,0.846154,0.871795,0.717949,0.833333,0.884615,0.769231,0.858974,0.846154,0.717949,0.820513,0.846154,0.717949,0.769231,0.846154,0.74359,0.846154,0.846154,0.717949,0.794872,0.846154,0.705128,0.74359,0.846154,0.730769,0.846154,0.846154,0.717949,0.769231,0.846154,0.705128,0.730769,0.846154
75%,0.875,0.871795,1.0,0.846154,0.871795,1.0,0.794872,0.871795,1.0,0.820513,0.875,0.884615,0.75641,0.871795,0.897436,0.74359,0.846154,0.910256,0.782051,0.875,0.871795,0.74359,0.846154,0.871795,0.74359,0.794872,0.871795,0.769231,0.871795,0.871795,0.730769,0.820513,0.871795,0.730769,0.769231,0.858974,0.75641,0.871795,0.871795,0.74359,0.794872,0.871795,0.730769,0.769231,0.858974
max,0.948718,0.948718,1.0,0.897436,0.935897,1.0,0.871795,0.935897,1.0,0.910256,0.948718,0.948718,0.846154,0.923077,0.961538,0.820513,0.910256,0.961538,0.858974,0.948718,0.948718,0.846154,0.910256,0.935897,0.820513,0.871795,0.935897,0.858974,0.948718,0.948718,0.846154,0.897436,0.935897,0.820513,0.846154,0.935897,0.846154,0.935897,0.948718,0.807692,0.884615,0.935897,0.794872,0.846154,0.935897


In [19]:
gb_accuracy.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.910674,0.923708,0.1460674,0.897341,0.915243,0.1460674,0.900655,0.907247,0.1460674,0.897303,0.920262,0.887715,0.906779,0.909419,0.810861,0.909794,0.900562,0.750243,0.902678,0.91309,0.923708,0.909438,0.89824,0.915262,0.911742,0.899888,0.907378,0.906255,0.906367,0.923708,0.910356,0.897041,0.917809,0.911573,0.904082,0.909888,0.906816,0.902322,0.923633,0.910861,0.899569,0.917772,0.911573,0.90633,0.909625
std,0.011079,0.010304,1.952678e-16,0.009479,0.012432,1.952678e-16,0.009457,0.010685,1.952678e-16,0.009969,0.010101,0.01955,0.008211,0.009815,0.113837,0.007443,0.009632,0.075389,0.009716,0.010848,0.010304,0.007531,0.009777,0.01244,0.007611,0.010179,0.010678,0.009308,0.011068,0.010304,0.00768,0.0097,0.009357,0.008019,0.008672,0.009712,0.009015,0.011235,0.010321,0.007486,0.009596,0.009366,0.007597,0.008467,0.010039
min,0.876404,0.900749,0.1460674,0.874532,0.868914,0.1460674,0.874532,0.872659,0.1460674,0.872659,0.898876,0.835206,0.882022,0.883895,0.541199,0.889513,0.876404,0.625468,0.878277,0.876404,0.900749,0.88764,0.874532,0.868914,0.889513,0.876404,0.872659,0.88015,0.861423,0.900749,0.891386,0.874532,0.898876,0.885768,0.883895,0.88764,0.88015,0.859551,0.900749,0.891386,0.874532,0.898876,0.885768,0.883895,0.883895
25%,0.902622,0.917135,0.1460674,0.891386,0.910112,0.1460674,0.893258,0.900749,0.1460674,0.891386,0.91339,0.874532,0.902622,0.902622,0.699906,0.904494,0.893258,0.686798,0.896536,0.906367,0.917135,0.904494,0.891386,0.910112,0.907772,0.893258,0.900749,0.900749,0.898876,0.917135,0.904494,0.890918,0.911985,0.906367,0.898876,0.902622,0.900749,0.895131,0.917135,0.906367,0.893258,0.910112,0.906367,0.900749,0.900749
50%,0.910112,0.923221,0.1460674,0.897004,0.91573,0.1460674,0.900749,0.906367,0.1460674,0.897004,0.919476,0.88764,0.906367,0.90824,0.876404,0.909176,0.902622,0.734082,0.902622,0.911985,0.923221,0.910112,0.898876,0.91573,0.910112,0.898876,0.906367,0.906367,0.906367,0.923221,0.910112,0.897004,0.917603,0.910112,0.904494,0.909176,0.907303,0.902622,0.923221,0.910112,0.898876,0.917603,0.910112,0.906367,0.910112
75%,0.917603,0.930712,0.1460674,0.904494,0.923221,0.1460674,0.906835,0.91573,0.1460674,0.904494,0.926966,0.900749,0.911985,0.91573,0.899345,0.913858,0.906367,0.816011,0.910112,0.919476,0.930712,0.913858,0.904494,0.923221,0.917603,0.90824,0.91573,0.911985,0.913858,0.930712,0.91573,0.904494,0.923221,0.917603,0.910112,0.917603,0.911985,0.90824,0.930712,0.91573,0.906367,0.923221,0.917603,0.911985,0.917603
max,0.94382,0.953184,0.1460674,0.917603,0.945693,0.1460674,0.919476,0.926966,0.1460674,0.917603,0.947566,0.928839,0.930712,0.94382,0.930712,0.926966,0.921348,0.898876,0.925094,0.945693,0.953184,0.928839,0.928839,0.945693,0.928839,0.921348,0.934457,0.926966,0.938202,0.953184,0.928839,0.919476,0.945693,0.928839,0.923221,0.932584,0.928839,0.938202,0.953184,0.928839,0.919476,0.94382,0.928839,0.928839,0.93633


In [None]:
#When looking at recall and accuracy rates to determine which model is the best, you need to consider the business. What are we doing when predicting on churn?
    #We are predicting wheather or not we think a customer will cancel their subscription or not. Knowing this, customer intrest, numbers, charges, etc. will
    #depend on the customers actions. It's going to be more benificial to the business to falsley predict they will cancel their subscription, than predit they 
    #won't and they do. And corecctly classifying them and not cancling and having them not, is very good.
    #Therefore, when I am considering the best models, I am going to continue with the model with the highest recall becasue of this business, it is more benficial
        #to consider than accuracy is!
#Note: Mnay of the models were very similar in recalls!

In [None]:
#Best Random Forest model: 
    #Model 13 --> 2000 trees and a depth of 5
    #recall --> 0.867051
#Best Ada Boost model: 
    #Model 13 --> 500 trees, a depth of 5, and learning rate .01
    #recall --> 0.713974
#Best Gradient Boost model:
    #Model 14 --> 500 trees, a depth of 5, and learning rate .001
    #recall --> 0.868846

In [30]:
#redefining test data set
#Setting x and y variables
x = telecom_test[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
y = telecom_test['Churn']

#Splitting the Data
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, stratify= y)

#Buidling the models from the best chosen above!
### Random Forest
rf_md1= RandomForestClassifier(n_estimators= 2000, max_depth= 5).fit(x_train, y_train)
#Predicting on the model
rf_pred1= rf_md1.predict_proba(x_test)[:,1]
#changing likelihoods to labels 
#rf_labels1= np.where(rf_pred1 < .1, 0, 1)
#rf_recall_average= recall_score(y_test, rf_labels1)
#rf_accuracy= accuracy_score(y_test, rf_labels1)
    
        
#### Ada Boost    
#Building the model
ada_md1= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= 5), n_estimators= 500, learning_rate= .01).fit(x_train, y_train)
#Predicting on the model
ada_pred1= ada_md1.predict_proba(x_test)[:,1]
#changing likelihoods to labels 
#ada_labels1= np.where(ada_pred1 < .1, 0, 1)
#ada_recall= recall_score(y_test, ada_labels1)
#ada_accuracy= accuracy_score(y_test, ada_labels1)

        
    
#### Gradient Boost
#Building the model
gb_md1= GradientBoostingClassifier(max_depth= 5, n_estimators= 500, learning_rate= .001).fit(x_train, y_train)
#Predicting on the model
gb_pred1= gb_md1.predict_proba(x_test)[:,1]
#changing likelihoods to labels 
#gb_labels1= np.where(gb_pred1 < .1, 0, 1)
#gb_recall= recall_score(y_test, gb_labels1)
#gb_accuracy= accuracy_score(y_test, gb_labels1)


rf_recall_old=  0.867051
ada_recall_old= 0.713974
gb_recall_old= 0.868846
total_recall=  rf_recall_old +  ada_recall_old + gb_recall_old

lables= (rf_recall_old/total_recall)*(rf_pred1) + (ada_recall_old/total_recall)*(ada_pred1) + (gb_recall_old/total_recall)*(gb_pred1)
lables= np.where(lables< .1, 0, 1)
recall= recall_score(y_test, lables)
accuracy= accuracy_score(y_test, lables)

In [31]:
recall

0.8947368421052632

In [32]:
accuracy

0.9477611940298507