In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import warnings
warnings.simplefilter(action= 'ignore', category=FutureWarning)


from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,  GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from itertools import product



# Handle to the S3 bucket that stores the two churn CSV files
s3 = boto3.resource('s3')
bucket_name = 'morgangant-bata-445-bucket'
bucket = s3.Bucket(bucket_name)

# Object keys: the first is read as the training frame, the second as the test frame
file_key1, file_key2 = 'churn-bigml-80.csv', 'churn-bigml-20.csv'


def _fetch_csv_object(key):
    """Return (bucket object, get() response, value under the response's 'Body' key) for ``key``."""
    bucket_object = bucket.Object(key)
    response = bucket_object.get()
    return bucket_object, response, response.get('Body')


bucket_object1, file_object1, file_content_stream1 = _fetch_csv_object(file_key1)
bucket_object2, file_object2, file_content_stream2 = _fetch_csv_object(file_key2)

# Reading the data files
telecom_train = pd.read_csv(file_content_stream1)
telecom_test = pd.read_csv(file_content_stream2)

In [2]:
# Re-wrap both read_csv results as DataFrame objects under the same names
telecom_train, telecom_test = pd.DataFrame(telecom_train), pd.DataFrame(telecom_test)

In [3]:
def encode_churn_features(frame):
    """Return a copy of ``frame`` with the modelling columns added or recoded.

    Steps, in order:
      * ``churn_numb``: the ``Churn`` column cast to int (True/False -> 1/0).
      * ``International_plan`` and ``Voice_mail_plan``: 'Yes' -> 1, 'No' -> 0;
        any other value is left untouched.
      * ``total_charge``: day + evening + night + international charge.

    The input frame itself is not modified.
    """
    encoded = frame.assign(churn_numb=frame['Churn'].astype(int))

    # Assign the recoded column back instead of calling
    # replace(..., inplace=True) on a column selection: that form acts on a
    # temporary object and does not update the frame when pandas
    # copy-on-write is active.
    plan_codes = {'Yes': 1, 'No': 0}
    for plan_column in ('International_plan', 'Voice_mail_plan'):
        # infer_objects() gives an integer column when every value was recoded
        encoded[plan_column] = encoded[plan_column].replace(plan_codes).infer_objects()

    #Creating variable total_charge
    return encoded.assign(
        total_charge=(encoded['Total_day_charge']
                      + encoded['Total_eve_charge']
                      + encoded['Total_night_charge']
                      + encoded['Total_intl_charge']))


# Apply the same preparation to the training and the test frame
telecom_train = encode_churn_features(telecom_train)
telecom_test = encode_churn_features(telecom_test)

In [4]:
# Preview the first rows of the training frame to check the recoded plan columns and the new churn_numb / total_charge columns
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,churn_numb,total_charge
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False,0,75.56
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False,0,59.24
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False,0,52.09


In [5]:
# Preview the first rows of the test frame to check it received the same recoding and derived columns
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,churn_numb,total_charge
0,LA,117,408,0,0,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False,0,73.32
1,IN,65,415,0,0,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True,1,54.2
2,NY,161,415,0,0,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True,1,92.29
3,SC,111,415,0,0,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False,0,41.05
4,HI,49,510,0,0,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False,0,49.6


In [4]:
# Feature matrix (five candidate predictors) and binary churn target, both taken from the training frame
feature_columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
x = telecom_train.loc[:, feature_columns]
y = telecom_train.loc[:, 'churn_numb']

In [8]:
# One entry (array of feature importances) is appended per split and per model
rf_results = []
ada_results = []
gb_results = []

for i in range(0, 1000):
    # Stratified 80/20 split; seeding with the iteration index gives a
    # different split every round while keeping the whole run reproducible
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, stratify=y, random_state=i)

    # Random forest: fit and record the feature importances
    rf_md = RandomForestClassifier(
        n_estimators=500, max_depth=3, random_state=i).fit(x_train, y_train)
    rf_results.append(rf_md.feature_importances_)

    # AdaBoost: the base learner is passed positionally because its keyword
    # was renamed from base_estimator to estimator in scikit-learn 1.2 and
    # the old name was later removed; the positional form works with both.
    ada_md = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=3),
        n_estimators=500, learning_rate=.01, random_state=i).fit(x_train, y_train)
    ada_results.append(ada_md.feature_importances_)

    # Gradient boosting: fit and record the feature importances
    gb_md = GradientBoostingClassifier(
        max_depth=3, n_estimators=500, learning_rate=.01, random_state=i).fit(x_train, y_train)
    gb_results.append(gb_md.feature_importances_)
    

In [9]:
# One row per collected random-forest importance vector; columns are named in the order of the feature list used to build x
rf_importance_columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
a = pd.DataFrame(rf_results)
a.columns = rf_importance_columns
# Mean importance of each feature over all collected rows
a.mean(axis=0)

Account_length            0.019613
International_plan        0.143908
Voice_mail_plan           0.072898
total_charge              0.554556
Customer_service_calls    0.209026
dtype: float64

In [10]:
# One row per collected AdaBoost importance vector; columns are named in the order of the feature list used to build x
ada_importance_columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
b = pd.DataFrame(ada_results)
b.columns = ada_importance_columns
# Mean importance of each feature over all collected rows
b.mean(axis=0)

Account_length            0.298886
International_plan        0.249115
Voice_mail_plan           0.036094
total_charge              0.343370
Customer_service_calls    0.072535
dtype: float64

In [11]:
# One row per collected gradient-boosting importance vector; columns are named in the order of the feature list used to build x
gb_importance_columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
c = pd.DataFrame(gb_results)
c.columns = gb_importance_columns
# Mean importance of each feature over all collected rows
c.mean(axis=0)

Account_length            0.015177
International_plan        0.117566
Voice_mail_plan           0.131850
total_charge              0.571458
Customer_service_calls    0.163949
dtype: float64

In [None]:
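#Top 4 features kept: International_plan, Voice_mail_plan, total_charge, Customer_service_calls. Account_length is dropped: it has the lowest mean importance under random forest and gradient boosting, although AdaBoost ranks it second.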

In [5]:
# Remove Account_length from the feature matrix; errors='ignore' keeps this
# cell re-runnable after the column has already been dropped
x = x.drop(columns='Account_length', errors='ignore')

In [11]:
#Splitting the Data
# Stratified 80/20 hold-out split; the fixed random_state makes this single
# split repeatable across kernel restarts
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

for i in range (0,100):

    def expand_grid(dictionary):
        """Return a DataFrame with one row per combination of the given values.

        ``dictionary`` maps a column name to the list of values to try. The
        rows are the Cartesian product of those lists, in ``itertools.product``
        order, and the columns are named after the keys.
        """
        combinations = list(product(*dictionary.values()))
        return pd.DataFrame(combinations, columns=dictionary.keys())

    # Grid over the number of trees and the tree depth only
    dictionary = {
        'n_tree': [100, 500, 1000, 1500, 2000],
        'depth': [3, 5, 7],
    }
    parameters = expand_grid(dictionary)


    # Grid over number of trees, tree depth and learning rate, built with the
    # expand_grid helper defined at the top of this loop body (no need to
    # define it a second time).
    # NOTE(review): this assignment replaces the `parameters` frame built just
    # before it — presumably each grid was meant for a different model; confirm.
    dictionary = {'n_tree': [100, 500, 1000, 1500, 2000], 
                  'depth': [3, 5, 7], 
                  'learning_rate': [0.1, 0.01, 0.001]}
    parameters = expand_grid(dictionary)


    # Grid over number of trees, tree depth and learning rate, built with the
    # expand_grid helper defined at the top of this loop body (no need to
    # define it again).
    # NOTE(review): this grid has the same keys and values as the one
    # assigned to `parameters` immediately before it — presumably one grid
    # per boosting model was intended; confirm.
    dictionary = {'n_tree': [100, 500, 1000, 1500, 2000],
                  'depth': [3, 5, 7], 
                  'learning_rate': [0.1, 0.01, 0.001]}
    parameters = expand_grid(dictionary)