In [61]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import warnings
warnings.simplefilter(action= 'ignore', category=FutureWarning)


from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt



s3= boto3.resource('s3')
bucket_name= 'morgangant-bata-445-bucket'
bucket= s3.Bucket(bucket_name)

file_key1= 'churn-bigml-80.csv'
file_key2= 'churn-bigml-20.csv'


bucket_object1= bucket.Object(file_key1)
file_object1= bucket_object1.get()
file_content_stream1 = file_object1.get('Body')

bucket_object2= bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

#reading the datefile
telecom_train = pd.read_csv(file_content_stream1)
telecom_test= pd.read_csv(file_content_stream2)

In [62]:
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [10]:
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [63]:
telecom_train= pd.DataFrame(telecom_train)
telecom_test= pd.DataFrame(telecom_test)

In [64]:
#Changing Churn yes/no to 1/0
telecom_train['Churn'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['Churn'].replace(['Yes', 'No'], [1,0], inplace= True)

#Changing International_plan yes/no to 1/0
telecom_train['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Changing Voice_mail_plan yes/no to 1/0
telecom_train['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Creating variable Total_charge
telecom_train= telecom_train.assign(total_charge= telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge']+ telecom_train['Total_intl_charge'])
telecom_test= telecom_test.assign(total_charge= telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge']+ telecom_test['Total_intl_charge'])

In [65]:
telecom_train= telecom_train.drop(columns= ['Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls','Total_eve_charge', 'Total_night_minutes', 'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge'], axis=1)
telecom_test= telecom_test.drop(columns= ['Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls','Total_eve_charge', 'Total_night_minutes', 'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge'], axis=1)

In [66]:
#Setting x and y variables
x = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
y = telecom_train['Churn']

In [67]:
df= list()
for i in range(0,1000):
    #Splitting the Data
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, stratify= y)
    
    #Min Max Scaler
    scaler= MinMaxScaler()

    x_train= scaler.fit_transform(x_train)
    x_test= scaler.fit_transform(x_test)
    
    #Estimating lambda for lasso
    lasso_cv= LassoCV(normalize= True, cv=5).fit(x_train, y_train)

    #Extracting best lambda
    cv_lambda= lasso_cv.alpha_
    #print('Estimated lambda for the lasso model is:', cv_lambda)
    
    #Building lasso
    lasso_md= Lasso(alpha= cv_lambda, normalize= True).fit(x_train, y_train)
    df.append(lasso_md.coef_)

In [68]:
A = pd.DataFrame(df)
A

Unnamed: 0,0,1,2,3,4
0,0.000000,0.318555,-0.078533,0.486427,0.522987
1,0.023641,0.331287,-0.070859,0.503032,0.558501
2,0.022072,0.326699,-0.072188,0.563940,0.506953
3,0.006763,0.290194,-0.070219,0.475123,0.501206
4,0.035187,0.306683,-0.068674,0.508727,0.514689
...,...,...,...,...,...
995,0.001175,0.313153,-0.090782,0.544707,0.458635
996,0.009010,0.314223,-0.075183,0.501713,0.489456
997,0.045225,0.348680,-0.083690,0.546119,0.475931
998,0.000000,0.281366,-0.069369,0.490833,0.562716


In [69]:
print(sum(A[0] == 0.0))
print(sum(A[1] == 0.0))
print(sum(A[2] == 0.0))
print(sum(A[3] == 0.0))
print(sum(A[4] == 0.0))

257
0
0
0
0


In [70]:
#Dropping Account_length and Voice_mail_plan
telecom_train= telecom_train.drop(columns= ['Account_length'], axis=1)
telecom_test= telecom_test.drop(columns= ['Account_length'], axis=1)
telecom_train.head()

Unnamed: 0,State,International_plan,Voice_mail_plan,Customer_service_calls,Churn,total_charge
0,KS,0,1,1,False,75.56
1,OH,0,1,1,False,59.24
2,NJ,0,0,0,False,62.29
3,OH,1,0,2,False,66.8
4,OK,1,0,3,False,52.09


In [85]:
#Lists to store results
md1_results= list()
md2_results= list()
md3_results= list()
md4_results= list()


for i in range(0,100):
    
    kfold= StratifiedKFold(n_splits=5, shuffle=True)
   
    #Defining list to store
    md1_fold_result= list()
    md2_fold_result= list()
    md3_fold_result= list()
    md4_fold_result= list()
    
    for train_idx, val_idx in kfold.split(x,y):
        #Splitting data
        x_train, x_val= x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val= y.iloc[train_idx], y.iloc[val_idx]
         
        #Model 1 
        x1= x_train[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        x_val_1= x_val[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        #Min Max Scaler
        scaler=MinMaxScaler()
        x1=scaler.fit_transform(x1)
        x_val_1=scaler.fit_transform(x_val_1)
        #Building the linear model
        md1= LogisticRegression(solver='liblinear', penalty='l1').fit(x1, y_train)
        #Predicting on the validation set
        md1_pred= md1.predict(x_val_1)
        #Setting 10% cut-off
        md1_labels=np.where(md1_pred<0.10,0,1)
        #Computing recall score and storing it
        md1_recall= recall_score(y_val, md1_labels)
        md1_fold_result.append(md1_recall)
        #Reporting Average
        Print('Avg recall for model 1:', avg1 = sum(md1_fold_result)/len(md1_fold_result))
       
    
        #Model 2
        x2= x_train[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        x_val_2= x_val[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        #Min Max Scaler
        scaler=MinMaxScaler()
        x2=scaler.fit_transform(x2)
        x_val_2=scaler.fit_transform(x_val_2)
        #Building the linear model
        md2= LogisticRegression(solver='liblinear', penalty='l2').fit(x2, y_train)
        #Predicting on the validation set
        md2_pred= md2.predict(x_val_2)
        #Setting 10% cut-off
        md2_labels=np.where(md2_pred<0.10,0,1)
        #Computing recall score and storing it
        md2_recall= recall_score(y_val, md2_labels)
        md2_fold_result.append(md2_recall)


        #Model 3
        x3= x_train[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        x_val_3= x_val[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        #Min Max Scaler
        scaler=MinMaxScaler()
        x3=scaler.fit_transform(x3)
        x_val_3=scaler.fit_transform(x_val_3)
        #Building the linear model
        md3= LogisticRegression(solver='saga', penalty='l1').fit(x3, y_train)
        #Predicting on the validation set
        md3_pred= md3.predict(x_val_3)
        #Setting 10% cut-off
        md3_labels=np.where(md3_pred<0.10,0,1)
        #Computing recall score and storing it
        md3_recall= recall_score(y_val, md3_labels)
        md3_fold_result.append(md3_recall)


        #Model 4
        x4= x_train[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        x_val_4= x_val[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']] 
        #Min Max Scaler
        scaler=MinMaxScaler()
        x4=scaler.fit_transform(x4)
        x_val_4=scaler.fit_transform(x_val_4)
        #Building the linear model
        md4= LogisticRegression(solver='saga', penalty='l2').fit(x4, y_train)
        #Predicting on the validation set
        md4_pred= md4.predict(x_val_4)
        #Setting 10% cut-off
        md4_labels=np.where(md4_pred<0.10,0,1)
        #Computing recall score and storing it
        md4_recall= recall_score(y_val, md4_labels)
        md4_fold_result.append(md4_recall)
        
    md1_results.append(md1_recall)
    

ValueError: Found input variables with inconsistent numbers of samples: [100, 2666]

In [None]:
x = [i for i in range(0,100)]
plt.plot(x, md1_fold_result, color = 'blue')
plt.plot(x, md2__fold_result, color = 'orange')
plt.plot(x, md3_fold_result, color = 'red')
plt.plot(x, md4_fold_result, color = 'yellow')
plt.xlabel('Split Number')
plt.ylabel('Recall')
plt.legend(loc = 'upper right', labels = ['Model 1', 'Model 2', 'Model 3', 'Model 4'])
plt.grid()
plt.show();