In [37]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.svm import SVC

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-445'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key_1 = 'Demos/churn-bigml-80.csv'
file_key_2 = 'Demos/churn-bigml-20.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading the csv files
telecom_train = pd.read_csv(file_content_stream_1)
telecom_test = pd.read_csv(file_content_stream_2)

telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [38]:
## Changing Churn to 0-1
telecom_train['Churn'] = np.where(telecom_train['Churn'] == False, 0, 1)
telecom_test['Churn'] = np.where(telecom_test['Churn'] == False, 0, 1)

## Changing International_plan to 0-1
telecom_train['International_plan'] = np.where(telecom_train['International_plan'] == 'No', 0, 1)
telecom_test['International_plan'] = np.where(telecom_test['International_plan'] == 'No', 0, 1)

## Changing voice mail plan to 0-1
telecom_train['Voice_mail_plan'] = np.where(telecom_train['Voice_mail_plan'] == 'No', 0, 1)
telecom_test['Voice_mail_plan'] = np.where(telecom_test['Voice_mail_plan'] == 'No', 0, 1)

## Creating total charges
telecom_train['total_charge'] = telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge'] + telecom_train['Total_intl_charge']
telecom_test['total_charge'] = telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge'] + telecom_test['Total_intl_charge']

In [39]:
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,...,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,total_charge
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,...,16.78,244.7,91,11.01,10.0,3,2.7,1,0,75.56
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,...,16.62,254.4,103,11.45,13.7,3,3.7,1,0,59.24
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,...,10.3,162.6,104,7.32,12.2,5,3.29,0,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,...,12.61,186.9,121,8.41,10.1,3,2.73,3,0,52.09


In [40]:
## Defining initial input and targe variables
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge' , 'Customer_service_calls']]
Y = telecom_train['Churn']

## Transforming to 0-1
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

In [50]:
## Defining the list to estore coefficients
est_coeff = list()

for i in range(0, 1000):
    
    ## Splitting the data 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Transform the input data to 0-1
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    
    ## Running LASSO cross-validation to estimate optimal lambda
    lasso_cv = LassoCV(cv = 5).fit(X_train, Y_train)
    
    ## Building LASSO regression with optimal lambda
    lasso_md = Lasso(alpha = lasso_cv.alpha_).fit(X_train, Y_train)

    ## Storing estimated coefficients
    est_coeff.append(lasso_md.coef_) 

## Putting the list as data-frame
est_coeff = pd.DataFrame(est_coeff)    

In [51]:
def sum_zeros(X):
    return(sum(X == 0.0))

est_coeff.apply(sum_zeros, axis = 0)

0    282
1      0
2      0
3      0
4      0
dtype: int64

In [None]:
## Defining initial input and targe variables
X = telecom_train[['International_plan', 'Voice_mail_plan', 'total_charge' , 'Customer_service_calls']]
Y = telecom_train['Churn']

In [56]:
Y.value_counts()

0    2278
1     388
Name: Churn, dtype: int64

In [57]:
from sklearn.svm import SVC
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

In [61]:
for i in range(0, 10):
    
    kfold = StratifiedKFold(n_splits = 5, shuffle = True)

    for train_ix, test_ix in kfold.split(X, Y):
    
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
        
        ## Transformation inputs to 0-1 scale
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        #############
        ## Model 1 ##
        #############
        md1 = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)
        pred1 = md1.predict_proba(X_test)[:, 1]
        pred1 = np.where(pred1 < 0.1, 0, 1)
        print(recall_score(Y_test, pred1))
        
        
        

0.7564102564102564
0.974025974025974
0.8961038961038961
0.9102564102564102
0.8974358974358975
0.9102564102564102
0.8831168831168831
0.8051948051948052
0.9230769230769231
0.9615384615384616
0.9358974358974359
0.974025974025974
0.8441558441558441
0.8461538461538461
0.9102564102564102
0.9102564102564102
0.935064935064935
0.8701298701298701
0.8589743589743589
0.8717948717948718
0.8589743589743589
0.8181818181818182
0.935064935064935
0.8846153846153846
0.9487179487179487
0.9358974358974359
0.9090909090909091
0.9090909090909091
0.7948717948717948
0.8333333333333334
0.8333333333333334
0.922077922077922
0.8831168831168831
0.8461538461538461
0.9743589743589743
0.8717948717948718
0.8701298701298701
0.922077922077922
0.9743589743589743
0.8717948717948718
0.9487179487179487
0.9090909090909091
0.8441558441558441
0.8589743589743589
0.8461538461538461
0.8974358974358975
0.8961038961038961
0.922077922077922
0.9230769230769231
0.8974358974358975
