In [1]:
from collections import Counter

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

pd.set_option('max_columns', 500)

In [26]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [27]:
def print_metrics(label, pred):
    print('Accuracy: ', accuracy_score(label, pred))
    print('Precision: ', precision_score(label, pred))
    print('Recall: ', recall_score(label, pred))
    print('F1: ', f1_score(label, pred))

In [28]:
df = pd.read_csv('preprocessed.csv',index_col=0)

In [29]:
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,est_num_payments,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,payment_diff_No_Diff,payment_diff_Positive
0,0.0,3.396185,3.396185,0,0.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
1,3.526361,4.042174,7.544068,0,3.496508,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1
2,0.693147,3.986202,4.683519,1,0.693147,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
3,3.806662,3.744787,7.517928,0,3.7612,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1
4,0.693147,4.258446,5.021575,1,0.693147,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0


# Logistic Regression (baseline)

In [30]:
y = df['Churn']
X = df.drop(['Churn'],axis=1)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=0)

In [32]:
logreg_cv = LogisticRegressionCV(solver = 'liblinear', max_iter=1000, cv=10, class_weight = 'balanced')

logreg_cv.fit(X_train, y_train)

y_hat_train = logreg_cv.predict(X_train)
y_hat_test = logreg_cv.predict(X_test)

In [33]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7812914220791517
Precision:  0.5875090777051561
Recall:  0.579512893982808
F1:  0.5834835917778579

Accuracy:  0.7705848949460534
Precision:  0.5738758029978587
Recall:  0.5665961945031712
F1:  0.5702127659574467


In [34]:
tpr, fpr, thresholds = roc_curve(y_test, y_hat_test,)

In [35]:
auc(tpr, fpr)

0.7060465444565546

# Logistic Regression with Lasso

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
scaler = StandardScaler()
y = y
X_scaled = scaler.fit_transform(X)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=.25, random_state=0)

In [39]:
logreg_cv = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced', penalty='l1')

logreg_cv.fit(X_train, y_train)

y_hat_train = logreg_cv.predict(X_train)
y_hat_test = logreg_cv.predict(X_test)

In [40]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7561067979549327
Precision:  0.5249537892791127
Recall:  0.8137535816618912
F1:  0.6382022471910112

Accuracy:  0.7455990914253265
Precision:  0.517193947730399
Recall:  0.7949260042283298
F1:  0.6266666666666667


# Logistic Regression with Ridge

In [41]:
scaler = StandardScaler()
y = y
X_scaled = scaler.fit_transform(X)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=0)

In [43]:
logreg_cv = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced', penalty='l2')

logreg_cv.fit(X_train, y_train)

y_hat_train = logreg_cv.predict(X_train)
y_hat_test = logreg_cv.predict(X_test)

In [44]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7812914220791517
Precision:  0.5875090777051561
Recall:  0.579512893982808
F1:  0.5834835917778579

Accuracy:  0.7705848949460534
Precision:  0.5738758029978587
Recall:  0.5665961945031712
F1:  0.5702127659574467


# Recursive Feature Elimination

In [45]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

In [46]:
scaler = StandardScaler()
y=y
X_scaled = scaler.fit_transform(X)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.25, random_state=0)

In [48]:
num_features = 15
model = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced',scoring='f1')
rfe_stand = RFE(model, n_features_to_select=num_features,verbose=1,)
fit_stand = rfe_stand.fit(X_train, y_train)

print("Std Model Feature Ranking:", fit_stand.ranking_)

score_stand = rfe_stand.score(X_train,y_train,)
print("Standardized Model Score with selected features is: %f (%f)" % (score_stand.mean(), score_stand.std()))

Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Std Model Feature Ranking: [ 8 15  1  1 19  5 17 14 11  9  1  1  2  1  3  1 16  4 18  1 10  1  1  1
  1  1  1  1  6  1  7 13 12]
Standardized Model Score with selected features is: 0.635546 (0.000000)


In [49]:
feature_names = np.array(X.columns)
print('Most important features (RFE): %s'% feature_names[rfe_stand.support_])

Most important features (RFE): ['TotalCharges' 'est_num_payments' 'MultipleLines_Yes'
 'InternetService_Fiber optic' 'OnlineSecurity_No internet service'
 'OnlineBackup_No internet service' 'TechSupport_No internet service'
 'StreamingTV_No internet service' 'StreamingTV_Yes'
 'StreamingMovies_No internet service' 'StreamingMovies_Yes'
 'Contract_One year' 'Contract_Two year' 'PaperlessBilling_Yes'
 'PaymentMethod_Electronic check']


In [50]:
X_imp = X[feature_names[rfe_stand.support_]]
y = y

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=0)

In [51]:
logreg_cv = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced', penalty='l1')

logreg_cv.fit(X_train, y_train)

y_hat_train = logreg_cv.predict(X_train)
y_hat_test = logreg_cv.predict(X_test)

In [52]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7604620337057375
Precision:  0.5346744309158284
Recall:  0.7234957020057307
F1:  0.6149162861491628

Accuracy:  0.7461669505962522
Precision:  0.5198776758409785
Recall:  0.718816067653277
F1:  0.6033717834960071


# VIF Analysis

In [53]:
y = df['Churn']
X = df.drop(['Churn'],axis=1)

In [54]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

viffactor = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif = pd.DataFrame()
vif["VIF Factor"] = viffactor
#[variance_inflation_factor(df_features.values, i) for i in range(df_features.shape[1])]
vif["features"] = X.columns
vif = vif.round(1).sort_values("VIF Factor",ascending=True)
worst_features = vif[vif['VIF Factor']>20]['features']

  vif = 1. / (1. - r_squared_i)


In [55]:
X = X.drop(worst_features,axis=1)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=0)

In [57]:
logreg_cv = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced')

logreg_cv.fit(X_train, y_train)

y_hat_train = logreg_cv.predict(X_train)
y_hat_test = logreg_cv.predict(X_test)

In [58]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7358454838098845
Precision:  0.5002257336343116
Recall:  0.7936962750716332
F1:  0.6136804209360288

Accuracy:  0.737649063032368
Precision:  0.5075653370013755
Recall:  0.7801268498942917
F1:  0.615


# Polynomial Interactions

In [59]:
from sklearn.preprocessing import PolynomialFeatures

In [60]:
scaler = StandardScaler()
y = df['Churn']
X = df.drop(['Churn'],axis=1)
X_scaled = scaler.fit_transform(X)

In [61]:
X.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,est_num_payments,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,payment_diff_No_Diff,payment_diff_Positive
0,0.0,3.396185,3.396185,0.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
1,3.526361,4.042174,7.544068,3.496508,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1
2,0.693147,3.986202,4.683519,0.693147,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
3,3.806662,3.744787,7.517928,3.7612,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1
4,0.693147,4.258446,5.021575,0.693147,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0


In [62]:
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only = False)
poly_data = poly.fit_transform(X_scaled)
poly_columns = poly.get_feature_names(X.columns)
X_poly = pd.DataFrame(poly_data, columns=poly_columns)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=.25, random_state=0)

In [64]:
#use sklearn package to find the best 20 features
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

selector = SelectKBest(f_regression, k=500)

selector.fit(X_train, y_train)

SelectKBest(k=500, score_func=<function f_regression at 0x124261dc0>)

In [65]:
selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

In [66]:
logreg_cv = LogisticRegressionCV(solver='liblinear',max_iter=1000, cv=10, class_weight='balanced', penalty='l1', n_jobs=-1, verbose=True)

logreg_cv.fit(X_train[selected_columns], y_train)

y_hat_train = logreg_cv.predict(X_train[selected_columns])
y_hat_test = logreg_cv.predict(X_test[selected_columns])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 18.1min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 23.2min finished


In [67]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.7559174398788109
Precision:  0.5250585480093677
Recall:  0.8030085959885387
F1:  0.6349476069102238

Accuracy:  0.7444633730834753
Precision:  0.5155195681511471
Recall:  0.8076109936575053
F1:  0.6293245469522241


# Running Multiple

In [68]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 0)

classifiers = [['DecisionTree :',DecisionTreeClassifier()],
               ['RandomForest :',RandomForestClassifier()], 
               ['Naive Bayes :', GaussianNB()],
               ['KNeighbours :', KNeighborsClassifier()],
               ['SVM :', SVC()],
               ['Neural Network :', MLPClassifier()],
               ['LogisticRegression :', LogisticRegression()],
               ['ExtraTreesClassifier :', ExtraTreesClassifier()],
               ['AdaBoostClassifier :', AdaBoostClassifier()],
               ['GradientBoostingClassifier: ', GradientBoostingClassifier()]]

predictions_df = pd.DataFrame()
predictions_df['actual_labels'] = y_test

for name,classifier in classifiers:
    classifier = classifier
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_df[name.strip(" :")] = predictions
    print(name, accuracy_score(y_test, predictions), precision_score(y_test, predictions), recall_score(y_test,predictions),f1_score(y_test,predictions))

DecisionTree : 0.7397065783246569 0.5138888888888888 0.5229681978798587 0.5183887915936952
RandomForest : 0.7823000473260767 0.6338383838383839 0.44346289752650175 0.5218295218295218
Naive Bayes : 0.6601987695220066 0.43177737881508077 0.8498233215547704 0.5726190476190476
KNeighbours : 0.7610033128253668 0.5596868884540117 0.5053003533568905 0.531104921077066
SVM : 0.7879791765262659 0.6755952380952381 0.4010600706713781 0.5033259423503327


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Neural Network : 0.7960246095598675 0.6483516483516484 0.5212014134275619 0.5778648383937317
LogisticRegression : 0.8035967818267865 0.6735632183908046 0.5176678445229682 0.5854145854145854
ExtraTreesClassifier : 0.767628963558921 0.5894988066825776 0.4363957597173145 0.5015228426395939
AdaBoostClassifier : 0.7960246095598675 0.6523702031602708 0.5106007067137809 0.572844400396432
GradientBoostingClassifier:  0.7960246095598675 0.6544622425629291 0.5053003533568905 0.5702891326021935


# Voting Classifier

In [70]:
scaler = StandardScaler()
y = y
X_Scaled = scaler.fit_transform(X)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=.25, random_state=0)

In [72]:
from sklearn.ensemble import VotingClassifier
clf1 = KNeighborsClassifier()
clf2 = LogisticRegressionCV(solver='liblinear', class_weight = 'balanced')
clf3 = RandomForestClassifier()
clf4 = DecisionTreeClassifier() 
eclf1 = VotingClassifier(estimators=[('kn', clf1), ('lr', clf2), ('rnf', clf3), ('dtc', clf4)], voting='soft')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print_metrics(y_test, predictions)

Accuracy:  0.7660420215786485
Precision:  0.5673289183222958
Recall:  0.5433403805496829
F1:  0.5550755939524837


# Voting Classifier with multiple Logistic Regression

In [82]:
C_param_range = [0.001,0.01,0.1,1,10]
titles = ['lr_0_001', 'lr_0_01', 'lr_0_1', 'lr_1', 'lr_10']

params = dict(zip(titles, C_param_range)) 
models = {}

table = pd.DataFrame(columns = ['C_parameter','Accuracy','Precision','Recall','F1'])
table['C_parameter'] = C_param_range
j = 0

for k , v  in params.items():
    
    # Create model using different value for c  
    lr = LogisticRegression(solver='liblinear',penalty = 'l1', C = v, random_state = 1, class_weight='balanced')
    
    #save the model to a dictionary to use later in our voting classifiers
    models[k]= lr
    
    #the steps below this point are unnecessary in order to create a voting classifier, 
    #but it is easy to fit the model and see how performance changes for different levels of regularization
    lr.fit(X_train, y_train)
    
    # Predict using model
    y_preds = lr.predict(X_test)

    # Saving accuracy score in table
    table.iloc[j,1] = accuracy_score(y_test, y_preds)
    table.iloc[j,2] = precision_score(y_test, y_preds)
    table.iloc[j,3] = recall_score(y_test, y_preds)
    table.iloc[j,4] = f1_score(y_test, y_preds)
    j += 1


In [83]:
table

Unnamed: 0,C_parameter,Accuracy,Precision,Recall,F1
0,0.001,0.678024,0.434358,0.657505,0.523129
1,0.01,0.725724,0.493719,0.830867,0.619385
2,0.1,0.745031,0.51626,0.805497,0.629232
3,1.0,0.746735,0.518724,0.790698,0.626466
4,10.0,0.745031,0.516575,0.790698,0.624896


In [84]:
lr_voting = VotingClassifier(estimators=list(models.items()), 
                              voting='hard')

lr_voting.fit(X_train, y_train)

lrv_preds = lr_voting.predict(X_test)

lrv_f1 = f1_score(y_test, lrv_preds)

print(lrv_f1)

0.629232039636664


# Bagging Model

In [73]:
scaler = StandardScaler()
y = y
X_Scaled = scaler.fit_transform(X)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=.25, random_state=0)

In [75]:
from sklearn.ensemble import BaggingClassifier

In [76]:
bc_lr = BaggingClassifier(
            base_estimator=LogisticRegression(solver='liblinear',random_state = 1, class_weight='balanced'), 
            n_estimators= 1000,
            max_samples= 0.8,
            max_features= 6,
            oob_score= True
                )

In [77]:
bc_lr.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(class_weight='balanced',
                                                    random_state=1,
                                                    solver='liblinear'),
                  max_features=6, max_samples=0.8, n_estimators=1000,
                  oob_score=True)

In [78]:
# Use the oob_score to get some idea of how the model performs on a validation set

bc_lr.oob_score_

0.7508047718235182

In [79]:
# See how the model performs on the test set

bc_lr_preds = bc_lr.predict(X_test)

bc_lr_f1 = f1_score(y_test, bc_lr_preds)

print(bc_lr_f1)

0.6133108677337826


# Random Forest

In [80]:
scaler = StandardScaler()
y = y
X_Scaled = scaler.fit_transform(X)

In [81]:
# Instantiate the classifier using 100 trees
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [82]:
#fit the model to the training data
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [83]:
#use the fitted model to predict on the test data
rfc_preds = rfc.predict(X_test)

rfc_f1 = f1_score(y_test, rfc_preds)

# checking accuracy on the test data
print('Test F1 score: ', rfc_f1)

Test F1 score:  0.5339925834363413


# Gridsearch with Random Forest

In [84]:
from sklearn.model_selection import GridSearchCV

In [85]:
#create a dictionary of all the parameters you want to tune
param_grid = { 
    'n_estimators': [100,500,1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2,15)),
    'max_features': list(range(3,15))
}

In [86]:
#create a grid search object and fit it to the data

grid_tree=GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [None]:
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 936 candidates, totalling 4680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s


In [113]:

# Single best score achieved across all params (min_samples_split)
print(grid_tree.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_tree.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_tree.best_estimator_)
#Identify the best score during fitting with cross-validation


0.5841535748205636
{'criterion': 'gini', 'max_depth': 11, 'max_features': 12, 'n_estimators': 100}
RandomForestClassifier(max_depth=11, max_features=12)


# Logistic Regression Using SMOTE

In [12]:
from imblearn.over_sampling import SMOTE

In [128]:
y=y
X=X_scaled

In [129]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [130]:
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [132]:
#create a dictionary of all the parameters you want to tune
param_grid = { 
    'Cs': [.1,1,10,100],
    'solver': ['liblinear','lbfgs','sag'],
    'penalty': [None, 'l1', 'l2']
}

In [133]:
#create a grid search object and fit it to the data

grid_logreg=GridSearchCV(LogisticRegressionCV(), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [136]:
# smote_dt.fit(X_train, y_train)
grid_logreg.fit(X_train, y_train)

y_hat_train = smote_lr.predict(X_train)
y_hat_test = smote_lr.predict(X_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.2min finished


In [138]:
y_hat_train = grid_logreg.predict(X_train)
y_hat_test = grid_logreg.predict(X_test)

In [139]:
print_metrics(y_train, y_hat_train)
print_metrics(y_test, y_hat_test)

Accuracy:  0.7840411840411841
Precision:  0.7559730920900023
Recall:  0.8388674388674389
F1:  0.7952659834065399
Accuracy:  0.7433276547416241
Precision:  0.5141318977119784
Recall:  0.8076109936575053
F1:  0.6282894736842105


In [141]:
y_hat_test_values = Counter(y_hat_test)
y_hat_test_values

Counter({1: 743, 0: 1018})

In [145]:
grid_logreg.best_estimator_.get_params

<bound method BaseEstimator.get_params of LogisticRegressionCV(Cs=100, penalty='l1', solver='liblinear')>

In [146]:
new_logreg = LogisticRegressionCV(Cs=300, solver = 'liblinear', penalty='l1')

In [147]:
new_logreg.fit(X_train, y_train)

y_hat_train = new_logreg.predict(X_train)
y_hat_test = new_logreg.predict(X_test)

KeyboardInterrupt: 

# XGBoost

In [22]:
#import the appropriate packages
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

In [23]:
scaler = StandardScaler()
y = df['Churn']
X = df.drop(['Churn'],axis=1)
X_scaled = scaler.fit_transform(X)

In [24]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=0)

In [19]:
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [25]:
xg_clf = xgb.XGBClassifier(objective ='binary:logistic', 
                           colsample_bytree = 0.5, 
                           subsample = 0.5,
                           learning_rate = 0.1,
                           max_depth = 4, 
                           alpha = 1, 
                           #scale_pos_weight= titanic['Survived'].mean(),
                           n_estimators = 1000)

In [26]:
xg_clf.fit(X_train,y_train)

y_hat_train = xg_clf.predict(X_train)
y_hat_test = xg_clf.predict(X_test)


print_metrics(y_train, y_hat_train)
print_metrics(y_test, y_hat_test)

Accuracy:  0.9096761976898314
Precision:  0.8667198723064645
Recall:  0.7779369627507163
F1:  0.8199320498301246
Accuracy:  0.787052810902896
Precision:  0.6206896551724138
Recall:  0.53276955602537
F1:  0.5733788395904437


# XGBoost with GridsearchCV

In [53]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [54]:
scaler = StandardScaler()
y = df['Churn']
X = df.drop(['Churn'],axis=1)
X_scaled = scaler.fit_transform(X)

In [55]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=0)

In [56]:
#create a dictionary of all the parameters you want to tune
param_grid = { 
    'scale_pos_weights': [1,10,25,50,75,100]
}

In [60]:
model = xgb.XGBClassifier(scale_pos_weight=99, early_stopping_rounds=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [61]:
# smote_dt.fit(X_train, y_train)
grid_xg.fit(X_train, y_train)

y_hat_train = grid_xg.predict(X_train)
y_hat_test = grid_xg.predict(X_test)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.1s finished


Parameters: { scale_pos_weights } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [62]:
print_metrics(y_train, y_hat_train)
print_metrics(y_test, y_hat_test)

Accuracy:  0.9424351448589282
Precision:  0.9056463595839525
Recall:  0.8732091690544412
F1:  0.889132020423049
Accuracy:  0.78137421919364
Precision:  0.606280193236715
Recall:  0.5306553911205074
F1:  0.5659526493799324


# Big GridSearchCV

In [13]:
scaler = StandardScaler()
y = df['Churn']
X = df.drop(['Churn'],axis=1)
X_scaled = scaler.fit_transform(X)

In [14]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
max_feature = ['auto', 'sqrt']

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1, 2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

grid = GridSearchCV(RandomForestClassifier(), random_grid, cv=3, verbose=True, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 5760 candidates, totalling 17280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 26.4min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 35.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 46.6min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 57.1min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 68.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 81.4min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 95.3min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 110.4min
[Parallel(n_jobs=-1)]: Done 11234 tasks      

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       110, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [1, 2, 5, 10],
                         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400,
                                          1600, 1800, 2000]},
             verbose=True)

In [16]:
y_hat_train = grid.predict(X_train)
y_hat_test = grid.predict(X_test)

In [17]:
print_metrics(y_train, y_hat_train)
print()
print_metrics(y_test, y_hat_test)

Accuracy:  0.8492709714069305
Precision:  0.7814258911819888
Recall:  0.5967048710601719
F1:  0.676685621445979

Accuracy:  0.7961385576377058
Precision:  0.6565934065934066
Recall:  0.5052854122621564
F1:  0.5710872162485064


In [23]:
import pickle

In [24]:
with open('grid_rfc', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(grid.best_estimator_, f, pickle.HIGHEST_PROTOCOL)