## Modelling

**With autoreload enabled, edited local packages are re-imported automatically, so there is no need to reload them by hand after every change.**

In [1]:
# Load IPython's autoreload extension so that edited modules can be re-imported without restarting the kernel.
%load_ext autoreload

In [2]:
autoreload 2

**import necessary libraries**

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import sys
import warnings

**set warnings options**

In [4]:
warnings.filterwarnings('ignore')

#### Add the project's `src` directory to `sys.path` so that its modules can be imported directly

In [5]:
# Make the project's `src` directory importable.
# The location can be overridden through the OFFER_PREDICTOR_SRC environment variable so the
# notebook also runs where the repository lives elsewhere; the original developer path stays
# the default, so existing setups behave exactly as before.
import os

_project_src = os.environ.get('OFFER_PREDICTOR_SRC',
                              '/Users/mjasiecz/PycharmProjects/new_offer_success_predictor/src')
# Keep the entry at the front of sys.path without stacking duplicates when the cell is re-run.
if sys.path[:1] != [_project_src]:
    sys.path.insert(0, _project_src)

**set pandas options**

In [6]:
# Display settings: no limit on the number of rows or columns shown, and cell text up to 200 characters.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

**balancing dataset**

In [7]:
# from imblearn.over_sampling import SMOTE
# from imblearn.combine import SMOTEENN

**import packages for managing and preprocessing data**

In [8]:
from data.data_manager import DataManager
from data.data_processor import DataProcessor, TestDataProcessor
from ml_preprocessing.categorical_encoders import LeaveOneOutEncoder
from ml_preprocessing.encoding_helpers import LOOGridSearchCV

Using TensorFlow backend.


**import scores function**

In [9]:
from utils.scores_function import scores_function

**load data pipeline**

In [10]:
# Obtain the raw train and test frames through the project's DataManager.
DM = DataManager()
train_df, test_df = DM.load_train_test()

**ml data preprocessing**

In [11]:
# Engineer features on the training frame first, then on the test frame.
# The test processor receives both the raw and the processed training frames
# (presumably so that test-time transforms reuse training data — TODO confirm in data_processor).
DP = DataProcessor(train_df=train_df)
processed_train_df = DP.perform_initial_features_engineering()
TDP = TestDataProcessor(not_processed_train_df=train_df,
                        processed_train_df=processed_train_df,
                        test_df=test_df,
                        sneaky_peaky=True)  # NOTE(review): the meaning of this flag is not visible here — confirm
processed_test_df = TDP.perform_initial_features_engineering()

Imputing row 1/1031 with 1 missing, elapsed time: 0.248
Imputing row 1/1289 with 0 missing, elapsed time: 0.375
Imputing row 1033/1289 with 0 missing, elapsed time: 0.377


In [12]:
# Preview the first rows of the engineered training frame.
processed_train_df.head()

Unnamed: 0_level_0,cc_len,cc_startswith,log_salary,log_age_knn,log_estimated_expenses_knn,log_offer_value_knn,scaled_salary,scaled_age_knn,scaled_estimated_expenses_knn,scaled_offer_value_knn,nan_age,not_nan_age,target,offer_class,gender,age,phone_calls,emails,salary,customer_type,center,age_knn,estimated_expenses_knn,offer_value_knn
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
8550AB469CB2445,ELSE,ELSE,7.329474,3.401259,8.882808,5.71694,-0.34553,-0.297159,1.36225,3.035567,True,False,False,Premium,female,,1,2,1524.58,C,B,30.001848,7207.0,303.973257
07355EE27DD1493,ELSE,ELSE,7.17012,3.465736,8.209308,4.160055,-0.389281,-0.111655,-0.492006,-1.109564,False,True,False,High,male,32.0,0,0,1300.0,S,A,32.0,3675.0,64.075055
034E73A251554F0,ELSE,ELSE,7.34601,3.177471,8.4659,4.687503,-0.340578,-0.855654,0.072355,-0.340546,True,False,True,Premium,female,,1,1,1550.0,Q,B,23.986022,4750.0,108.58175
0AF961B4AC7A439,ELSE,ELSE,6.583299,3.323884,7.244942,5.04662,-0.501704,-0.504546,-1.685828,0.470071,True,False,False,Premium,male,,0,1,722.92,C,A,27.767979,1401.0,155.495957
8535BBCA690A4AE,ELSE,ELSE,8.639336,3.405772,7.652546,4.798852,0.458072,-0.284561,-1.315712,-0.119564,True,False,True,Premium,male,,0,1,5649.58,S,B,30.137541,2106.0,121.371033


In [13]:
# Preview the first rows of the engineered test frame.
processed_test_df.head()

Unnamed: 0_level_0,cc_len,cc_startswith,log_salary,log_age_knn,log_estimated_expenses_knn,log_offer_value_knn,scaled_salary,scaled_age_knn,scaled_estimated_expenses_knn,scaled_offer_value_knn,nan_age,not_nan_age,target,age,center,customer_type,emails,gender,offer_class,phone_calls,salary,age_knn,estimated_expenses_knn,offer_value_knn
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
DA3BAEB8BF604EB,ELSE,ELSE,6.594755,3.489576,8.940891,4.9636,-0.501839,-0.035049,1.603528,0.246935,True,False,False,,A,S,1,male,Premium,0,731.25,32.772049,7638.0,143.108044
AEF3DE08DFED4E0,ELSE,ELSE,6.656084,3.433987,8.662851,4.153202,-0.492946,-0.197055,0.631794,-1.130746,False,True,False,31.0,A,S,0,male,Premium,0,777.5,31.0,5784.0,63.637416
B621BB29484E46D,ELSE,ELSE,6.671501,3.218876,8.434898,5.348081,-0.490623,-0.745594,0.013846,1.410101,False,True,False,25.0,B,S,1,male,Premium,0,789.58,25.0,4605.0,210.204552
2D0945802F92423,ELSE,ELSE,6.514713,2.890372,8.40268,4.876044,-0.512654,-1.385557,-0.062677,0.038956,False,True,False,18.0,B,Q,1,female,Premium,0,675.0,18.0,4459.0,131.110924
640ABFC7E49B403,58,ELSE,8.029296,4.025352,8.38845,5.193064,-0.052226,2.088525,-0.095697,0.886816,False,True,False,56.0,B,C,1,male,Medium,0,3069.58,56.0,4396.0,180.019199


**verify that no record appears in both the processed train and test sets**

In [14]:
# Sanity check: the train and test indices must not share any entry.
assert set(processed_train_df.index).isdisjoint(processed_test_df.index)

**sklearn modelling**

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, make_scorer

**sklearn models which have predict_proba method**

In [16]:
# `all_estimators` is exposed from `sklearn.utils` in current scikit-learn releases; the old
# `sklearn.utils.testing` location was deprecated and later removed. Fall back to it only
# on old installations that do not yet provide the new import path.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

estimators = all_estimators()

# Print the name of every estimator class that exposes a `predict_proba` attribute.
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
ClassifierChain
ComplementNB
DecisionTreeClassifier
DummyClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
GridSearchCV
HistGradientBoostingClassifier
KNeighborsClassifier
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultiOutputClassifier
MultinomialNB
NuSVC
OneVsRestClassifier
Pipeline
QuadraticDiscriminantAnalysis
RFE
RFECV
RandomForestClassifier
RandomizedSearchCV
SGDClassifier
SVC
VotingClassifier
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor




In [17]:
# Columns handed to the leave-one-out encoder and to the grid-search helper.
columns_to_encode = [
    'offer_class',
    'gender',
    'customer_type',
    'center',
    'phone_calls',
    'cc_len',
    'cc_startswith',
]

In [18]:
# Configure the project's leave-one-out encoder with the processed train/test frames,
# the columns to encode and the name of the target column.
# NOTE(review): `mean` and `std` look like parameters of random noise applied during
# encoding (seeded by `random_state`) — confirm in ml_preprocessing.categorical_encoders.
enc = LeaveOneOutEncoder(train_df=processed_train_df,
                         test_df=processed_test_df,
                         columns_to_encode=columns_to_encode,
                         target_column='target',
                         random_state=42,
                         mean=1,
                         std=0.05)

In [19]:
# fit() returns a pair; the first element is discarded and only the second one,
# kept as the encoded test frame, is used in this notebook.
_, test_df_encoded = enc.fit()

In [20]:
# Move the index into a regular column once, then split features from the target.
_test_flat = test_df_encoded.reset_index()
X_test = _test_flat.drop(columns=['target'])
y_test = _test_flat[['target']]

In [21]:
# Training target as a one-column frame with a fresh positional index
# (the original `name` index is discarded).
y_train = processed_train_df[['target']].reset_index(drop=True)

In [22]:
# List every column available in the processed training frame.
print(processed_train_df.columns.values)

['cc_len' 'cc_startswith' 'log_salary' 'log_age_knn'
 'log_estimated_expenses_knn' 'log_offer_value_knn' 'scaled_salary'
 'scaled_age_knn' 'scaled_estimated_expenses_knn' 'scaled_offer_value_knn'
 'nan_age' 'not_nan_age' 'target' 'offer_class' 'gender' 'age'
 'phone_calls' 'emails' 'salary' 'customer_type' 'center' 'age_knn'
 'estimated_expenses_knn' 'offer_value_knn']


In [23]:
# Array of all processed training column names (it includes 'target'); used below to
# show which columns remain after each drop list is applied.
all_train_features = processed_train_df.columns.values

#### estimators

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Base estimators with default settings; hyper-parameters come from the parameter grids
# defined in the following cells. The same four instances are passed to every grid search below.
knn_clf = KNeighborsClassifier()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
log_clf = LogisticRegression()

In [26]:
# Search space for the k-nearest-neighbours classifier.
knn_param_grid = {
    'n_neighbors': list(np.arange(2, 21, 2)),  # 2, 4, ..., 20
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski'],
    'p': list(np.arange(1, 5)),                # Minkowski exponents 1..4
}

In [27]:
# Search space for the decision tree.
dt_param_grid = {
    'max_depth': list(np.arange(2, 21, 2)),               # 2, 4, ..., 20
    'min_samples_split': list(np.linspace(0.1, 1, 10)),   # fractions 0.1 .. 1.0
    'max_features': list(np.arange(2, 9, 2)),             # 2, 4, 6, 8
    'min_samples_leaf': list(np.linspace(0.1, 0.5, 5)),   # fractions 0.1 .. 0.5
    'random_state': [42],
}

In [28]:
# Search space for the random forest.
rf_param_grid = {
    'n_estimators': [4, 8, 10, 20, 30, 50, 70, 90, 100, 200],
    'max_depth': list(np.arange(2, 11, 2)),     # 2, 4, 6, 8, 10
    'max_features': list(np.arange(2, 9, 2)),   # 2, 4, 6, 8
    'random_state': [42],
}

In [29]:
# Search space for logistic regression: candidate values for the `C` parameter.
_log_c_values = [0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 1, 2, 5, 10]
log_param_grid = {
    'C': _log_c_values,
    'random_state': [42],
}

**first set of chosen features**

In [30]:
# Feature set 1: drop the raw and scaled numerics and every age value, leaving the
# log-transformed salary / expenses / offer value plus the age-missing flags.
columns_to_drop_from_training1 = [
    'salary',
    'scaled_salary',
    'scaled_age_knn',
    'scaled_estimated_expenses_knn',
    'scaled_offer_value_knn',
    'estimated_expenses_knn',
    'offer_value_knn',
    'age_knn',
    'log_age_knn',
    'age',
]

In [31]:
# Columns that remain once drop list 1 is applied.
set(all_train_features) - set(columns_to_drop_from_training1)

{'cc_len',
 'cc_startswith',
 'center',
 'customer_type',
 'emails',
 'gender',
 'log_estimated_expenses_knn',
 'log_offer_value_knn',
 'log_salary',
 'nan_age',
 'not_nan_age',
 'offer_class',
 'phone_calls',
 'target'}

In [32]:
# Column names for the one-hot encoded `emails` levels 0..5 ('1_email' is the only singular).
email_ohe_names = {count: '{}_email{}'.format(count, '' if count == 1 else 's')
                   for count in range(6)}

# Test frame variant in which `emails` is replaced by one indicator column per observed level.
_email_dummies = pd.get_dummies(test_df_encoded['emails'])
test_df_encoded_ohemails = (
    pd.concat([test_df_encoded, _email_dummies], axis=1)
    .rename(columns=email_ohe_names)
    .drop(columns=['emails'])
)

In [33]:
# Test frame variant in which `emails` is kept as a single integer column.
test_df_encoded_numemails = test_df_encoded.assign(
    emails=test_df_encoded['emails'].astype(int)
)

In [34]:
# KNN grid search on feature set 1. No pre-built splits are passed here; the later
# set-1 searches reuse this object's Xs_train / ys_train / Xs_val / ys_val / encoded_df_.
LOOGS1_knn = LOOGridSearchCV(train_df=processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training1)

In [35]:
# Run the search; the returned collection is stored under a ROC-AUC name and its maximum is printed below.
knn1_roc_auc = LOOGS1_knn.grid_search()

In [36]:
# Select from the one-hot-emails test frame exactly the columns of the processed training frame, in the same order.
test1 = test_df_encoded_ohemails[LOOGS1_knn.processed_train().columns.values]

In [37]:
# Processed training frame for feature set 1, as produced by the grid-search helper.
train1 = LOOGS1_knn.processed_train()

In [38]:
# Highest value among the scores returned by the KNN search on feature set 1.
print(max(knn1_roc_auc))

0.7094290390318794


In [39]:
# Decision-tree grid search on feature set 1, fed with the splits and encoded frame already
# held by LOOGS1_knn (presumably so they are not recomputed and all models see the same
# folds — confirm in LOOGridSearchCV).
LOOGS1_dt = LOOGridSearchCV(train_df=processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training1, 
                        Xs_train=LOOGS1_knn.Xs_train,
                        ys_train=LOOGS1_knn.ys_train,
                        Xs_val=LOOGS1_knn.Xs_val,
                        ys_val=LOOGS1_knn.ys_val,
                        encoded_df=LOOGS1_knn.encoded_df_)

In [40]:
dt1_roc_auc = LOOGS1_dt.grid_search()

In [41]:
print(max(dt1_roc_auc))

0.711391678946292


In [42]:
# Random-forest grid search on feature set 1, reusing the splits and encoded frame held by LOOGS1_knn.
LOOGS1_rf = LOOGridSearchCV(train_df=processed_train_df, model=rf_clf, params_grid=rf_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training1, 
                        Xs_train=LOOGS1_knn.Xs_train,
                        ys_train=LOOGS1_knn.ys_train,
                        Xs_val=LOOGS1_knn.Xs_val,
                        ys_val=LOOGS1_knn.ys_val,
                        encoded_df=LOOGS1_knn.encoded_df_)

In [43]:
rf1_roc_auc = LOOGS1_rf.grid_search()

In [44]:
print(max(rf1_roc_auc))

0.7728714678462344


In [45]:
# Report the metrics printed below (generalization error, ROC AUC, recall, accuracy, precision,
# confusion matrix) for the random-forest search on feature set 1.
scores_function(model=LOOGS1_rf, X_train=train1 , X_test=test1, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.005235259864815434
roc_auc_score: 0.7602678571428573
recall_score: 0.7142857142857143
accuracy_score: 0.7713178294573644
precision_score: 0.693069306930693
confusion_matrix: 

[[129  31]
 [ 28  70]]


In [46]:
# Logistic-regression grid search on feature set 1, reusing the splits and encoded frame held by LOOGS1_knn.
LOOGS1_log = LOOGridSearchCV(train_df=processed_train_df, model=log_clf, params_grid=log_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training1, 
                        Xs_train=LOOGS1_knn.Xs_train,
                        ys_train=LOOGS1_knn.ys_train,
                        Xs_val=LOOGS1_knn.Xs_val,
                        ys_val=LOOGS1_knn.ys_val,
                        encoded_df=LOOGS1_knn.encoded_df_)

In [47]:
log1_roc_auc = LOOGS1_log.grid_search()

In [48]:
print(max(log1_roc_auc))

0.7623847758862055


In [49]:
scores_function(model=LOOGS1_log, X_train=train1 , X_test=test1, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.016438382418184627
roc_auc_score: 0.7469387755102042
recall_score: 0.6938775510204082
accuracy_score: 0.7596899224806202
precision_score: 0.68
confusion_matrix: 

[[128  32]
 [ 30  68]]


**second set of chosen features**

In [50]:
# Feature set 2: drop the raw and log-transformed numerics and every age value, leaving the
# scaled salary / expenses / offer value plus the age-missing flags.
columns_to_drop_from_training2 = [
    'salary',
    'age_knn',
    'log_age_knn',
    'age',
    'estimated_expenses_knn',
    'offer_value_knn',
    'log_estimated_expenses_knn',
    'log_offer_value_knn',
    'log_salary',
    'scaled_age_knn',
]

In [51]:
# Columns that remain once drop list 2 is applied.
set(all_train_features) - set(columns_to_drop_from_training2)

{'cc_len',
 'cc_startswith',
 'center',
 'customer_type',
 'emails',
 'gender',
 'nan_age',
 'not_nan_age',
 'offer_class',
 'phone_calls',
 'scaled_estimated_expenses_knn',
 'scaled_offer_value_knn',
 'scaled_salary',
 'target'}

In [52]:
# KNN grid search on feature set 2; the later set-2 searches reuse this object's splits and encoded frame.
LOOGS2_knn = LOOGridSearchCV(train_df=processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training2)

In [53]:
knn2_roc_auc = LOOGS2_knn.grid_search()

In [54]:
test2 = test_df_encoded_ohemails[LOOGS2_knn.processed_train().columns.values]

In [55]:
train2 = LOOGS2_knn.processed_train()

In [56]:
print(max(knn2_roc_auc))

0.6735844092208465


In [57]:
# Decision-tree grid search on feature set 2, reusing the splits and encoded frame held by LOOGS2_knn.
LOOGS2_dt = LOOGridSearchCV(train_df=processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training2, 
                        Xs_train=LOOGS2_knn.Xs_train,
                        ys_train=LOOGS2_knn.ys_train,
                        Xs_val=LOOGS2_knn.Xs_val,
                        ys_val=LOOGS2_knn.ys_val,
                        encoded_df=LOOGS2_knn.encoded_df_)

In [58]:
dt2_roc_auc = LOOGS2_dt.grid_search()

In [59]:
print(max(dt2_roc_auc))

0.711391678946292


In [60]:
# Random-forest grid search on feature set 2, reusing the splits and encoded frame held by LOOGS2_knn.
LOOGS2_rf = LOOGridSearchCV(train_df=processed_train_df, model=rf_clf, params_grid=rf_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training2, 
                        Xs_train=LOOGS2_knn.Xs_train,
                        ys_train=LOOGS2_knn.ys_train,
                        Xs_val=LOOGS2_knn.Xs_val,
                        ys_val=LOOGS2_knn.ys_val,
                        encoded_df=LOOGS2_knn.encoded_df_)

In [61]:
rf2_roc_auc = LOOGS2_rf.grid_search()

In [62]:
print(max(rf2_roc_auc))

0.7728870676399151


In [63]:
scores_function(model=LOOGS2_rf, X_train=train2, X_test=test2, y_train=y_train , y_test=y_test,  mode='loo')

generalization error: 0.06879917727960749
roc_auc_score: 0.7489158163265306
recall_score: 0.7040816326530612
accuracy_score: 0.7596899224806202
precision_score: 0.6764705882352942
confusion_matrix: 

[[127  33]
 [ 29  69]]


In [64]:
# Logistic-regression grid search on feature set 2, reusing the splits and encoded frame held by LOOGS2_knn.
LOOGS2_log = LOOGridSearchCV(train_df=processed_train_df, model=log_clf, params_grid=log_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training2, 
                        Xs_train=LOOGS2_knn.Xs_train,
                        ys_train=LOOGS2_knn.ys_train,
                        Xs_val=LOOGS2_knn.Xs_val,
                        ys_val=LOOGS2_knn.ys_val,
                        encoded_df=LOOGS2_knn.encoded_df_)

In [65]:
log2_roc_auc = LOOGS2_log.grid_search()

In [66]:
print(max(log2_roc_auc))

0.7608821854533825


In [67]:
scores_function(model=LOOGS2_log, X_train=train2, X_test=test2, y_train=y_train , y_test=y_test,  mode='loo')

generalization error: 0.005073292904118176
roc_auc_score: 0.7563137755102041
recall_score: 0.6938775510204082
accuracy_score: 0.7713178294573644
precision_score: 0.7010309278350515
confusion_matrix: 

[[131  29]
 [ 30  68]]


**third set of chosen features**

In [68]:
# Untransformed numerical columns whose variances are inspected in the next cell.
original_numerical = [
    'salary',
    'estimated_expenses_knn',
    'offer_value_knn',
    'age_knn',
]

In [69]:
# Per-column variance of the untransformed numerical features.
processed_train_df[original_numerical].var()

salary                    2.637465e+07
estimated_expenses_knn    3.631811e+06
offer_value_knn           3.352732e+03
age_knn                   1.161376e+02
dtype: float64

In [70]:
# Feature set 3: mixed transforms — keeps log salary, log estimated expenses and the scaled
# offer value, plus the age-missing flags; every age value is dropped.
columns_to_drop_from_training3 = [
    'salary',
    'age_knn',
    'log_age_knn',
    'age',
    'estimated_expenses_knn',
    'offer_value_knn',
    'scaled_estimated_expenses_knn',
    'log_offer_value_knn',
    'scaled_salary',
    'scaled_age_knn',
]

In [71]:
# Columns that remain once drop list 3 is applied.
set(all_train_features) - set(columns_to_drop_from_training3)

{'cc_len',
 'cc_startswith',
 'center',
 'customer_type',
 'emails',
 'gender',
 'log_estimated_expenses_knn',
 'log_salary',
 'nan_age',
 'not_nan_age',
 'offer_class',
 'phone_calls',
 'scaled_offer_value_knn',
 'target'}

In [72]:
# KNN grid search on feature set 3; the later set-3 searches reuse this object's splits and encoded frame.
LOOGS3_knn = LOOGridSearchCV(train_df=processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training3)

In [73]:
knn3_roc_auc = LOOGS3_knn.grid_search()

In [74]:
test3 = test_df_encoded_ohemails[LOOGS3_knn.processed_train().columns.values]

In [75]:
train3 = LOOGS3_knn.processed_train()

In [76]:
print(max(knn3_roc_auc))

0.6975377845364383


In [77]:
# Decision-tree grid search on feature set 3, reusing the splits and encoded frame held by LOOGS3_knn.
LOOGS3_dt = LOOGridSearchCV(train_df=processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training3, 
                        Xs_train=LOOGS3_knn.Xs_train,
                        ys_train=LOOGS3_knn.ys_train,
                        Xs_val=LOOGS3_knn.Xs_val,
                        ys_val=LOOGS3_knn.ys_val,
                        encoded_df=LOOGS3_knn.encoded_df_)

In [78]:
dt3_roc_auc = LOOGS3_dt.grid_search()

In [79]:
print(max(dt3_roc_auc))

0.711391678946292


In [80]:
# Random-forest grid search on feature set 3, reusing the splits and encoded frame held by LOOGS3_knn.
LOOGS3_rf = LOOGridSearchCV(train_df=processed_train_df, model=rf_clf, params_grid=rf_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training3, 
                        Xs_train=LOOGS3_knn.Xs_train,
                        ys_train=LOOGS3_knn.ys_train,
                        Xs_val=LOOGS3_knn.Xs_val,
                        ys_val=LOOGS3_knn.ys_val,
                        encoded_df=LOOGS3_knn.encoded_df_)

In [81]:
rf3_roc_auc = LOOGS3_rf.grid_search()

In [82]:
print(max(rf3_roc_auc))

0.7728870676399151


In [83]:
scores_function(model=LOOGS3_rf, X_train=train3, X_test=test3, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.06879917727960749
roc_auc_score: 0.7489158163265306
recall_score: 0.7040816326530612
accuracy_score: 0.7596899224806202
precision_score: 0.6764705882352942
confusion_matrix: 

[[127  33]
 [ 29  69]]


In [84]:
# Logistic-regression grid search on feature set 3, reusing the splits and encoded frame held by LOOGS3_knn.
LOOGS3_log = LOOGridSearchCV(train_df=processed_train_df, model=log_clf, params_grid=log_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training3, 
                        Xs_train=LOOGS3_knn.Xs_train,
                        ys_train=LOOGS3_knn.ys_train,
                        Xs_val=LOOGS3_knn.Xs_val,
                        ys_val=LOOGS3_knn.ys_val,
                        encoded_df=LOOGS3_knn.encoded_df_)

In [85]:
log3_roc_auc = LOOGS3_log.grid_search()

In [86]:
print(max(log3_roc_auc))

0.7644200466144924


In [87]:
scores_function(model=LOOGS3_log, X_train=train3, X_test=test3, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.008778796844824965
roc_auc_score: 0.7551658163265306
recall_score: 0.7040816326530612
accuracy_score: 0.7674418604651163
precision_score: 0.69
confusion_matrix: 

[[129  31]
 [ 29  69]]


**fourth set of chosen features**

In [88]:
# Reminder of every available column before choosing the next drop list.
all_train_features

array(['cc_len', 'cc_startswith', 'log_salary', 'log_age_knn',
       'log_estimated_expenses_knn', 'log_offer_value_knn',
       'scaled_salary', 'scaled_age_knn', 'scaled_estimated_expenses_knn',
       'scaled_offer_value_knn', 'nan_age', 'not_nan_age', 'target',
       'offer_class', 'gender', 'age', 'phone_calls', 'emails', 'salary',
       'customer_type', 'center', 'age_knn', 'estimated_expenses_knn',
       'offer_value_knn'], dtype=object)

In [89]:
# 'center' is dropped from the features in the next cell, so it no longer needs encoding.
# Rebind to a filtered copy instead of calling .remove(): re-running the cell no longer
# raises ValueError, and objects created earlier that still hold the original list are
# left untouched.
columns_to_encode = [column for column in columns_to_encode if column != 'center']

In [90]:
# Feature set 4: keep only the scaled numerics (including scaled age); drop the raw and
# log-transformed numerics, the age-missing flags and `center`.
columns_to_drop_from_training4 = [
    'salary',
    'age',
    'center',
    'age_knn',
    'estimated_expenses_knn',
    'offer_value_knn',
    'log_age_knn',
    'log_estimated_expenses_knn',
    'log_offer_value_knn',
    'log_salary',
    'nan_age',
    'not_nan_age',
]

In [91]:
# Columns that remain once drop list 4 is applied.
set(all_train_features) - set(columns_to_drop_from_training4)

{'cc_len',
 'cc_startswith',
 'customer_type',
 'emails',
 'gender',
 'offer_class',
 'phone_calls',
 'scaled_age_knn',
 'scaled_estimated_expenses_knn',
 'scaled_offer_value_knn',
 'scaled_salary',
 'target'}

In [92]:
# KNN grid search on feature set 4; the later set-4 searches reuse this object's splits and encoded frame.
LOOGS4_knn = LOOGridSearchCV(train_df=processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training4)

In [93]:
knn4_roc_auc = LOOGS4_knn.grid_search()

In [94]:
test4 = test_df_encoded_ohemails[LOOGS4_knn.processed_train().columns.values]

In [95]:
train4 = LOOGS4_knn.processed_train()

In [96]:
print(max(knn4_roc_auc))

0.6551277692792656


In [97]:
# Decision-tree grid search on feature set 4, reusing the splits and encoded frame held by LOOGS4_knn.
LOOGS4_dt = LOOGridSearchCV(train_df=processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training4, 
                        Xs_train=LOOGS4_knn.Xs_train,
                        ys_train=LOOGS4_knn.ys_train,
                        Xs_val=LOOGS4_knn.Xs_val,
                        ys_val=LOOGS4_knn.ys_val,
                        encoded_df=LOOGS4_knn.encoded_df_)

In [98]:
dt4_roc_auc = LOOGS4_dt.grid_search()

In [99]:
print(max(dt4_roc_auc))

0.7584170776445248


In [100]:
scores_function(model=LOOGS4_dt, X_train=train4, X_test=test4, y_train=y_train , y_test=y_test, mode='loo')

generalization error: -0.009129072824782147
roc_auc_score: 0.762563775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7790697674418605
precision_score: 0.7157894736842105
confusion_matrix: 

[[133  27]
 [ 30  68]]


In [101]:
# Random-forest grid search on feature set 4, reusing the splits and encoded frame held by LOOGS4_knn.
LOOGS4_rf = LOOGridSearchCV(train_df=processed_train_df, model=rf_clf, params_grid=rf_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training4, 
                        Xs_train=LOOGS4_knn.Xs_train,
                        ys_train=LOOGS4_knn.ys_train,
                        Xs_val=LOOGS4_knn.Xs_val,
                        ys_val=LOOGS4_knn.ys_val,
                        encoded_df=LOOGS4_knn.encoded_df_)

In [102]:
rf4_roc_auc = LOOGS4_rf.grid_search()

In [103]:
print(max(rf4_roc_auc))

0.7724817315738017


In [104]:
scores_function(model=LOOGS4_rf, X_train=train4, X_test=test4, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.02083815863301841
roc_auc_score: 0.7656887755102041
recall_score: 0.6938775510204082
accuracy_score: 0.7829457364341085
precision_score: 0.723404255319149
confusion_matrix: 

[[134  26]
 [ 30  68]]


In [105]:
# Logistic-regression grid search on feature set 4, reusing the splits and encoded frame held by LOOGS4_knn.
LOOGS4_log = LOOGridSearchCV(train_df=processed_train_df, model=log_clf, params_grid=log_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training4, 
                        Xs_train=LOOGS4_knn.Xs_train,
                        ys_train=LOOGS4_knn.ys_train,
                        Xs_val=LOOGS4_knn.Xs_val,
                        ys_val=LOOGS4_knn.ys_val,
                        encoded_df=LOOGS4_knn.encoded_df_)

In [106]:
log4_roc_auc = LOOGS4_log.grid_search()

In [107]:
print(max(log4_roc_auc))

0.7651390877477777


In [108]:
scores_function(model=LOOGS4_log, X_train=train4, X_test=test4, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.003157132418184716
roc_auc_score: 0.7656887755102041
recall_score: 0.6938775510204082
accuracy_score: 0.7829457364341085
precision_score: 0.723404255319149
confusion_matrix: 

[[134  26]
 [ 30  68]]


**fifth set of chosen features**

In [109]:
# Feature set 5: the set-4 drop list plus the two `cc_*` columns.
columns_to_drop_from_training5 = [
    'salary',
    'age',
    'center',
    'age_knn',
    'estimated_expenses_knn',
    'offer_value_knn',
    'log_age_knn',
    'log_estimated_expenses_knn',
    'log_offer_value_knn',
    'log_salary',
    'nan_age',
    'not_nan_age',
    'cc_len',
    'cc_startswith',
]

In [110]:
# 'cc_len' is dropped from feature set 5, so it no longer needs encoding.
# Rebinding to a filtered copy (instead of .remove()) keeps the cell safe to re-run and
# leaves lists captured by earlier objects unchanged.
columns_to_encode = [column for column in columns_to_encode if column != 'cc_len']

In [111]:
# 'cc_startswith' is dropped from feature set 5, so it no longer needs encoding.
# Rebinding to a filtered copy (instead of .remove()) keeps the cell safe to re-run and
# leaves lists captured by earlier objects unchanged.
columns_to_encode = [column for column in columns_to_encode if column != 'cc_startswith']

In [112]:
# Columns that remain once drop list 5 is applied.
set(all_train_features) - set(columns_to_drop_from_training5)

{'customer_type',
 'emails',
 'gender',
 'offer_class',
 'phone_calls',
 'scaled_age_knn',
 'scaled_estimated_expenses_knn',
 'scaled_offer_value_knn',
 'scaled_salary',
 'target'}

In [113]:
# KNN grid search on feature set 5; the later set-5 searches reuse this object's splits and encoded frame.
LOOGS5_knn = LOOGridSearchCV(train_df=processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training5)

In [114]:
knn5_roc_auc = LOOGS5_knn.grid_search()

In [115]:
test5 = test_df_encoded_ohemails[LOOGS5_knn.processed_train().columns.values]

In [116]:
train5 = LOOGS5_knn.processed_train()

In [117]:
print(max(knn5_roc_auc))

0.6598613283201422


In [118]:
# Decision-tree grid search on feature set 5, reusing the splits and encoded frame held by LOOGS5_knn.
LOOGS5_dt = LOOGridSearchCV(train_df=processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training5, 
                        Xs_train=LOOGS5_knn.Xs_train,
                        ys_train=LOOGS5_knn.ys_train,
                        Xs_val=LOOGS5_knn.Xs_val,
                        ys_val=LOOGS5_knn.ys_val,
                        encoded_df=LOOGS5_knn.encoded_df_)

In [119]:
dt5_roc_auc = LOOGS5_dt.grid_search()

In [120]:
print(max(dt5_roc_auc))

0.7584170776445248


In [121]:
# Random-forest grid search on feature set 5, reusing the splits and encoded frame held by LOOGS5_knn.
LOOGS5_rf = LOOGridSearchCV(train_df=processed_train_df, model=rf_clf, params_grid=rf_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training5, 
                        Xs_train=LOOGS5_knn.Xs_train,
                        ys_train=LOOGS5_knn.ys_train,
                        Xs_val=LOOGS5_knn.Xs_val,
                        ys_val=LOOGS5_knn.ys_val,
                        encoded_df=LOOGS5_knn.encoded_df_)

In [122]:
rf5_roc_auc = LOOGS5_rf.grid_search()

In [123]:
print(max(rf5_roc_auc))

0.7776453425093461


In [124]:
scores_function(model=LOOGS5_rf, X_train=train5, X_test=test5, y_train=y_train , y_test=y_test, mode='loo')

generalization error: -0.0003994532595645861
roc_auc_score: 0.771938775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7906976744186046
precision_score: 0.7391304347826086
confusion_matrix: 

[[136  24]
 [ 30  68]]


In [125]:
# Logistic-regression grid search on feature set 5, reusing the splits and encoded frame held by LOOGS5_knn.
LOOGS5_log = LOOGridSearchCV(train_df=processed_train_df, model=log_clf, params_grid=log_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training5, 
                        Xs_train=LOOGS5_knn.Xs_train,
                        ys_train=LOOGS5_knn.ys_train,
                        Xs_val=LOOGS5_knn.Xs_val,
                        ys_val=LOOGS5_knn.ys_val,
                        encoded_df=LOOGS5_knn.encoded_df_)

In [126]:
log5_roc_auc = LOOGS5_log.grid_search()

In [127]:
print(max(log5_roc_auc))

0.7627804463891363


In [128]:
scores_function(model=LOOGS5_log, X_train=train5, X_test=test5, y_train=y_train , y_test=y_test, mode='loo')

generalization error: 0.0026726587374078514
roc_auc_score: 0.7707908163265307
recall_score: 0.7040816326530612
accuracy_score: 0.7868217054263565
precision_score: 0.7263157894736842
confusion_matrix: 

[[134  26]
 [ 29  69]]


**sixth set of chosen features**

In [129]:
# Feature set 6: same drop list as feature set 5; what changes in this section is that
# `phone_calls` and `emails` are used as plain integer columns.
columns_to_drop_from_training6 = [
    'salary',
    'age',
    'center',
    'age_knn',
    'estimated_expenses_knn',
    'offer_value_knn',
    'log_age_knn',
    'log_estimated_expenses_knn',
    'log_offer_value_knn',
    'log_salary',
    'nan_age',
    'not_nan_age',
    'cc_len',
    'cc_startswith',
]

In [130]:
# Columns that remain once drop list 6 is applied.
set(all_train_features) - set(columns_to_drop_from_training6)

{'customer_type',
 'emails',
 'gender',
 'offer_class',
 'phone_calls',
 'scaled_age_knn',
 'scaled_estimated_expenses_knn',
 'scaled_offer_value_knn',
 'scaled_salary',
 'target'}

In [131]:
# 'phone_calls' is used as a plain integer column in feature set 6, so it is no longer encoded.
# Rebinding to a filtered copy (instead of .remove()) keeps the cell safe to re-run and
# leaves lists captured by earlier objects unchanged.
columns_to_encode = [column for column in columns_to_encode if column != 'phone_calls']

Convert `phone_calls` and `emails` back to numerical (integer) columns.

In [132]:
# Work on a deep copy so that processed_train_df itself is not modified by the integer casts below.
num_processed_train_df = processed_train_df.copy(deep=True)

In [133]:
# Cast both count columns to integers in a single call.
num_processed_train_df = num_processed_train_df.astype({'phone_calls': int, 'emails': int})

In [134]:
# KNN grid search on feature set 6, built from the integer-cast training frame and with
# ohe_emails=False (emails are presumably kept as one numeric column rather than one-hot
# encoded — confirm in LOOGridSearchCV). The later set-6 searches reuse this object's splits.
LOOGS6_knn = LOOGridSearchCV(train_df=num_processed_train_df, model=knn_clf, params_grid=knn_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training6,
                        ohe_emails=False)

In [135]:
knn6_roc_auc = LOOGS6_knn.grid_search()

In [136]:
# Select from the numeric-emails test frame exactly the columns of the processed training frame, in the same order.
test6 = test_df_encoded_numemails[LOOGS6_knn.processed_train().columns.values]

In [137]:
# Cast phone_calls to int on a new frame rather than assigning into `test6`, which is a
# column selection of test_df_encoded_numemails; this avoids pandas' chained-assignment
# (SettingWithCopy) pitfall.
# NOTE(review): test_df_encoded was produced while 'phone_calls' was still listed in
# columns_to_encode. If the encoder overwrote that column with encoded floats, this cast
# truncates them — confirm that the test column still holds raw call counts.
test6 = test6.assign(phone_calls=test6['phone_calls'].astype(int))

In [138]:
train6 = LOOGS6_knn.processed_train()

In [139]:
print(max(knn6_roc_auc))

0.6707671928402009


In [140]:
# Decision-tree grid search on feature set 6, reusing the splits and encoded frame held by LOOGS6_knn.
LOOGS6_dt = LOOGridSearchCV(train_df=num_processed_train_df, model=dt_clf, params_grid=dt_param_grid, 
                        columns_to_encode=columns_to_encode,
                        columns_to_drop_from_training=columns_to_drop_from_training6, 
                        Xs_train=LOOGS6_knn.Xs_train,
                        ys_train=LOOGS6_knn.ys_train,
                        Xs_val=LOOGS6_knn.Xs_val,
                        ys_val=LOOGS6_knn.ys_val,
                        encoded_df=LOOGS6_knn.encoded_df_,
                        ohe_emails=False)

In [141]:
dt6_roc_auc = LOOGS6_dt.grid_search()

In [142]:
# Highest value among the scores returned by the decision-tree grid search.
print(max(dt6_roc_auc))

0.7584170776445248


In [143]:
# Print the evaluation metrics of the tuned decision tree (sixth feature set).
scores_function(
    model=LOOGS6_dt,
    X_train=train6,
    X_test=test6,
    y_train=y_train,
    y_test=y_test,
    mode='loo',
)

generalization error: -0.009129072824782147
roc_auc_score: 0.762563775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7790697674418605
precision_score: 0.7157894736842105
confusion_matrix: 

[[133  27]
 [ 30  68]]


In [144]:
# Random-forest grid search on the sixth feature set. The train/validation
# splits and the encoded frame are passed in from LOOGS6_knn.
LOOGS6_rf = LOOGridSearchCV(
    train_df=num_processed_train_df,
    model=rf_clf,
    params_grid=rf_param_grid,
    columns_to_encode=columns_to_encode,
    columns_to_drop_from_training=columns_to_drop_from_training6,
    Xs_train=LOOGS6_knn.Xs_train,
    ys_train=LOOGS6_knn.ys_train,
    Xs_val=LOOGS6_knn.Xs_val,
    ys_val=LOOGS6_knn.ys_val,
    encoded_df=LOOGS6_knn.encoded_df_,
    ohe_emails=False,
)

In [145]:
# Run the random-forest grid search; the returned scores are kept so their maximum can be printed below.
rf6_roc_auc = LOOGS6_rf.grid_search()

In [146]:
# Highest value among the scores returned by the random-forest grid search.
print(max(rf6_roc_auc))

0.7750538175130245


In [147]:
# Print the evaluation metrics of the tuned random forest (sixth feature set).
scores_function(
    model=LOOGS6_rf,
    X_train=train6,
    X_test=test6,
    y_train=y_train,
    y_test=y_test,
    mode='loo',
)

generalization error: 0.08376868410407645
roc_auc_score: 0.7730867346938776
recall_score: 0.6836734693877551
accuracy_score: 0.7945736434108527
precision_score: 0.7528089887640449
confusion_matrix: 

[[138  22]
 [ 31  67]]


In [148]:
# Logistic-regression grid search on the sixth feature set. The train/validation
# splits and the encoded frame are passed in from LOOGS6_knn.
LOOGS6_log = LOOGridSearchCV(
    train_df=num_processed_train_df,
    model=log_clf,
    params_grid=log_param_grid,
    columns_to_encode=columns_to_encode,
    columns_to_drop_from_training=columns_to_drop_from_training6,
    Xs_train=LOOGS6_knn.Xs_train,
    ys_train=LOOGS6_knn.ys_train,
    Xs_val=LOOGS6_knn.Xs_val,
    ys_val=LOOGS6_knn.ys_val,
    encoded_df=LOOGS6_knn.encoded_df_,
    ohe_emails=False,
)

In [149]:
# Run the logistic-regression grid search; the returned scores are kept so their maximum can be printed below.
log6_roc_auc = LOOGS6_log.grid_search()

In [150]:
# Highest value among the scores returned by the logistic-regression grid search.
print(max(log6_roc_auc))

0.7632253025639057


In [151]:
# Print the evaluation metrics of the tuned logistic regression (sixth feature set).
scores_function(
    model=LOOGS6_log,
    X_train=train6,
    X_test=test6,
    y_train=y_train,
    y_test=y_test,
    mode='loo',
)

generalization error: 0.0034278113419280443
roc_auc_score: 0.7574617346938776
recall_score: 0.6836734693877551
accuracy_score: 0.7751937984496124
precision_score: 0.7127659574468085
confusion_matrix: 

[[133  27]
 [ 31  67]]


**seventh set of chosen features**

In [152]:
# Columns kept for the seventh experiment (order defines the column order of the frames below).
columns_to_use_for_training7 = [
    'customer_type',
    'gender',
    'emails',
    'phone_calls',
    'cc_startswith',
    'cc_len',
    'scaled_salary',
    'offer_class',
]

In [153]:
# Training features for the seventh experiment. The explicit .copy() makes this
# an independent frame, so later column assignments never target a slice of
# processed_train_df.
features_set_seventh = processed_train_df[columns_to_use_for_training7].copy()

In [154]:
# Test features for the seventh experiment. The explicit .copy() makes this an
# independent frame, so later column assignments never target a slice of
# processed_test_df.
test7 = processed_test_df[columns_to_use_for_training7].copy()

In [155]:
# Cast the contact counts to int with a single astype mapping and rebind the
# name, instead of assigning column-by-column into a frame sliced from
# processed_train_df (chained-assignment pattern).
features_set_seventh = features_set_seventh.astype({'phone_calls': int, 'emails': int})

In [156]:
# Cast the contact counts to int with a single astype mapping and rebind the
# name, instead of assigning column-by-column into a frame sliced from
# processed_test_df (chained-assignment pattern).
test7 = test7.astype({'phone_calls': int, 'emails': int})

Convert `phone_calls` and `emails` back to numerical and one-hot encode the categorical columns.

In [157]:
# One-hot encode the categorical columns of the test features.
# NOTE(review): the test frame is encoded independently of the training frame,
# so the resulting columns match only if both contain the same category levels.
test7 = pd.get_dummies(test7)

In [158]:
# One-hot encode the categorical columns of the training features.
features_set_seventh = pd.get_dummies(features_set_seventh)
# Train and test were encoded independently; force the (already encoded) test
# frame onto the training columns so a category level missing from one side
# cannot shift or drop model inputs. Missing dummy columns are filled with 0.
test7 = test7.reindex(columns=features_set_seventh.columns, fill_value=0)

In [159]:
# Scorer handed to every GridSearchCV below.
# NOTE(review): with default arguments make_scorer(roc_auc_score) scores the
# hard predict() labels, which for a binary target equals balanced accuracy
# rather than a ranking AUC (the printed test metrics agree: roc_auc_score is
# the mean of recall and specificity). scoring='roc_auc' would use predicted
# probabilities / decision scores instead - confirm which metric is intended.
roc_auc_scorer = make_scorer(roc_auc_score)

In [160]:
# 5-fold grid search over the KNN parameter grid, scored with roc_auc_scorer.
knn7 = GridSearchCV(
    estimator=knn_clf,
    param_grid=knn_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [161]:
# Fit the KNN search on the seventh feature set.
knn7.fit(X=features_set_seventh, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'metric': ['minkowski'],
                         'n_neighbors': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'p': [1, 2, 3, 4],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(roc_auc_score), verbose=0)

In [162]:
# Highest mean cross-validated score over the KNN parameter grid.
knn7.cv_results_['mean_test_score'].max()

0.7491887805028064

In [163]:
# Print the evaluation metrics of the tuned KNN (seventh feature set).
scores_function(
    model=knn7,
    X_train=features_set_seventh,
    X_test=test7,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.16847455014614532
roc_auc_score: 0.7477678571428572
recall_score: 0.7142857142857143
accuracy_score: 0.7558139534883721
precision_score: 0.6666666666666666
confusion_matrix: 

[[125  35]
 [ 28  70]]


In [164]:
# 5-fold grid search over the decision-tree parameter grid, scored with roc_auc_scorer.
dt7 = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [165]:
# Fit the decision-tree search on the seventh feature set.
dt7.fit(X=features_set_seventh, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'max_features': [2, 4, 6, 8],
             

In [166]:
# Highest mean cross-validated score over the decision-tree parameter grid.
dt7.cv_results_['mean_test_score'].max()

0.7581682977014718

In [167]:
# Print the evaluation metrics of the tuned decision tree (seventh feature set).
scores_function(
    model=dt7,
    X_train=features_set_seventh,
    X_test=test7,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: -0.004371639960331852
roc_auc_score: 0.762563775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7790697674418605
precision_score: 0.7157894736842105
confusion_matrix: 

[[133  27]
 [ 30  68]]


In [168]:
# 5-fold grid search over the random-forest parameter grid, scored with roc_auc_scorer.
rf7 = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [169]:
# Fit the random-forest search on the seventh feature set.
rf7.fit(X=features_set_seventh, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [170]:
# Highest mean cross-validated score over the random-forest parameter grid.
rf7.cv_results_['mean_test_score'].max()

0.784362322962206

In [171]:
# Print the evaluation metrics of the tuned random forest (seventh feature set).
scores_function(
    model=rf7,
    X_train=features_set_seventh,
    X_test=test7,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.05817113530194695
roc_auc_score: 0.7785076530612245
recall_score: 0.6632653061224489
accuracy_score: 0.8062015503875969
precision_score: 0.7926829268292683
confusion_matrix: 

[[143  17]
 [ 33  65]]


In [172]:
# 5-fold grid search over the logistic-regression parameter grid, scored with roc_auc_scorer.
log7 = GridSearchCV(
    estimator=log_clf,
    param_grid=log_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [173]:
# Fit the logistic-regression search on the seventh feature set.
log7.fit(X=features_set_seventh, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 1, 2,
                               5, 10],
                         'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(roc_auc_score), verbose=0)

In [174]:
# Highest mean cross-validated score over the logistic-regression parameter grid.
log7.cv_results_['mean_test_score'].max()

0.7658254188911985

In [175]:
# Print the evaluation metrics of the tuned logistic regression (seventh feature set).
scores_function(
    model=log7,
    X_train=features_set_seventh,
    X_test=test7,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.01564807988412764
roc_auc_score: 0.7574617346938776
recall_score: 0.6836734693877551
accuracy_score: 0.7751937984496124
precision_score: 0.7127659574468085
confusion_matrix: 

[[133  27]
 [ 31  67]]


**eighth set of chosen features**

Convert `phone_calls` and `emails` back to numerical and one-hot encode the categorical columns.

In [176]:
# Columns kept for the eighth experiment (order defines the column order of the frames below).
columns_to_use_for_training8 = [
    'customer_type',
    'emails',
    'phone_calls',
    'scaled_salary',
    'gender',
    'scaled_offer_value_knn',
]

In [177]:
# Training features for the eighth experiment. The explicit .copy() makes this
# an independent frame, so later column assignments never target a slice of
# num_processed_train_df.
features_set_eighth = num_processed_train_df[columns_to_use_for_training8].copy()

In [178]:
# Cast the contact counts to int with a single astype mapping and rebind the
# name, instead of assigning column-by-column into a frame sliced from
# num_processed_train_df (chained-assignment pattern). That source frame was
# already cast to int earlier in the notebook, so this mirrors the test-side cast.
features_set_eighth = features_set_eighth.astype({'phone_calls': int, 'emails': int})

In [179]:
# Test features for the eighth experiment. The explicit .copy() makes this an
# independent frame, so later column assignments never target a slice of
# processed_test_df.
test8 = processed_test_df[columns_to_use_for_training8].copy()

In [180]:
# Cast the contact counts to int with a single astype mapping and rebind the
# name, instead of assigning column-by-column into a frame sliced from
# processed_test_df (chained-assignment pattern).
test8 = test8.astype({'phone_calls': int, 'emails': int})

In [181]:
# One-hot encode the categorical columns of the training features.
features_set_eighth = pd.get_dummies(features_set_eighth)

In [182]:
# One-hot encode the test features, then force them onto the columns of the
# already encoded training frame. Train and test are encoded independently, so
# a category level missing from one side would otherwise shift or drop model
# inputs. Missing dummy columns are filled with 0.
test8 = pd.get_dummies(test8).reindex(columns=features_set_eighth.columns, fill_value=0)

In [183]:
# 5-fold grid search over the KNN parameter grid, scored with roc_auc_scorer.
knn8 = GridSearchCV(
    estimator=knn_clf,
    param_grid=knn_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [184]:
# Fit the KNN search on the eighth feature set.
knn8.fit(X=features_set_eighth, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'metric': ['minkowski'],
                         'n_neighbors': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'p': [1, 2, 3, 4],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(roc_auc_score), verbose=0)

In [185]:
# Highest mean cross-validated score over the KNN parameter grid.
knn8.cv_results_['mean_test_score'].max()

0.7630050669685231

In [186]:
# Print the evaluation metrics of the tuned KNN (eighth feature set).
scores_function(
    model=knn8,
    X_train=features_set_eighth,
    X_test=test8,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.2617984693877551
roc_auc_score: 0.7382015306122449
recall_score: 0.6326530612244898
accuracy_score: 0.7635658914728682
precision_score: 0.7126436781609196
confusion_matrix: 

[[135  25]
 [ 36  62]]


In [187]:
# 5-fold grid search over the decision-tree parameter grid, scored with roc_auc_scorer.
dt8 = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [188]:
# Fit the decision-tree search on the eighth feature set.
dt8.fit(X=features_set_eighth, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'max_features': [2, 4, 6, 8],
             

In [189]:
# Highest mean cross-validated score over the decision-tree parameter grid.
dt8.cv_results_['mean_test_score'].max()

0.7581682977014718

In [190]:
# Print the evaluation metrics of the tuned decision tree (eighth feature set).
scores_function(
    model=dt8,
    X_train=features_set_eighth,
    X_test=test8,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: -0.004371639960331852
roc_auc_score: 0.762563775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7790697674418605
precision_score: 0.7157894736842105
confusion_matrix: 

[[133  27]
 [ 30  68]]


In [191]:
# 5-fold grid search over the random-forest parameter grid, scored with roc_auc_scorer.
rf8 = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [192]:
# Fit the random-forest search on the eighth feature set.
rf8.fit(X=features_set_eighth, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [193]:
# Highest mean cross-validated score over the random-forest parameter grid.
rf8.cv_results_['mean_test_score'].max()

0.7649902405697634

In [194]:
# Print the evaluation metrics of the tuned random forest (eighth feature set).
scores_function(
    model=rf8,
    X_train=features_set_eighth,
    X_test=test8,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.011537087661151468
roc_auc_score: 0.762563775510204
recall_score: 0.6938775510204082
accuracy_score: 0.7790697674418605
precision_score: 0.7157894736842105
confusion_matrix: 

[[133  27]
 [ 30  68]]


In [195]:
# 5-fold grid search over the logistic-regression parameter grid, scored with roc_auc_scorer.
log8 = GridSearchCV(
    estimator=log_clf,
    param_grid=log_param_grid,
    scoring=roc_auc_scorer,
    cv=5,
)

In [196]:
# Fit the logistic-regression search on the eighth feature set.
log8.fit(X=features_set_eighth, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 1, 2,
                               5, 10],
                         'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(roc_auc_score), verbose=0)

In [197]:
# Highest mean cross-validated score over the logistic-regression parameter grid.
log8.cv_results_['mean_test_score'].max()

0.7651927278663604

In [198]:
# Print the evaluation metrics of the tuned logistic regression (eighth feature set).
scores_function(
    model=log8,
    X_train=features_set_eighth,
    X_test=test8,
    y_train=y_train,
    y_test=y_test,
    mode='skl',
)

generalization error: 0.0075478560989613586
roc_auc_score: 0.7543367346938775
recall_score: 0.6836734693877551
accuracy_score: 0.7713178294573644
precision_score: 0.7052631578947368
confusion_matrix: 

[[132  28]
 [ 31  67]]
