In [9]:
from sklearn.model_selection import RepeatedStratifiedKFold
from ipynb.fs.full.data_wrangling import * #Data preprocessing
from sklearn.model_selection import cross_val_score
import numpy as np

SCORINGS = [
    'accuracy',
    'f1',
    'precision',
    'recall',
    'roc_auc'
]

cv_splits = 5
repetitions = 3
RANDOM_STATE = 42
cross_validation_setting = RepeatedStratifiedKFold(n_splits=cv_splits,
                                                   n_repeats=repetitions,
                                                   random_state= RANDOM_STATE)

def model_evaluation(model, features, target, 
                     cv = cross_validation_setting):
    scores = dict()
    for score_metric in SCORINGS:
        scores[score_metric] = cross_val_score(model,
                                               features,
                                               target,
                                               scoring = score_metric)
    return scores


    
    
    

In [10]:
"""
Config for every experiment
"""
RANDOM_STATE = 0
VOTING_METHOD = 'soft'


In [11]:
"""
Getting the best weak learners (GridSearchCV section)
"""
from sklearn.model_selection import GridSearchCV
# --------------------------------------------

In [12]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg_grid={'C':[0.001,0.01,.09,1,5,10],
              "penalty":["l1","l2"]} #l1 lasso l2 ridge
lr = LogisticRegression(random_state=RANDOM_STATE)
log_reg_gridsearch = GridSearchCV(lr, param_grid = log_reg_grid,
                             refit = True)
selected_log_reg = log_reg_gridsearch.fit(X,Y).best_estimator_
model_evaluation(selected_log_reg, X, Y)

Traceback (most recent call last):
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of it

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TER

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TER

{'accuracy': array([0.48809524, 0.75      , 0.48015873, 0.48412698, 0.48207171]),
 'f1': array([0.01526718, 0.74074074, 0.        , 0.        , 0.        ]),
 'precision': array([1.        , 0.79646018, 0.        , 0.        , 0.        ]),
 'recall': array([0.00769231, 0.69230769, 0.        , 0.        , 0.        ]),
 'roc_auc': array([0.56333544, 0.79224464, 0.5369483 , 0.52802648, 0.47571519])}

In [13]:
"""
K Neighbors
"""
from sklearn.neighbors import KNeighborsClassifier

k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

knn_grid = dict(n_neighbors = k_range, 
                weights = weight_options)
knn = KNeighborsClassifier()
knn_gridsearch =GridSearchCV(knn, param_grid = knn_grid,
                             refit = True)
selected_knn = knn_gridsearch.fit(X, Y).best_estimator_
model_evaluation(selected_knn, X , Y)

{'accuracy': array([0.68650794, 0.6031746 , 0.63888889, 0.6547619 , 0.62151394]),
 'f1': array([0.72852234, 0.6350365 , 0.66420664, 0.68363636, 0.65703971]),
 'precision': array([0.65838509, 0.60416667, 0.63829787, 0.64827586, 0.61904762]),
 'recall': array([0.81538462, 0.66923077, 0.69230769, 0.72307692, 0.7       ]),
 'roc_auc': array([0.72194199, 0.66040353, 0.65368852, 0.68262926, 0.670089  ])}

In [14]:
"""
Support Vector Machines
"""
from sklearn.svm import SVC

svm_grid = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']} 
  
svm_gridsearch = GridSearchCV(SVC(),
                              param_grid = svm_grid,
                              refit = True)

selected_svm = svm_gridsearch.fit(X,Y).best_estimator_
model_evaluation(selected_svm, X , Y)

{'accuracy': array([0.78968254, 0.74206349, 0.8015873 , 0.72619048, 0.80876494]),
 'f1': array([0.80149813, 0.72803347, 0.8046875 , 0.73359073, 0.80952381]),
 'precision': array([0.7810219 , 0.79816514, 0.81746032, 0.73643411, 0.83606557]),
 'recall': array([0.82307692, 0.66923077, 0.79230769, 0.73076923, 0.78461538]),
 'roc_auc': array([0.81443884, 0.77723834, 0.81759142, 0.75945776, 0.81691036])}

In [15]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_gridsearch = GridSearchCV(estimator=nb, 
                     param_grid = nb_params,
                     refit = True)
selected_nb = nb_gridsearch.fit(X,Y).best_estimator_
model_evaluation(selected_nb, X, Y)


{'accuracy': array([0.51587302, 0.66269841, 0.51587302, 0.51587302, 0.51792829]),
 'f1': array([0.68062827, 0.71760797, 0.68062827, 0.68062827, 0.6824147 ]),
 'precision': array([0.51587302, 0.63157895, 0.51587302, 0.51587302, 0.51792829]),
 'recall': array([1.        , 0.83076923, 1.        , 1.        , 1.        ]),
 'roc_auc': array([0.55551702, 0.76576293, 0.50737705, 0.50138714, 0.4654164 ])}

In [16]:
"""Boosting Algorithms"""
from sklearn.ensemble import AdaBoostClassifier
boosting_models = dict()

In [21]:
#AdaBoost
ab_param_grid = {
    'n_estimators' : [100, 300, 500],
    'learning_rate' : [1e-3, 1e-2, 1e-1, 1]
}
ab_model = AdaBoostClassifier(random_state = RANDOM_STATE)
xgb_gridsearch = GridSearchCV(ab_model,
                              param_grid = ab_param_grid,
                              refit = True)

selected_xgb = xgb_gridsearch.fit(X, Y).best_estimator_
model_evaluation(selected_xgb, X, Y)

{'accuracy': array([0.78968254, 0.74206349, 0.79761905, 0.73015873, 0.812749  ]),
 'f1': array([0.80149813, 0.72803347, 0.8       , 0.73846154, 0.812749  ]),
 'precision': array([0.7810219 , 0.79816514, 0.816     , 0.73846154, 0.84297521]),
 'recall': array([0.82307692, 0.66923077, 0.78461538, 0.73846154, 0.78461538]),
 'roc_auc': array([0.78858764, 0.74445145, 0.7980454 , 0.72988651, 0.8137953 ])}

In [17]:
#XGBoost
import xgboost as xgb

parameters = {
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [200, 400],
            'gamma': [0.01, 0.1, 0.2],
            'min_child_weight': [0, 0.5, 1],
            'max_delta_step': [0],
            'subsample': [0.7, 1],
            'colsample_bytree': [0.6, 1],
            'reg_alpha': [0, 1e-2, 1],
            'reg_lambda': [0, 1e-2, 1],
            }

xgb_model = xgb.XGBClassifier(silent = True,
                              random_state = RANDOM_STATE)

xgb_gridsearch = GridSearchCV(xgb_model,
                              parameters,
                              refit = True)

selected_xgb = xgb_gridsearch.fit(X, Y).best_estimator_
model_evaluation(selected_xgb, X, Y)



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 




Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




{'accuracy': array([0.75      , 0.73412698, 0.75793651, 0.71428571, 0.76095618]),
 'f1': array([0.76045627, 0.7372549 , 0.76264591, 0.72307692, 0.75206612]),
 'precision': array([0.7518797 , 0.752     , 0.77165354, 0.72307692, 0.8125    ]),
 'recall': array([0.76923077, 0.72307692, 0.75384615, 0.72307692, 0.7       ]),
 'roc_auc': array([0.81462799, 0.7925599 , 0.81456494, 0.76330391, 0.81805467])}

In [18]:
"""Bagging Algorithms"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_models = dict()
bagging_models['rf'] = RandomForestClassifier(random_state = RANDOM_STATE)
bagging_models['et'] = ExtraTreesClassifier(random_state = RANDOM_STATE)
bagging_models['bdt'] = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                          random_state = RANDOM_STATE)


In [22]:
#RandomForest
rf_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400]
}

rf = RandomForestClassifier(random_state = RANDOM_STATE)
rf_gridsearch = GridSearchCV(rf, param_grid = rf_grid,
                             refit = True)
selected_rf = rf_gridsearch.fit(X, Y).best_estimator_
model_evaluation(selected_rf, X, Y)

{'accuracy': array([0.78174603, 0.75396825, 0.79761905, 0.73809524, 0.80876494]),
 'f1': array([0.79400749, 0.73728814, 0.8030888 , 0.74418605, 0.80645161]),
 'precision': array([0.77372263, 0.82075472, 0.80620155, 0.75      , 0.84745763]),
 'recall': array([0.81538462, 0.66923077, 0.8       , 0.73846154, 0.76923077]),
 'roc_auc': array([0.83947037, 0.78997478, 0.82509458, 0.79123581, 0.8640178 ])}

In [None]:
#Extra Trees

et_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400],
    'oob_score': [True, False]
}

In [19]:
"""Stacking Ensemble"""
from sklearn.ensemble import StackingClassifier

stacking_estimators = [
    ('lr', selected_lr),
    ('knn', selected_knn),
    ('svm', selected_svm),
    ('gnb', selected_nb),
    ('dt', selected_dt)
]

final_estimator = selected_lr

stacking_model = StackingClassifier(estimators = stacking_estimators,
                                    final_estimator = final_estimator)



In [20]:
"""Voting Ensemble"""
from sklearn.ensemble import VotingClassifier

voting_estimators = [
    ('lr', LogisticRegression(random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC(random_state=RANDOM_STATE)),
    ('gnb', GaussianNB()),
    ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE))
]

voting_classifier = VotingClassifier(voting_estimators,
                                     voting=VOTING_METHOD)