In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from ipynb.fs.full.data_wrangling import * #Data preprocessing notebook
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import statistics
from itertools import chain
from collections import Counter


SCORINGS = {
    'f1': make_scorer(f1_score, average = None),
    'precision': make_scorer(precision_score, average = None),
    'recall': make_scorer(recall_score, average = None),
    'roc_auc': make_scorer(roc_auc_score, average = None)
}

cv_splits = 10
repetitions = 1
RANDOM_STATE = 42
cross_validation_setting = RepeatedStratifiedKFold(n_splits=cv_splits,
                                                   n_repeats=repetitions,
                                                   random_state= RANDOM_STATE)
        
def cross_validate(model, X_train, X_test, y_train, y_test, metric):
    for index in range(len(X_train)):
        model.fit(X_train, y_train)
        Y_pred = model.predict(X_test)


def model_evaluation(model, features, target, 
                     cv = cross_validation_setting):
    scores = dict()
    formatted_scores = dict()
    formatted_scores['model'] = model
    for scoring_name, scoring_function in SCORINGS.items():
        scores[score_metric] = cross_validate(model, X, Y, 
                                              scoring = scoring_function,
                                              cv = cross_validation_setting)
        return scores[score_metric]
#         formatted_scores[score_metric + " score"] = scores[score_metric].mean()
#         formatted_scores[score_metric + " std"] = scores[score_metric].std()
#     return formatted_scores

def get_scores(Y_pred, Y_true):
    f1 = f1_score(Y_true, Y_pred, average=None)
    precision = precision_score(Y_true, Y_pred,
                                average = None)
    recall = recall_score(Y_true, Y_pred, 
                          average = None)
    roc = roc_auc_score(Y_true, Y_pred, 
                        average = None)
    acc = accuracy_score(Y_true, Y_pred)
    
    return f1, precision, recall, roc, acc

def convert_df(X, Y):
    return X.to_numpy(), Y.to_numpy()

def fault_cases(predictions, answers, indexes):
    failed_cases = []
    for i in range(len(predictions)):
        if (predictions[i] != answers[i]):
            failed_cases.append(indexes[i])
    return failed_cases
    
def evaluate_model(model_name, model, X, Y, sk_fold):
    X, Y = convert_df(X, Y)
    f1_list, precision_list, recall_list, auc_list, acc_list = [], [], [], [], []
    
    for train_index, test_index in sk_fold.split(X, Y):
        
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        model.fit(X_train, Y_train)
        
        failed_cases_index = []
        Y_pred = model.predict(X_test)
        failed_cases_index.append(fault_cases(Y_pred, Y_test, test_index))
    
        f1, precision, recall, roc, acc = get_scores(Y_pred, Y_test)
        f1_list.append(f1)
        precision_list.append(precision)
        recall_list.append(recall)
        auc_list.append(roc)
        acc_list.append(acc)
        
    return format_return(model_name, f1_list, precision_list,
                         recall_list, auc_list, acc_list), failed_cases_index[0]

COLUMNS = ["Model Name", "F1 Average", "F1 Class 0", "F1 Class 1",
           "Recall Average", "Recall Class 0", "Recall Class 1",
           "Precision Average", "Precision Class 0", "Precision Class 1",
           "AUC", "Accuracy"]

def format_return(model_name, f1_list, precision_list,
                  recall_list, auc_list, acc_list):
    dataframe = pd.DataFrame(columns = COLUMNS)
    dataframe_line = []
    dataframe_line.append(model_name)
    aux = []
    lists_of_score_list = []
    lists_of_score_list.append(f1_list)
    lists_of_score_list.append(precision_list)
    lists_of_score_list.append(recall_list)
    # auc_list
    for score_list in lists_of_score_list:
        dataframe_line.extend((statistics.mean(flatten_list(score_list)),
                               statistics.mean([score[0] for score in score_list]),
                               statistics.mean([score[1] for score in score_list])))
    dataframe_line.append(statistics.mean(auc_list))
    dataframe_line.append(statistics.mean(acc_list))
    return dataframe.append(pd.Series(dataframe_line, index = COLUMNS),
                            ignore_index = True)

def flatten_list(lista):
    return [value for sublist in lista for value in sublist]
    
    

    
        
        
        


    
    
    

In [None]:
"""
Config for every experiment
"""
RANDOM_STATE = 0
VOTING_METHOD = 'hard'
results = pd.DataFrame(columns = COLUMNS)


In [None]:
"""
Getting the best weak learners (GridSearchCV section)
"""
from sklearn.model_selection import GridSearchCV
# --------------------------------------------

In [None]:
"""
K Neighbors
"""
from sklearn.neighbors import KNeighborsClassifier

k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

knn_grid = dict(n_neighbors = k_range, 
                weights = weight_options)
knn = KNeighborsClassifier()
knn_gridsearch = GridSearchCV(knn, param_grid = knn_grid,
                             refit = True)
selected_knn = knn_gridsearch.fit(X, Y).best_estimator_
failed_cases = []
scores, failed = evaluate_model("KNN", selected_knn, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)
#results = results.append(model_evaluation(selected_knn, X , Y), ignore_index = True)

In [None]:
results

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg_grid={'C':[0.001,0.01,.09,1,5,10],
              "penalty":["l1","l2"]} #l1 lasso l2 ridge
lr = LogisticRegression(random_state=RANDOM_STATE)
log_reg_gridsearch = GridSearchCV(lr, param_grid = log_reg_grid,
                             refit = True)
selected_log_reg = log_reg_gridsearch.fit(X,Y).best_estimator_
scores, failed = evaluate_model("Logistic Regression",selected_log_reg, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)

In [None]:
"""
Support Vector Machines
"""
from sklearn.svm import SVC

svm_grid = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']} 
  
svm_gridsearch = GridSearchCV(SVC(),
                              param_grid = svm_grid,
                              refit = True)

selected_svm = svm_gridsearch.fit(X,Y).best_estimator_
scores, failed = evaluate_model("SVM",selected_svm, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
results

In [None]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_gridsearch = GridSearchCV(estimator=nb, 
                     param_grid = nb_params,
                     refit = True)
selected_nb = nb_gridsearch.fit(X,Y).best_estimator_
scores, failed = evaluate_model("Naive Bayes",selected_nb, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
from sklearn.tree import DecisionTreeClassifier
tree_para = {'criterion':['gini','entropy'],
             'max_depth':[4,5,10]}
dt = DecisionTreeClassifier(random_state=0)
tree_gridsearch = GridSearchCV(dt, param_grid = tree_para, cv=5)
selected_dt = tree_gridsearch.fit(X, Y).best_estimator_
scores, failed = evaluate_model("Decision Trees",selected_dt, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
results

In [None]:
"""Boosting Algorithms"""
from sklearn.ensemble import AdaBoostClassifier
boosting_models = dict()

In [None]:
#AdaBoost
ab_param_grid = {
    'n_estimators' : [100, 300, 500],
    'learning_rate' : [1e-3, 1e-2, 1e-1, 1]
}
ab_model = AdaBoostClassifier(random_state = RANDOM_STATE)
ab_gridsearchcv = GridSearchCV(ab_model,
                              param_grid = ab_param_grid,
                              refit = True)

selected_ab = ab_gridsearchcv.fit(X, Y).best_estimator_
scores, failed = evaluate_model("AdaBoost",selected_ab, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
#XGBoost
import xgboost as xgb

parameters = {
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [200, 400],
            'gamma': [0.01, 0.1, 0.2],
            'min_child_weight': [0, 0.5, 1],
            'max_delta_step': [0],
            'subsample': [0.7, 1],
            'colsample_bytree': [0.6, 1],
            'reg_alpha': [0, 1e-2, 1],
            'reg_lambda': [0, 1e-2, 1],
            }

xgb_model = xgb.XGBClassifier(silent = True,
                              random_state = RANDOM_STATE)

xgb_gridsearch = GridSearchCV(xgb_model,
                              parameters,
                              refit = True)

selected_xgb = xgb_gridsearch.fit(X, Y).best_estimator_
scores, failed = evaluate_model("XGB", selected_xgb, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier

rf_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400]
}

rf = RandomForestClassifier(random_state = RANDOM_STATE)
rf_gridsearch = GridSearchCV(rf, param_grid = rf_grid,
                             refit = True)
selected_rf = rf_gridsearch.fit(X, Y).best_estimator_
scores, failed = evaluate_model("RandomForest",selected_rf, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)

In [None]:
#Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

et_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400],
    'oob_score': [True, False]
}

et_clf = ExtraTreesClassifier(random_state = RANDOM_STATE)
et_gridsearch = GridSearchCV(et_clf, param_grid = et_grid,
                             refit = True)
selected_et = et_gridsearch.fit(X, Y).best_estimator_
scores, failed = evaluate_model("ExtraTrees",selected_et, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [None]:
results

In [None]:
#BaggingClassifier(DecisionTree) 
from sklearn.ensemble import BaggingClassifier

bagging_param_grid = {
    'base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

bg_clf = BaggingClassifier(base_estimator=selected_dt,
                           random_state = RANDOM_STATE)
bg_gridsearch = GridSearchCV(bg_clf, param_grid = bagging_param_grid,
                             refit = True)
selected_bg = bg_gridsearch.fit(X, Y).best_estimator_
scores, failed = evaluate_model("Bagging (Decision Trees)",selected_bg, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)

In [263]:
"""Stacking Ensemble"""
from sklearn.ensemble import StackingClassifier

stacking_estimators = [
    ('lr', selected_log_reg),
    ('knn', selected_knn),
    ('svm', selected_svm),
    ('gnb', selected_nb),
    ('dt', selected_dt)
]

final_estimator = selected_svm

stacking_model = StackingClassifier(estimators = stacking_estimators,
                                    final_estimator = final_estimator)

scores, failed = evaluate_model("Stacking (LR, KNN, SVM, NB, DT)",stacking_model, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TER

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TER

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TER

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [264]:
scores

Unnamed: 0,Model Name,F1 Average,F1 Class 0,F1 Class 1,Recall Average,Recall Class 0,Recall Class 1,Precision Average,Precision Class 0,Precision Class 1,AUC,Accuracy
0,"Stacking (LR, KNN, SVM, NB, DT)",0.772531,0.770303,0.774759,0.774158,0.756386,0.791929,0.773265,0.78653,0.76,0.773265,0.7728


In [None]:
"""Voting Ensemble"""
from sklearn.ensemble import VotingClassifier

voting_estimators = [
    ('lr', selected_log_reg),
    ('knn', selected_knn),
    ('svm', selected_svm),
    ('gnb', selected_nb),
    ('dt', selected_dt)
]

voting_classifier = VotingClassifier(voting_estimators,
                                     voting=VOTING_METHOD)
scores, failed = evaluate_model("Voting (LR, KNN, SVM, NB, DT)",voting_classifier, X, Y, cross_validation_setting)
results = results.append(scores)
failed_cases.append(failed)


In [261]:
results

Unnamed: 0,Model Name,F1 Average,F1 Class 0,F1 Class 1,Recall Average,Recall Class 0,Recall Class 1,Precision Average,Precision Class 0,Precision Class 1,AUC,Accuracy
0,KNN,0.611198,0.578264,0.644132,0.615988,0.617013,0.614964,0.612646,0.546831,0.678462,0.612646,0.614787
0,Logistic Regression,0.374852,0.665757,0.083946,0.34779,0.515276,0.180303,0.530063,0.977049,0.083077,0.530063,0.515467
0,SVM,0.772531,0.770303,0.774759,0.774158,0.756386,0.791929,0.773265,0.78653,0.76,0.773265,0.7728
0,Naive Bayes,0.372808,0.058333,0.687283,0.304217,0.08,0.528435,0.517566,0.045902,0.989231,0.517566,0.532952
0,Decision Trees,0.743622,0.731886,0.755359,0.751644,0.749868,0.75342,0.745167,0.72418,0.766154,0.745167,0.745822
0,AdaBoost,0.774186,0.772098,0.776274,0.775681,0.756535,0.794828,0.774918,0.789836,0.76,0.774918,0.7744
0,XGB,0.770169,0.76835,0.771987,0.771977,0.752681,0.791272,0.770971,0.786557,0.755385,0.770971,0.770432
0,RandomForest,0.774923,0.773684,0.776162,0.776725,0.756837,0.796613,0.775788,0.793115,0.758462,0.775788,0.775187
0,ExtraTrees,0.773398,0.772729,0.774066,0.775173,0.7533,0.797046,0.7743,0.794754,0.753846,0.7743,0.773606
0,Bagging (Decision Trees),0.774186,0.772098,0.776274,0.775681,0.756535,0.794828,0.774918,0.789836,0.76,0.774918,0.7744


In [262]:
results.to_csv("resultados1.csv")

In [265]:
failed_cases

[6,
 28,
 44,
 145,
 178,
 190,
 198,
 218,
 230,
 254,
 260,
 263,
 283,
 295,
 321,
 350,
 354,
 393,
 411,
 431,
 481,
 482,
 500,
 516,
 521,
 534,
 541,
 546,
 556,
 585,
 631,
 634,
 637,
 652,
 707,
 913,
 923,
 948,
 955,
 1008,
 1011,
 1060,
 1087,
 1118,
 1190,
 1200,
 1,
 13,
 37,
 79,
 104,
 117,
 145,
 179,
 197,
 198,
 204,
 208,
 218,
 260,
 263,
 276,
 295,
 299,
 307,
 314,
 372,
 374,
 386,
 411,
 418,
 430,
 431,
 469,
 490,
 500,
 516,
 521,
 534,
 538,
 541,
 546,
 554,
 585,
 616,
 631,
 634,
 662,
 670,
 692,
 693,
 705,
 734,
 741,
 754,
 761,
 772,
 838,
 893,
 896,
 908,
 975,
 993,
 999,
 1011,
 1087,
 1118,
 1149,
 1152,
 1244,
 80,
 143,
 145,
 192,
 218,
 254,
 260,
 263,
 350,
 354,
 372,
 374,
 397,
 411,
 431,
 469,
 500,
 511,
 516,
 541,
 554,
 556,
 577,
 612,
 631,
 634,
 705,
 761,
 890,
 908,
 1058,
 1118,
 1152,
 1213,
 6,
 28,
 44,
 80,
 155,
 164,
 171,
 178,
 190,
 192,
 230,
 254,
 283,
 303,
 321,
 349,
 350,
 354,
 393,
 397,
 423,
 473,
 4

In [259]:
failed_cases = flatten_list(failed_cases)
counter = Counter(chain(failed_cases))
counter = sorted(counter.items(), key=lambda x: x[1], reverse = True)

TypeError: 'numpy.int32' object is not iterable