In [53]:
!pip install imbalanced-learn
from imblearn.datasets import fetch_datasets
# X[i] / Y[i] hold the feature matrix / target vector of dataset_names[i].
X, Y = [], []
dataset_names = ['us_crime', 'car_eval_34']
# Alternative benchmark selections that were tried previously:
# dataset_names = ['ecoli', 'optical_digits']
# , 'satimage', 'pen_digits',
#                  'abalone', 'sick_euthyroid', 'spectrometer',
#                  'car_eval_34', 'isolet', 'us_crime', 'yeast_ml8',
#                  'scene']

# Load the benchmark collection once, restricted to the datasets we need,
# instead of calling fetch_datasets() again on every loop iteration.
benchmark_sets = fetch_datasets(filter_data=tuple(dataset_names))
for ds_name in dataset_names:
    var = benchmark_sets[ds_name]
    X.append(var['data'])
    Y.append(var['target'])



In [54]:
from sklearn.model_selection import RepeatedStratifiedKFold
# from ipynb.fs.full.data_wrangling import * #Data preprocessing notebook
#from ipynb.fs.full.data_preparation import *
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import statistics
from itertools import chain
from collections import Counter
from sklearn.model_selection import GridSearchCV
import pandas as pd




# Scorers handed to sklearn's cross-validation helpers, keyed by metric name.
# A scorer has to return a single number per fold, so the per-class metrics are
# macro-averaged here; with average=None, f1/precision/recall return one value
# per class, which the cross-validation machinery cannot aggregate.
SCORINGS = {
    'f1': make_scorer(f1_score, average = 'macro'),
    'precision': make_scorer(precision_score, average = 'macro'),
    'recall': make_scorer(recall_score, average = 'macro'),
    'roc_auc': make_scorer(roc_auc_score, average = 'macro')
}

# Shared cross-validation configuration used as the default splitter below.
cv_splits = 10      # number of folds per repetition
repetitions = 1     # how many times the k-fold split is repeated
# NOTE(review): RANDOM_STATE is reassigned to 0 in a later configuration cell;
# the splitter is constructed here, so it is seeded with 42.
RANDOM_STATE = 42
cross_validation_setting = RepeatedStratifiedKFold(n_splits=cv_splits,
                                                   n_repeats=repetitions,
                                                   random_state= RANDOM_STATE)
        
def cross_validate(model, X_train, X_test, y_train, y_test, metric):
    """Fit ``model`` on the training split and predict the test split.

    The previous implementation re-fitted the model once per training sample
    on exactly the same data and threw the predictions away; the model is now
    fitted a single time and the predictions are returned.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training features.
        X_test: features to predict.
        y_train: training targets.
        y_test: accepted for interface compatibility; not used here.
        metric: accepted for interface compatibility; not used here.

    Returns:
        The output of ``model.predict(X_test)``, or ``None`` when ``X_train``
        is empty (in which case the model is left untouched, as before).

    NOTE(review): this name is the same as sklearn.model_selection's
    ``cross_validate`` but the signature is different — confirm which one
    callers expect.
    """
    if len(X_train) == 0:
        return None
    model.fit(X_train, y_train)
    return model.predict(X_test)


def model_evaluation(model, features, target, 
                     cv = cross_validation_setting):
    """Cross-validate ``model`` with every scorer in ``SCORINGS``.

    Fixes over the previous version: it referenced an undefined name
    (``score_metric``), scored the notebook globals ``X``/``Y`` instead of the
    ``features``/``target`` arguments, ignored ``cv``, returned from inside
    the loop after the first metric, and called the local ``cross_validate``
    helper with keyword arguments it does not accept.

    Args:
        model: unfitted estimator to evaluate.
        features: feature matrix.
        target: target vector.
        cv: cross-validation splitter; defaults to the notebook-wide setting.

    Returns:
        dict with the key ``'model'`` plus, for each scorer name in
        ``SCORINGS``, ``"<name> score"`` (mean over folds) and
        ``"<name> std"`` (standard deviation over folds).

    Each scorer in ``SCORINGS`` must yield a single number per fold.
    """
    formatted_scores = {'model': model}
    for scoring_name, scoring_function in SCORINGS.items():
        fold_scores = cross_val_score(model, features, target,
                                      scoring = scoring_function,
                                      cv = cv)
        formatted_scores[scoring_name + " score"] = fold_scores.mean()
        formatted_scores[scoring_name + " std"] = fold_scores.std()
    return formatted_scores

def get_scores(Y_pred, Y_true):
    """Score one set of predictions against the true labels.

    Mind the argument order: predictions first, ground truth second.

    Returns:
        Tuple ``(f1, precision, recall, roc, acc)``. F1, precision, recall
        and ROC AUC are requested with ``average=None``; accuracy is computed
        with the scorer's defaults.
    """
    unaveraged = {"average": None}
    return (
        f1_score(Y_true, Y_pred, **unaveraged),
        precision_score(Y_true, Y_pred, **unaveraged),
        recall_score(Y_true, Y_pred, **unaveraged),
        roc_auc_score(Y_true, Y_pred, **unaveraged),
        accuracy_score(Y_true, Y_pred),
    )

def convert_df(X, Y):
    """Return ``(X.to_numpy(), Y.to_numpy())``.

    Both arguments must provide a ``to_numpy()`` method.
    """
    features_array = X.to_numpy()
    target_array = Y.to_numpy()
    return features_array, target_array

def fault_cases(predictions, answers, indexes):
    """Return the entries of ``indexes`` where prediction and answer differ.

    The three sequences are read position by position, over the length of
    ``predictions``.
    """
    return [
        indexes[position]
        for position in range(len(predictions))
        if predictions[position] != answers[position]
    ]

def get_best_estimator(model, param_grid, X, Y):
    """Grid-search ``model`` over ``param_grid`` on ``(X, Y)``.

    The search is built with GridSearchCV's default settings and the
    ``best_estimator_`` of the fitted search is returned.
    """
    search = GridSearchCV(model, param_grid)
    fitted_search = search.fit(X, Y)
    return fitted_search.best_estimator_
    
def evaluate_model(dataset_name, models,
                   hyperparameters_grid, X, Y, sk_fold,
                   metric = "F1 Average B"):
    """Tune and cross-validate every model, returning the best one's results.

    For each entry of ``models`` the hyper-parameters are grid-searched, the
    selected estimator is re-fitted on every training fold of ``sk_fold`` and
    scored on the matching test fold, and the fold scores are condensed into
    a one-row table by ``format_return``.

    Args:
        dataset_name: label written into the result row.
        models: dict mapping a model display name to an unfitted estimator.
        hyperparameters_grid: dict mapping the same names to parameter grids.
        X, Y: features and targets; must support indexing with the integer
            index arrays produced by ``sk_fold.split`` (e.g. NumPy arrays).
        sk_fold: splitter exposing ``split(X, Y)``.
        metric: column of the result row used to rank the models.

    Returns:
        ``(best_model_table, best_failed_cases)``: the one-row table of the
        model with the highest ``metric`` and the test indices it
        misclassified across all folds. ``(None, [])`` when no model scores
        above 0.

    NOTE(review): the grid search sees all of ``X``/``Y`` before the same data
    is cross-validated, so the reported scores may be optimistic — confirm
    this is acceptable.
    """
    best_metric_score = 0
    best_model_table = None   # stays None when no model beats the initial score
    best_failed_cases = []
    # Iterate over the models that were passed in rather than a notebook global.
    for model_name in models:
        print ("Model Name: " + model_name)
        estimator = get_best_estimator(models[model_name],
                                       hyperparameters_grid[model_name],
                                       X, Y)
        print ("Estimator: ")
        print(estimator)
        f1_list, precision_list, recall_list, auc_list, acc_list = [], [], [], [], []
        failed_cases_index = []
        for train_index, test_index in sk_fold.split(X, Y):
            print ("Model Name: " + model_name)
            print ("Passou")

            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            estimator.fit(X_train, Y_train)

            Y_pred = estimator.predict(X_test)
            f1, precision, recall, roc, acc = get_scores(Y_pred, Y_test)

            # Gather the failures of every fold; previously only the first
            # fold's failures were returned.
            failed_cases_index.extend(fault_cases(Y_pred, Y_test, test_index))
            f1_list.append(f1)
            precision_list.append(precision)
            recall_list.append(recall)
            auc_list.append(roc)
            acc_list.append(acc)

        print ("F1: ")
        print (f1_list)
        print ("Prec: ")
        print (precision_list)
        print ("Rec: ")
        print (recall_list)
        print("Auc: ")
        print (auc_list)
        print ("Acc: ")
        print (acc_list)

        aux_table = format_return(dataset_name, model_name,
                                  f1_list, precision_list,
                                  recall_list, auc_list,
                                  acc_list)
        print("Model: ")
        print (aux_table)

        if aux_table.iloc[0][metric] > best_metric_score:
            best_model_table = aux_table
            best_metric_score = aux_table.iloc[0][metric]
            best_failed_cases = failed_cases_index

    return best_model_table, best_failed_cases

# Column schema of the base-model results table. format_return() builds each
# row positionally against this list, so the order of the entries matters.
# The metric groups appear as F1, Recall, Precision, then AUC and accuracy.
# (The " B" suffix presumably stands for "base model" — TODO confirm.)
COLUMNS = ["Dataset Name", "Base Model Name", "F1 Average B", "F1 Class 0 B", "F1 Class 1 B",
           "Recall Average B", "Recall Class 0 B", "Recall Class 1 B",
           "Precision Average B", "Precision Class 0 B", "Precision Class 1 B",
           "AUC B", "Accuracy B"]

# Same layout for ensemble models (" E" suffix, no dataset-name column).
# Not referenced by any other visible cell.
ENSEMBLE_COLUMNS = ["Ensemble Model Name", "F1 Average E", "F1 Class 0 E", "F1 Class 1 E",
                   "Recall Average E", "Recall Class 0 E", "Recall Class 1 E",
                   "Precision Average E", "Precision Class 0 E", "Precision Class 1 E",
                   "AUC E", "Accuracy E"]

def format_return(dataset_name, model_name, f1_list, precision_list,
                  recall_list, auc_list, acc_list):
    """Condense per-fold scores into a one-row DataFrame laid out as ``COLUMNS``.

    Args:
        dataset_name: value for the "Dataset Name" column.
        model_name: value for the "Base Model Name" column.
        f1_list, precision_list, recall_list: one entry per fold, each entry
            holding the class-0 score at position 0 and the class-1 score at
            position 1.
        auc_list, acc_list: one scalar per fold.

    Returns:
        DataFrame with a single row and the columns of ``COLUMNS``. For F1,
        recall and precision the row stores the mean over all folds and
        classes, then the class-0 mean, then the class-1 mean.

    Raises:
        statistics.StatisticsError: if any of the score lists is empty.
    """
    dataframe_line = [dataset_name, model_name]
    # COLUMNS orders the metric groups F1 -> Recall -> Precision. The row is
    # positional, so the lists must be consumed in that same order; the old
    # F1 -> Precision -> Recall order stored precision under the recall
    # columns and vice versa.
    for score_list in (f1_list, recall_list, precision_list):
        dataframe_line.extend((statistics.mean(flatten_list(score_list)),
                               statistics.mean([score[0] for score in score_list]),
                               statistics.mean([score[1] for score in score_list])))
    dataframe_line.append(statistics.mean(auc_list))
    dataframe_line.append(statistics.mean(acc_list))
    # DataFrame.append was removed in pandas 2.0; build the row frame directly.
    return pd.DataFrame([dataframe_line], columns = COLUMNS)

def flatten_list(lista):
    """Flatten one nesting level: concatenate the items of each sub-iterable."""
    flattened = []
    for sublist in lista:
        flattened.extend(sublist)
    return flattened
    
    

    
        
        
        


    
    
    

In [55]:
"""
Config for every experiment
"""
# NOTE(review): this overrides the RANDOM_STATE = 42 assigned in an earlier
# cell. Objects already built with 42 (the cross-validation splitter) keep
# that seed; estimators created after this point get 0.
RANDOM_STATE = 0
# Passed as `voting=` to the VotingClassifier cell further down.
VOTING_METHOD = 'hard'
# Empty results table using the base-model column schema.
results = pd.DataFrame(columns = COLUMNS)

# Registries filled by the next cell; both are keyed by the model display name.
param_grids = dict()
weak_learners_base_models = dict()


In [56]:
"""
Weak Learners
"""

"""
KNN
"""
# Each section below registers one base learner twice under the same display
# name: the unfitted estimator in weak_learners_base_models and its
# hyper-parameter grid in param_grids.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# KNN grid: 1..30 neighbours combined with both weighting schemes.
k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

knn_grid = dict(n_neighbors = k_range, 
                weights = weight_options)

param_grids["K-Nearest Neighbors"] = knn_grid
knn = KNeighborsClassifier()
weak_learners_base_models["K-Nearest Neighbors"] = knn

# Earlier single-model workflow, left disabled:
# knn_gridsearch = GridSearchCV(knn, param_grid = knn_grid,
#                              refit = True)
# selected_knn = knn_gridsearch.fit(X, Y).best_estimator_
# failed_cases = []
# scores, failed = evaluate_model("KNN", selected_knn, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)
# results = results.append(model_evaluation(selected_knn, X , Y), ignore_index = True)

"""
LR
"""
# Logistic Regression is currently disabled, so it is not registered.
# NOTE(review): the grid below includes the "l1" penalty; a traceback captured
# elsewhere in this notebook shows the lbfgs solver rejecting it.
# from sklearn.linear_model import LogisticRegression

# log_reg_grid={'C':[0.001,0.01,.09,1,5,10],
#               "penalty":["l1","l2"]} #l1 lasso l2 ridge
# lr = LogisticRegression(random_state=RANDOM_STATE)

# param_grids["Logistic Regression"] = log_reg_grid
# weak_learners_base_models["Logistic Regression"] = lr

"""
Support Vector Machines
"""
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Only the RBF kernel is searched; LinearSVC is imported but not used here.
svm_grid = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']}

param_grids["Support Vector Machines"] = svm_grid
weak_learners_base_models["Support Vector Machines"] = SVC()
"""
Naive Bayes
"""
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
# 100 log-spaced smoothing values from 10**0 down to 10**-9.
nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}

weak_learners_base_models["Naive Bayes"] = nb
param_grids["Naive Bayes"] = nb_params

"""
DECISION TREES
"""
from sklearn.tree import DecisionTreeClassifier
tree_parameters = {'criterion':['gini','entropy'],
             'max_depth':[4,5,10]}

param_grids['Decision Tree'] = tree_parameters

# Seeded with the RANDOM_STATE value in effect when this cell runs.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
weak_learners_base_models['Decision Tree'] = dt




In [57]:
# Evaluate every registered base model on each dataset. Per dataset, keep the
# best model's metric row and the sample indices it misclassified.
cases = []
best_tables = []
for dataset_name, data, target in zip(dataset_names, X, Y):
    best, current_case = evaluate_model(dataset_name,
                                        weak_learners_base_models,
                                        param_grids, data, target,
                                        cross_validation_setting,
                                        metric = "F1 Average B")
    cases.append(current_case)
    if best is not None:
        best_tables.append(best)

# DataFrame.append was removed in pandas 2.0; collect the per-dataset rows and
# concatenate once instead of growing the frame inside the loop.
dataframe = pd.concat(best_tables) if best_tables else pd.DataFrame(columns = COLUMNS)
    

Model Name: K-Nearest Neighbors
Estimator: 
KNeighborsClassifier(n_neighbors=8)
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
Model Name: K-Nearest Neighbors
Passou
F1: 
[array([0.96858639, 0.33333333]), array([0.97112861, 0.42105263]), array([0.96335079, 0.22222222]), array([0.97340426, 0.58333333]), array([0.96      , 0.34782609]), array([0.97082228, 0.47619048]), array([0.97354497, 0.5       ]), array([0.96062992, 0.11764706]), array([0.96      , 0.34782609]), array([0.95514512, 0.10526316])]
Prec: 
[array([0.93908629, 1.        ]), array([0.94387755, 1.        ]), array([0.93401015, 0.66666667]), array([0.95811518, 0.77777778]), array([0.94240838, 0.5       ]), array([0.94818653, 0.

StatisticsError: mean requires at least one data point

In [None]:
# Misclassified sample indices collected per dataset by the evaluation loop.
cases

In [None]:
# Best-model metric rows gathered by the evaluation loop, one per dataset.
dataframe

In [53]:
#Logistic Regression
# log_reg_grid={'C':[0.001,0.01,.09,1,5,10],
#               "penalty":["l1","l2"]} #l1 lasso l2 ridge
# lr = LogisticRegression(random_state=RANDOM_STATE)
# log_reg_gridsearch = GridSearchCV(lr, param_grid = log_reg_grid,
#                              refit = True)
# selected_log_reg = log_reg_gridsearch.fit(X,Y).best_estimator_
# scores, failed = evaluate_model("Logistic Regression",selected_log_reg, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)

Traceback (most recent call last):
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Nathan\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of it

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Traceback (m

In [None]:
# """
# Support Vector Machines
# """
# from sklearn.svm import SVC
# from sklearn.svm import LinearSVC

# svm_grid = {'C': [0.1, 1, 10, 100, 1000], 
#             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#             'kernel': ['rbf','poly','sigmoid']} 

  
# svm_gridsearch = GridSearchCV(SVC(),
#                               param_grid = svm_grid,
#                               refit = True)

# selected_svm = svm_gridsearch.fit(X,Y).best_estimator_
# selected_svm_stacking = svm_gridsearch2.fit(X,Y).best_estimator_
# scores, failed = evaluate_model("SVM",selected_svm, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)


In [None]:
# results

In [None]:
# from sklearn.naive_bayes import GaussianNB
# nb = GaussianNB()

# nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}
# nb_gridsearch = GridSearchCV(estimator=nb, 
#                      param_grid = nb_params,
#                      refit = True)
# selected_nb = nb_gridsearch.fit(X,Y).best_estimator_
# scores, failed = evaluate_model("Naive Bayes",selected_nb, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)


In [None]:
# from sklearn.tree import DecisionTreeClassifier
# tree_para = {'criterion':['gini','entropy'],
#              'max_depth':[4,5,10]}
# dt = DecisionTreeClassifier(random_state=0)
# tree_gridsearch = GridSearchCV(dt, param_grid = tree_para, cv=5)
# selected_dt = tree_gridsearch.fit(X, Y).best_estimator_
# scores, failed = evaluate_model("Decision Trees",selected_dt, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)


In [None]:
# Show the results table accumulated so far.
results

In [None]:
"""Boosting Algorithms"""
from sklearn.ensemble import AdaBoostClassifier
# Registry for boosting models; no visible cell adds anything to it.
boosting_models = dict()

In [None]:
# #AdaBoost
# ab_param_grid = {
#     'n_estimators' : [100, 300, 500],
#     'learning_rate' : [1e-3, 1e-2, 1e-1, 1]
# }
# ab_model = AdaBoostClassifier(random_state = RANDOM_STATE)
# ab_gridsearchcv = GridSearchCV(ab_model,
#                               param_grid = ab_param_grid,
#                               refit = True)

# selected_ab = ab_gridsearchcv.fit(X, Y).best_estimator_
# scores, failed = evaluate_model("AdaBoost",selected_ab, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)


In [None]:
# #XGBoost
# import xgboost as xgb

# parameters = {
#             'max_depth': [3, 4, 5],
#             'learning_rate': [0.01, 0.1, 1],
#             'n_estimators': [200, 400],
#             'gamma': [0.01, 0.1, 0.2],
#             'min_child_weight': [0, 0.5, 1],
#             'max_delta_step': [0],
#             'subsample': [0.7, 1],
#             'colsample_bytree': [0.6, 1],
#             'reg_alpha': [0, 1e-2, 1],
#             'reg_lambda': [0, 1e-2, 1],
#             }

# xgb_model = xgb.XGBClassifier(silent = True,
#                               random_state = RANDOM_STATE)

# xgb_gridsearch = GridSearchCV(xgb_model,
#                               parameters,
#                               refit = True)

# selected_xgb = xgb_gridsearch.fit(X, Y).best_estimator_
# scores, failed = evaluate_model("XGB", selected_xgb, X, Y, cross_validation_setting)
# results = results.append(scores)
# failed_cases.append(failed)


In [None]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid searched for the random forest.
rf_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400]
}

rf = RandomForestClassifier(random_state = RANDOM_STATE)
rf_gridsearch = GridSearchCV(rf, param_grid = rf_grid,
                             refit = True)
# NOTE(review): the loading cell builds X and Y as lists holding one array per
# dataset, so fitting on them directly looks stale — confirm.
selected_rf = rf_gridsearch.fit(X, Y).best_estimator_
# NOTE(review): this call passes (name, estimator, X, Y, cv), which does not
# match evaluate_model(dataset_name, models, hyperparameters_grid, X, Y,
# sk_fold, metric) as defined earlier in the notebook.
scores, failed = evaluate_model("RandomForest",selected_rf, X, Y, cross_validation_setting)
# NOTE(review): DataFrame.append was removed in pandas 2.0, and `failed_cases`
# is only initialised in commented-out code in the visible cells.
results = results.append(scores)
failed_cases.append(failed)

In [None]:
# #Extra Trees
# from sklearn.ensemble import ExtraTreesClassifier

# et_grid = {
#     'max_depth': [4, 5, 6],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [200, 400],
#     'oob_score': [True, False]
# }

# et_clf = ExtraTreesClassifier(random_state = RANDOM_STATE)
# et_gridsearch = GridSearchCV(et_clf, param_grid = et_grid,
#                              refit = True)
# # selected_et = et_gridsearch.fit(X, Y).best_estimator_
# # # scores, failed = evaluate_model("ExtraTrees",selected_et, X, Y, cross_validation_setting)
# # # results = results.append(scores)
# # # failed_cases.append(failed)


In [None]:
# Show the results table after the tree-ensemble cells.
results

In [None]:
#BaggingClassifier(DecisionTree) 
from sklearn.ensemble import BaggingClassifier

# Grid over the wrapped tree's depth and the fraction of samples drawn per
# bagged estimator.
bagging_param_grid = {
    'base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

# NOTE(review): `selected_dt` is only assigned in commented-out code in the
# visible cells, so this fails on a fresh kernel.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` — check against the installed version.
bg_clf = BaggingClassifier(base_estimator=selected_dt,
                           random_state = RANDOM_STATE)
bg_gridsearch = GridSearchCV(bg_clf, param_grid = bagging_param_grid,
                             refit = True)
# The search is configured above but never fitted; the fit/evaluate steps are
# disabled:
# selected_bg = bg_gridsearch.fit(X, Y).best_estimator_
# # scores, failed = evaluate_model("Bagging (Decision Trees)",selected_bg, X, Y, cross_validation_setting)
# # results = results.append(scores)
# # failed_cases.append(failed)

In [None]:
"""Stacking Ensemble"""
from sklearn.ensemble import StackingClassifier

# NOTE(review): every `selected_*` estimator used below is only assigned in
# commented-out code in the visible cells (and `selected_svm_stacking` comes
# from `svm_gridsearch2`, which is never defined there), so this cell relies
# on hidden kernel state.
stacking_estimators = [
    ('lr', selected_log_reg),
    ('knn', selected_knn),
    ('svm', selected_svm_stacking),
    ('gnb', selected_nb),
    ('dt', selected_dt)
]

# The tuned logistic regression doubles as the meta-learner.
final_estimator = selected_log_reg

stacking_model = StackingClassifier(estimators = stacking_estimators,
                                    final_estimator = final_estimator)

# NOTE(review): this call passes (name, estimator, X, Y, cv), which does not
# match evaluate_model(dataset_name, models, hyperparameters_grid, X, Y,
# sk_fold, metric) as defined earlier in the notebook.
scores, failed = evaluate_model("Stacking (LR, KNN, SVM, NB, DT)",stacking_model, X, Y, cross_validation_setting)
# NOTE(review): DataFrame.append was removed in pandas 2.0.
results = results.append(scores)
failed_cases.append(failed)




In [None]:
# Show the results table after the stacking ensemble.
results

In [None]:
"""Voting Ensemble"""
from sklearn.ensemble import VotingClassifier

# NOTE(review): every `selected_*` estimator used below is only assigned in
# commented-out code in the visible cells, so this cell relies on hidden
# kernel state.
voting_estimators = [
    ('lr', selected_log_reg),
    ('knn', selected_knn),
    ('svm', selected_svm),
    ('gnb', selected_nb),
    ('dt', selected_dt)
]

# VOTING_METHOD is set to 'hard' in the experiment configuration cell.
voting_classifier = VotingClassifier(voting_estimators,
                                     voting=VOTING_METHOD)
# NOTE(review): this call passes (name, estimator, X, Y, cv), which does not
# match evaluate_model(dataset_name, models, hyperparameters_grid, X, Y,
# sk_fold, metric) as defined earlier in the notebook.
scores, failed = evaluate_model("Voting (LR, KNN, SVM, NB, DT)",voting_classifier, X, Y, cross_validation_setting)
# NOTE(review): DataFrame.append was removed in pandas 2.0.
results = results.append(scores)
failed_cases.append(failed)


In [None]:
# Show the results table after the voting ensemble.
results

In [None]:
# Number of failed cases recorded in the first entry of failed_cases.
len(failed_cases[0])

In [None]:
# How many failed cases the first two entries of failed_cases have in common.
len(set(failed_cases[0]) & set(failed_cases[1]))

In [None]:
# Write the results table to resultados1.csv (path relative to the working
# directory).
results.to_csv("resultados1.csv")

In [None]:
# Check whether the first and third entries of failed_cases are identical.
failed_cases[0] == failed_cases[2]

In [None]:
# Merge the per-model failure lists and rank every failed index by how often
# it occurs, most frequent first, as (index, count) pairs.
# Note: failed_cases is rebound to its flattened form here.
failed_cases = flatten_list(failed_cases)
counter = Counter(failed_cases).most_common()