In [None]:
!pip install imbalanced-learn
from imblearn.datasets import fetch_datasets
# Feature matrices (X) and label vectors (Y), one entry per name in
# dataset_names and in the same order.
X, Y = [], []
dataset_names = ['ecoli', 'optical_digits',
                 'satimage', 'pen_digits',
                 'abalone', 'sick_euthyroid', 'spectrometer',
                 'car_eval_34', 'isolet', 'us_crime', 'yeast_ml8',
                 'scene']
# Fetch the collection once, outside the loop, instead of once per dataset name.
benchmark_datasets = fetch_datasets()
for ds_name in dataset_names:
    var = benchmark_datasets[ds_name]
    X.append(var['data'])
    Y.append(var['target'])

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
# from ipynb.fs.full.data_wrangling import * #Data preprocessing notebook
#from ipynb.fs.full.data_preparation import *
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import statistics
from itertools import chain
from collections import Counter
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Scorers keyed by metric name; every scorer forwards average=None to its metric.
SCORINGS = {
    scoring_name: make_scorer(score_function, average = None)
    for scoring_name, score_function in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('roc_auc', roc_auc_score),
    )
}

# Config individual experiment

# Models whose hyper-parameters evaluate_model tunes through get_best_estimator.
MODELS_TO_OPTIMIZE = [
    "K-Nearest Neighbors",
    "Logistic Regression",
    "Support Vector Machines",
    "Naive Bayes",
    "Decision Tree",
    "AdaBoost",
    "Random Forest",
    "Bagging (DT)",
    "Extra Trees",
]

# Winning weak learners that generate_ensembles will boost directly.
POSSIBLE_ESTIMATORS_FOR_ADABOOST = [
    "Decision Tree",
    "Random Forest",
    "Support Vector Machines",
]

# NOTE: a later cell rebinds RANDOM_STATE to 0; the splitter below keeps the
# seed it is built with here.
cv_splits, repetitions, RANDOM_STATE = 10, 1, 42
cross_validation_setting = RepeatedStratifiedKFold(
    n_splits=cv_splits,
    n_repeats=repetitions,
    random_state=RANDOM_STATE,
)
        
def cross_validate(model, X_train, X_test, y_train, y_test, metric):
    """Fit ``model`` on one train split and evaluate it on the matching test split.

    The model is fitted exactly once; fitting it ``len(X_train)`` times on the
    same data only repeated identical work.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : held-out features and labels.
    metric : callable ``metric(y_true, y_pred)`` or any non-callable value.

    Returns
    -------
    ``metric(y_test, predictions)`` when ``metric`` is callable, otherwise the
    raw predictions for ``X_test``.
    """
    model.fit(X_train, y_train)
    Y_pred = model.predict(X_test)
    if callable(metric):
        return metric(y_test, Y_pred)
    return Y_pred


def model_evaluation(model, features, target, 
                     cv = cross_validation_setting):
    """Score ``model`` with every scorer in ``SCORINGS`` over the folds of ``cv``.

    Parameters
    ----------
    model : estimator exposing ``fit``; it is refitted in place on every
        training fold.
    features, target : data and labels; they are indexed with the integer
        index arrays yielded by ``cv.split`` (assumes NumPy-style indexing --
        TODO confirm for non-array inputs).
    cv : splitter exposing ``split(features, target)``.

    Returns
    -------
    dict
        Maps each ``SCORINGS`` name to a list holding one scorer result per
        fold.
    """
    scores = {scoring_name: [] for scoring_name in SCORINGS}
    for train_index, test_index in cv.split(features, target):
        model.fit(features[train_index], target[train_index])
        for scoring_name, scoring_function in SCORINGS.items():
            # Scorers built by make_scorer are called as scorer(estimator, X, y).
            scores[scoring_name].append(
                scoring_function(model, features[test_index], target[test_index]))
    return scores

def get_scores(Y_pred, Y_true):
    """Return ``(f1, precision, recall, roc_auc, accuracy)`` for one set of predictions.

    Note the argument order: predictions first, ground truth second. Every
    metric except accuracy is called with ``average=None``.
    """
    unaveraged = {"average": None}
    return (
        f1_score(Y_true, Y_pred, **unaveraged),
        precision_score(Y_true, Y_pred, **unaveraged),
        recall_score(Y_true, Y_pred, **unaveraged),
        roc_auc_score(Y_true, Y_pred, **unaveraged),
        accuracy_score(Y_true, Y_pred),
    )

def convert_df(X, Y):
    """Return the ``to_numpy()`` conversion of ``X`` and of ``Y`` as a pair."""
    features_array = X.to_numpy()
    target_array = Y.to_numpy()
    return features_array, target_array

def fault_cases(predictions, answers, indexes):
    """Return the entries of ``indexes`` at positions where prediction and answer differ.

    The three sequences are read position by position, driven by the length
    of ``predictions``.
    """
    return [
        indexes[position]
        for position, predicted in enumerate(predictions)
        if predicted != answers[position]
    ]

def get_best_estimator(model, param_grid, X, Y):
    """Grid-search ``model`` over ``param_grid`` on ``(X, Y)`` and return the winner.

    GridSearchCV is used with its default settings; the returned object is the
    search's ``best_estimator_``.
    """
    search = GridSearchCV(estimator=model, param_grid=param_grid)
    search.fit(X, Y)
    return search.best_estimator_
    
def evaluate_model(dataset_name, models,
                   hyperparameters_grid, X, Y, sk_fold,
                   metric = "F1 Average B", sampling = ''):
    """Cross-validate every model in ``models`` and report the best one.

    Parameters
    ----------
    dataset_name : str, label forwarded to ``format_return``.
    models : dict mapping model name -> estimator.
    hyperparameters_grid : dict mapping model name -> grid; read only for
        names listed in ``MODELS_TO_OPTIMIZE``.
    X, Y : data and labels, indexed with the integer index arrays produced by
        ``sk_fold.split`` (assumes NumPy-style indexing -- TODO confirm).
    sk_fold : splitter exposing ``split(X, Y)``.
    metric : column of the ``format_return`` table used to rank the models.
    sampling : suffix forwarded to ``format_return``.

    Returns
    -------
    tuple
        ``(estimators, best_model_table, best_failed_cases, best_estimator,
        best_estimator_name)``. ``estimators`` maps every model name to the
        estimator that was scored. The ``best_*`` entries describe the model
        with the highest ``metric``; they are ``None`` (and the failed-case
        list empty) when ``models`` is empty.

    Notes
    -----
    NOTE(review): hyper-parameters are tuned by ``get_best_estimator`` on the
    full ``X, Y`` before the same data is split into folds for scoring, so the
    fold scores are not independent of the tuning step.
    Each estimator is refitted in place fold after fold, so the returned
    estimators are left fitted on the last training split.
    """
    # Start below any reachable score so the first model is always recorded;
    # a threshold of 0 left the best_* names unbound when nothing beat it.
    best_metric_score = float("-inf")
    best_model_table = None
    best_failed_cases = []
    best_estimator = None
    best_estimator_name = None
    estimators = dict()

    for model_name, model in models.items():

        if model_name in MODELS_TO_OPTIMIZE:
            estimator = get_best_estimator(model,
                                           hyperparameters_grid[model_name],
                                           X, Y)
        else:
            # Models outside MODELS_TO_OPTIMIZE are evaluated as given.
            estimator = model

        f1_list, precision_list, recall_list, auc_list, acc_list = [], [], [], [], []
        failed_cases_index = []
        for train_index, test_index in sk_fold.split(X, Y):

            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            estimator.fit(X_train, Y_train)
            Y_pred = estimator.predict(X_test)

            f1, precision, recall, roc, acc = get_scores(Y_pred, Y_test)

            # Collect the misclassified sample indexes of every fold, not just
            # the first one.
            failed_cases_index.extend(fault_cases(Y_pred, Y_test, test_index))
            f1_list.append(f1)
            precision_list.append(precision)
            recall_list.append(recall)
            auc_list.append(roc)
            acc_list.append(acc)

        estimators[model_name] = estimator

        aux_table = format_return(dataset_name, model_name,
                                  f1_list, precision_list,
                                  recall_list, auc_list,
                                  acc_list, metric,
                                  sampling)
        model_score = aux_table.iloc[0][metric]
        if model_score > best_metric_score:
            best_model_table = aux_table
            best_metric_score = model_score
            best_failed_cases = failed_cases_index
            best_estimator = estimator
            best_estimator_name = model_name

    return estimators, best_model_table, best_failed_cases, best_estimator, best_estimator_name

# Column layouts of the score tables. Per-class metrics are ordered
# F1, Recall, Precision; format_return must emit values in that same order.
WEAK_LEARNER_COLUMNS = ["Dataset Name", "Base Model Name", "F1 Average B", "F1 Class 0 B", "F1 Class 1 B",
           "Recall Average B", "Recall Class 0 B", "Recall Class 1 B",
           "Precision Average B", "Precision Class 0 B", "Precision Class 1 B",
           "AUC B", "Accuracy B"]

ENSEMBLE_COLUMNS = ["Dataset Name", "Ensemble Model Name", "F1 Average E", "F1 Class 0 E", "F1 Class 1 E",
                   "Recall Average E", "Recall Class 0 E", "Recall Class 1 E",
                   "Precision Average E", "Precision Class 0 E", "Precision Class 1 E",
                   "AUC E", "Accuracy E"]

RESULT = ["Dataset Name", "Base Model Name", "F1 Average B", "F1 Class 0 B", "F1 Class 1 B",
           "Recall Average B", "Recall Class 0 B", "Recall Class 1 B",
           "Precision Average B", "Precision Class 0 B", "Precision Class 1 B",
           "AUC B", "Accuracy B", "Ensemble Model Name", "F1 Average E", "F1 Class 0 E", "F1 Class 1 E",
           "Recall Average E", "Recall Class 0 E", "Recall Class 1 E",
           "Precision Average E", "Precision Class 0 E", "Precision Class 1 E",
           "AUC E", "Accuracy E"]

def format_return(dataset_name, model_name, f1_list, precision_list,
                  recall_list, auc_list, acc_list, metric, sampling = ''):
    """Summarise the per-fold scores of one model as a one-row DataFrame.

    Parameters
    ----------
    dataset_name, sampling : str, concatenated into the "Dataset Name" cell.
    model_name : str, stored in the model-name column.
    f1_list, precision_list, recall_list : one entry per fold; each entry is
        indexable with position 0 and position 1 (the two per-class values).
    auc_list, acc_list : one number per fold.
    metric : ranking column requested by the caller. A name found in
        ``WEAK_LEARNER_COLUMNS`` selects the weak-learner layout, anything
        else selects ``ENSEMBLE_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A single row: for each of F1, Recall and Precision the mean over all
        values, the mean of position 0 and the mean of position 1, followed
        by the mean AUC and the mean accuracy.

    Raises
    ------
    statistics.StatisticsError
        If any of the score lists is empty.
    """
    # Choose the layout from the metric name itself so the function does not
    # rely on a global defined in a later notebook cell.
    if metric in WEAK_LEARNER_COLUMNS:
        COLUMNS = WEAK_LEARNER_COLUMNS
    else:
        COLUMNS = ENSEMBLE_COLUMNS

    dataframe_line = [dataset_name + sampling, model_name]
    # Value order follows the column order (F1, Recall, Precision); emitting
    # precision before recall stored each under the other's heading.
    for score_list in (f1_list, recall_list, precision_list):
        dataframe_line.extend((
            statistics.mean([value for fold_scores in score_list
                             for value in fold_scores]),
            statistics.mean([score[0] for score in score_list]),
            statistics.mean([score[1] for score in score_list]),
        ))

    dataframe_line.append(statistics.mean(auc_list))
    dataframe_line.append(statistics.mean(acc_list))
    # DataFrame.append was removed in pandas 2.0; build the row directly.
    return pd.DataFrame([dataframe_line], columns = COLUMNS)

def flatten_list(lista):
    """Concatenate the items of every sub-iterable of ``lista`` into one flat list."""
    flattened = []
    for sublist in lista:
        flattened.extend(sublist)
    return flattened

In [None]:
# Config for every experiment
RANDOM_STATE, VOTING_METHOD = 0, 'hard'

# Registries filled by the weak-learner cell, both keyed by model name:
# hyper-parameter grids and the untuned base estimators.
param_grids, weak_learners_base_models = {}, {}


In [None]:

# Weak learners: register every base estimator together with the grid that
# get_best_estimator searches for it, both under the same model name.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# K-Nearest Neighbors
k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

knn_grid = dict(n_neighbors = k_range, 
                weights = weight_options)

param_grids["K-Nearest Neighbors"] = knn_grid
knn = KNeighborsClassifier()
weak_learners_base_models["K-Nearest Neighbors"] = knn

# Logistic Regression
log_reg_grid={'C':[0.001,0.01,.09,1,5,10],
              "penalty":["l1","l2"]} #l1 lasso l2 ridge
# The grid searches both l1 and l2; liblinear handles both penalties, while
# the default solver cannot fit the l1 candidates.
lr = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)

param_grids["Logistic Regression"] = log_reg_grid
weak_learners_base_models["Logistic Regression"] = lr

# Support Vector Machines
svm_grid = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']}

param_grids["Support Vector Machines"] = svm_grid
weak_learners_base_models["Support Vector Machines"] = SVC()

# Naive Bayes
nb = GaussianNB()
nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}

weak_learners_base_models["Naive Bayes"] = nb
param_grids["Naive Bayes"] = nb_params

# Decision Tree
tree_parameters = {'criterion':['gini','entropy'],
             'max_depth':[4,5,10]}

param_grids['Decision Tree'] = tree_parameters

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
weak_learners_base_models['Decision Tree'] = dt




In [None]:
def generate_ensembles(best_base_estimators, best_estimator, best_estimator_name):
    """Build the ensemble models and the grids used to tune them.

    Parameters
    ----------
    best_base_estimators : dict mapping weak-learner name -> estimator. The
        keys 'Logistic Regression', 'K-Nearest Neighbors',
        'Support Vector Machines', 'Naive Bayes' and 'Decision Tree' are read.
    best_estimator : estimator wrapped by Bagging, used as the stacking final
        estimator, and boosted by AdaBoost when its name is allowed.
    best_estimator_name : str, checked against
        ``POSSIBLE_ESTIMATORS_FOR_ADABOOST``; for any other name AdaBoost
        boosts the 'Support Vector Machines' entry instead.

    Returns
    -------
    tuple of dict
        ``(ensembles, ensembles_param_grids)``, both keyed by 'AdaBoost',
        'Random Forest', 'Bagging (DT)' and 'Extra Trees'.
    """
    import inspect

    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import StackingClassifier
    from sklearn.ensemble import VotingClassifier

    def _wrap(ensemble_cls, wrapped_estimator, **kwargs):
        # scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and
        # later dropped the old keyword; use whichever this version accepts.
        accepted = inspect.signature(ensemble_cls).parameters
        keyword = 'estimator' if 'estimator' in accepted else 'base_estimator'
        return ensemble_cls(**{keyword: wrapped_estimator}, **kwargs)

    ensembles = dict()
    ensembles_param_grids = dict()

    # AdaBoost
    ab_param_grid = {
        'n_estimators' : [400, 600],
        'learning_rate' : [1e-3, 1e-2, 1e-1, 1],
        'algorithm' : ['SAMME']
    }
    if best_estimator_name in POSSIBLE_ESTIMATORS_FOR_ADABOOST:
        boosted_estimator = best_estimator
    else:
        boosted_estimator = best_base_estimators['Support Vector Machines']
    ab_model = _wrap(AdaBoostClassifier, boosted_estimator,
                     random_state = RANDOM_STATE)

    ensembles_param_grids['AdaBoost'] = ab_param_grid
    ensembles['AdaBoost'] = ab_model

    # Random Forest
    rf_grid = {
        'criterion' : ['gini', 'entropy'],
        'min_samples_leaf': [3, 5, 10],
        'n_estimators': [200, 400, 600],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by
        # recent scikit-learn releases.
        'max_features' : ['sqrt', 'log2']
    }

    rf = RandomForestClassifier(random_state = RANDOM_STATE)

    ensembles_param_grids['Random Forest'] = rf_grid
    ensembles['Random Forest'] = rf

    # Bagging
    bagging_param_grid = {
        'max_samples' : [0.05, 0.1, 0.2, 0.5]
    }

    bg_clf = _wrap(BaggingClassifier, best_estimator,
                   random_state = RANDOM_STATE)

    ensembles_param_grids['Bagging (DT)'] = bagging_param_grid
    ensembles['Bagging (DT)'] = bg_clf

    # Extra Trees
    # 'oob_score' is not searched: the classifier below keeps the default
    # bootstrap=False, under which oob_score=True cannot be fitted.
    et_grid = {
        'max_depth': [4, 5, 6],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [8, 10, 12],
        'n_estimators': [200, 400]
    }

    et_clf = ExtraTreesClassifier(random_state = RANDOM_STATE)

    ensembles_param_grids['Extra Trees'] = et_grid
    ensembles['Extra Trees'] = et_clf

    # Stacking and voting share the same five tuned weak learners.
    # NOTE(review): both models are built but never added to `ensembles`, so
    # callers do not receive them; neither name appears in MODELS_TO_OPTIMIZE.
    # Confirm whether they are meant to be evaluated.
    named_base_estimators = [
        ('lr', best_base_estimators['Logistic Regression']),
        ('knn', best_base_estimators['K-Nearest Neighbors']),
        ('svm', best_base_estimators['Support Vector Machines']),
        ('gnb', best_base_estimators['Naive Bayes']),
        ('dt',  best_base_estimators['Decision Tree'])
    ]

    stacking_model = StackingClassifier(estimators = list(named_base_estimators),
                                        final_estimator = best_estimator)

    voting_classifier = VotingClassifier(list(named_base_estimators),
                                         voting='hard')

    return ensembles, ensembles_param_grids

In [None]:
# Baseline experiment: for every dataset, tune and score the weak learners,
# build the ensembles around the winner, then tune and score the ensembles.
dataframe = pd.DataFrame(columns = WEAK_LEARNER_COLUMNS)   # best weak learner per run
dataframe2 = pd.DataFrame(columns = ENSEMBLE_COLUMNS)      # best ensemble per run
result = pd.DataFrame(columns = RESULT)
cases = []
WEAK_LEARNER_METRIC = "F1 Average B"
ENSEMBLE_LEARNER_METRIC = "F1 Average E"
best_base_estimators = dict()
best_base_clfs = []
for dataset_name, data, target in zip(dataset_names, X, Y):

    best_estimators, model_scores, _, current_best_clf, best_estimator_name = evaluate_model(
                   dataset_name, weak_learners_base_models, param_grids, data, target,
                   cross_validation_setting, metric = WEAK_LEARNER_METRIC, sampling = '')

    print(current_best_clf)

    ensembles, ensembles_param_grids = generate_ensembles(best_estimators,
                                                          current_best_clf,
                                                          best_estimator_name)
    _, ensemble_scores, ensemble_error_cases, _, _ = evaluate_model(dataset_name,
                   ensembles, ensembles_param_grids, data, target,
                   cross_validation_setting, metric = ENSEMBLE_LEARNER_METRIC)
    print(ensemble_scores)
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row-wise accumulation (row labels are left as they come).
    dataframe = pd.concat([dataframe, model_scores])
    dataframe2 = pd.concat([dataframe2, ensemble_scores])

In [None]:
# Notebook cell output: show the accumulated weak-learner score table.
dataframe

In [None]:
#UNDERSAMPLING
# Repeat the experiment after randomly undersampling the majority class.
from imblearn.under_sampling import RandomUnderSampler
# Seeded so the resampled data, and therefore the scores, are reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority',
                                 random_state=RANDOM_STATE)

for dataset_name, data, target in zip(dataset_names, X, Y):
    X_under, y_under = undersample.fit_resample(data, target)
    best_estimators, model_scores, _, current_best_clf, best_estimator_name = evaluate_model(
                   dataset_name, weak_learners_base_models, param_grids, X_under, y_under,
                   cross_validation_setting, metric = WEAK_LEARNER_METRIC, sampling = ' Undersampled')

    print(current_best_clf)

    ensembles, ensembles_param_grids = generate_ensembles(best_estimators,
                                                          current_best_clf,
                                                          best_estimator_name)
    # Label the ensemble row with the same sampling suffix as the weak-learner
    # row so the two tables describe the same run.
    _, ensemble_scores, ensemble_error_cases, _, _ = evaluate_model(dataset_name,
                   ensembles, ensembles_param_grids, X_under, y_under,
                   cross_validation_setting, metric = ENSEMBLE_LEARNER_METRIC,
                   sampling = ' Undersampled')
    print(ensemble_scores)
    # DataFrame.append was removed in pandas 2.0.
    dataframe = pd.concat([dataframe, model_scores])
    dataframe2 = pd.concat([dataframe2, ensemble_scores])

In [None]:
#OVERSAMPLING
# Repeat the experiment after random oversampling, then merge the weak-learner
# and ensemble tables into `result`.
from imblearn.over_sampling import RandomOverSampler
# Seeded so the resampled data, and therefore the scores, are reproducible.
oversample = RandomOverSampler(random_state=RANDOM_STATE)

for dataset_name, data, target in zip(dataset_names, X, Y):

    X_over, y_over = oversample.fit_resample(data, target)

    best_estimators, model_scores, _, current_best_clf, best_estimator_name = evaluate_model(
                   dataset_name, weak_learners_base_models, param_grids, X_over, y_over,
                   cross_validation_setting, metric = WEAK_LEARNER_METRIC, sampling = ' Oversampled')

    ensembles, ensembles_param_grids = generate_ensembles(best_estimators,
                                                          current_best_clf,
                                                          best_estimator_name)
    # Label the ensemble row with the same sampling suffix as the weak-learner
    # row so the two tables describe the same run.
    _, ensemble_scores, ensemble_error_cases, _, _ = evaluate_model(dataset_name,
                   ensembles, ensembles_param_grids, X_over, y_over,
                   cross_validation_setting, metric = ENSEMBLE_LEARNER_METRIC,
                   sampling = ' Oversampled')
    # DataFrame.append was removed in pandas 2.0.
    dataframe = pd.concat([dataframe, model_scores])
    dataframe2 = pd.concat([dataframe2, ensemble_scores])

# Both tables gain one row per run in the same order, so they are paired by
# position. drop=True keeps the old row labels from becoming "index" columns.
aux_df = dataframe.reset_index(drop=True)
dataframe2 = dataframe2.reset_index(drop=True)
aux_df = aux_df.join(dataframe2, lsuffix='', rsuffix='_to_delete')
# Remove the duplicated columns that the join marked with '_to_delete'.
result = aux_df.drop(columns=[column for column in aux_df.columns
                              if column.endswith('_to_delete')])

In [None]:
# ENSEMBLES
# Module-level construction of the ensemble models and their grids.
import inspect

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

# `best_base_estimators` is created empty by the experiment cell and is not
# filled in the visible code, so the lookups below would raise KeyError. Fall
# back to `best_estimators`, the per-model dict that the experiment loops pass
# to generate_ensembles for the same purpose.
# NOTE(review): confirm this is the intended source of the tuned weak learners.
if not best_base_estimators:
    best_base_estimators = best_estimators

ensembles = dict()
ensembles_param_grids = dict()

#AdaBoost
ab_param_grid = {
    'n_estimators' : [100, 300, 500],
    'learning_rate' : [1e-3, 1e-2, 1e-1, 1]
}
ab_model = AdaBoostClassifier(random_state = RANDOM_STATE)

ensembles_param_grids['AdaBoost'] = ab_param_grid
ensembles['AdaBoost'] = ab_model


#RandomForest
rf_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400]
}

rf = RandomForestClassifier(random_state = RANDOM_STATE)

ensembles_param_grids['Random Forest'] = rf_grid
ensembles['Random Forest'] = rf

#Bagging
# scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and later
# dropped the old keyword; the nested grid key has to use the same prefix.
_bagging_base_keyword = ('estimator'
                         if 'estimator' in inspect.signature(BaggingClassifier).parameters
                         else 'base_estimator')

bagging_param_grid = {
    _bagging_base_keyword + '__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

bg_clf = BaggingClassifier(**{_bagging_base_keyword: best_base_estimators['Decision Tree']},
                           random_state = RANDOM_STATE)

ensembles_param_grids['Bagging (DT)'] = bagging_param_grid
ensembles['Bagging (DT)'] = bg_clf

#Extra Trees
# 'oob_score' is not searched: the classifier below keeps the default
# bootstrap=False, under which oob_score=True cannot be fitted.
et_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400]
}

et_clf = ExtraTreesClassifier(random_state = RANDOM_STATE)

ensembles_param_grids['Extra Trees'] = et_grid

ensembles['Extra Trees'] = et_clf

#Stacking
# NOTE(review): the stacking and voting models below are built but never
# added to `ensembles` -- confirm whether they are meant to be evaluated.
stacking_estimators = [
    ('lr', best_base_estimators['Logistic Regression']),
    ('knn', best_base_estimators['K-Nearest Neighbors']),
    ('svm', best_base_estimators['Support Vector Machines']),
    ('gnb', best_base_estimators['Naive Bayes']),
    ('dt',  best_base_estimators['Decision Tree'])
]

final_estimator = best_base_estimators['Logistic Regression']

stacking_model = StackingClassifier(estimators = stacking_estimators,
                                    final_estimator = final_estimator)

#Voting
voting_estimators = [
    ('lr', best_base_estimators['Logistic Regression']),
    ('knn', best_base_estimators['K-Nearest Neighbors']),
    ('svm', best_base_estimators['Support Vector Machines']),
    ('gnb', best_base_estimators['Naive Bayes']),
    ('dt', best_base_estimators['Decision Tree'])
]

VOTING_METHOD = 'hard'

voting_classifier = VotingClassifier(voting_estimators,
                                     voting=VOTING_METHOD)