In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
import pickle
#import statistics as stats
from collections import Counter
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.preprocessing import MaxAbsScaler
#from sklearn.preprocessing import RobustScaler
#from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import cross_validate
from imblearn.metrics import geometric_mean_score
#from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import Perceptron
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
from deslib.static import Oracle
import oracle as oracle_v2
from rlo import * #rlo.py - Random Linear Oracle implementation based on Kuncheva's book. 
#from sklearn.svm import SVC
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from xgboost import XGBClassifier
#from datetime import datetime


### Loading data

#### First let's build a function to convert KEEL data to a regular CSV (remove annotations before data):

In [2]:
def keel2csv(file):
    ''' Reads a KEEL .dat file, converts it into a regular CSV data file 
    that contains a header line. The output .csv file is written to the same 
    dir as the original .dat file. This function also returns a dict 
    {'numeric':[], 'nominal':[]} containg two lists, one for the numeric 
    attributes and the other for the nominal atributes.'''
    filename = file.name
    # Let's read the attribute types (useful for preprocessing) and also the 
    # column names from the @annotations, inclunding the target (class) column:
    has_inputs = has_outputs = False
    numeric_atts = []
    nominal_atts = []
    for line in file:
        if '@attribute' in line:
            if (' real' in line) or (' integer' in line):
                numeric_atts.append(line.split(' ')[1])
            elif '{' in line:
                nominal_atts.append(line.split(' ')[1])
        if line.startswith('@inputs'):
            att_names = line[8:-1].replace(' ', '')
            has_inputs = True
        elif line.startswith('@input'):
            att_names = line[7:-1].replace(' ', '')
            has_inputs = True
        elif line.startswith('@outputs') or line.startswith('@output'):
            class_name = line[9:-1]
            has_outputs = True
            break
        elif line.startswith('@output'):
            class_name = line[8:-1]
            has_outputs = True
            break
    if (not has_inputs) or (not has_outputs):
        print('File ', filename, 'missing annotations?' )

    columns = att_names + ',' + class_name

    #Then, lets remove the annotations and save the column names and data into a csv file:
    lines = file.readlines() 
    file.close()
    new_file = open(filename[:-4]+'.csv','w')
    new_file.write(columns+'\n')
    for line in lines:
        if not line.startswith('@'):
            new_file.write(line)
    new_file.close()    
    return {'numeric':numeric_atts, 'nominal':nominal_atts}

#### Now, we need to run through the files and execute the keel2csv function for each KEEL dat file:

In [3]:
ds_names = ['glass1', 'wisconsin', 'pima', 'ecoli2', 'vowel0']
rootdir = '/Users/lucasamorim/Downloads/KEEL_imb_classification_data_exercicio1'
# Link para os datasets: https://sci2s.ugr.es/keel/datasets.php

In [4]:
# Converting KEEL .dat files to CSV:
att_types = {} #This dictionary will have each dataset name as key and will hold the attribute types.
for name in ds_names:
    for fold in range(1,6):
        full_path = rootdir+'/'+name+'-5-fold/'+name+'-5-'+str(fold)+'tra.dat' 
        f = open(full_path, 'r')
        att_types[name] = keel2csv(f)
        f.close()
        full_path = rootdir+'/'+name+'-5-fold/'+name+'-5-'+str(fold)+'tst.dat' 
        f = open(full_path, 'r')
        keel2csv(f)
        f.close()

#### Ok, now that we finally have all the data in CSV format, lets load them:

In [5]:
# I will create a dict structure such that I can access train fold 1 from 
# dataset wisconsin as datasets['wisconsin']['train'][0]

datasets = {}
for name in ds_names:
    datasets[name] = {}
    datasets[name]['train'] = []
    datasets[name]['test'] = []
    for fold in range(1,6):
        csv_filename = rootdir+'/'+name+'-5-fold/'+name+'-5-'+str(fold)+'tra.csv'
        df_train = pd.read_csv(csv_filename, encoding='utf8', engine='python', sep=',', 
                     header=0, error_bad_lines=False)
        csv_filename = rootdir+'/'+name+'-5-fold/'+name+'-5-'+str(fold)+'tst.csv'
        df_test = pd.read_csv(csv_filename, encoding='utf8', engine='python', sep=',', 
                     header=0, error_bad_lines=False)
        datasets[name]['train'].append(df_train)
        datasets[name]['test'].append(df_test)


In [6]:
datasets['pima']['test'][0]

Unnamed: 0,Preg,Plas,Pres,Skin,Insu,Mass,Pedi,Age,Class
0,3.0,107.0,62.0,13.0,48.0,22.9,0.678,23.0,positive
1,3.0,169.0,74.0,19.0,125.0,29.9,0.268,31.0,positive
2,4.0,171.0,72.0,0.0,0.0,43.6,0.479,26.0,positive
3,0.0,189.0,104.0,25.0,0.0,34.3,0.435,41.0,positive
4,5.0,109.0,62.0,41.0,129.0,35.8,0.514,25.0,positive
...,...,...,...,...,...,...,...,...,...
149,0.0,97.0,64.0,36.0,100.0,36.8,0.600,25.0,negative
150,1.0,140.0,74.0,26.0,180.0,24.1,0.828,23.0,negative
151,1.0,86.0,66.0,52.0,65.0,41.3,0.917,29.0,negative
152,4.0,83.0,86.0,19.0,0.0,29.3,0.317,34.0,negative


## Pre-processing

### Cleaning strings

In [7]:
# Cleaning (stripping) strings within dataframe and also changing class labels to 1 and 0.
for name in ds_names:
    for s in ['train', 'test']:
        for fold in range(5):
            df = datasets[name][s][fold]
            df_obj = df.select_dtypes(['object'])
            df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
            df['Class'] = df['Class'].replace(['positive', 'negative'],[1,0])
                

### One-hot encoding

Here we must apply this encoding method to the nominal attributes in order to allow them to be managed by the classification algorithms.

In [8]:
for name in ds_names:
    for s in ['train', 'test']:
        for fold in range(5):
            for att in att_types[name]['nominal'][:-1]: #For each nominal attribute, except the target one (last one)
                att_encoded = pd.get_dummies(datasets[name][s][fold][att], prefix = att)
                datasets[name][s][fold] = datasets[name][s][fold].drop([att], axis = 1)
                datasets[name][s][fold] = pd.concat([att_encoded, datasets[name][s][fold]], axis = 1)

### Scaling

Applying the Standard Scaler to the numeric attributes:

In [9]:
ss = StandardScaler()

for name in ds_names:
    for s in ['train', 'test']:
        for fold in range(5):
            datasets[name][s][fold][att_types[name]['numeric']] = ss.fit_transform(datasets[name][s][fold][att_types[name]['numeric']])

### Dealing with missing values

Applying a Simple Imputer to the numeric attributes.

In [10]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
for name in ds_names:
    for s in ['train', 'test']:
        for fold in range(5):
            datasets[name][s][fold][att_types[name]['numeric']] = imp_mean.fit_transform(datasets[name][s][fold][att_types[name]['numeric']])

## Bulding and running the ensembles of classifiers

In [11]:
## Creating a results.csv file with a header row:
results_file = open('./results.csv', 'a')
header = 'Dataset Name,model,n_estimators,accuracy,acc_std_dev,f1,f1_std_dev,g_mean,g_mean_std_dev,oracle_mean,oracle_std_dev\n'
results_file.write(header)
results_file.close()

### First let's run a single-model (perceptron) classifier to stablish a baseline.

In [12]:
results_file = open('./results.csv', 'a')

model_percep = Perceptron(random_state=0)
for name in ds_names:
    acc_folds = []
    f1_folds = []
    g_mean_folds = []
    roc_auc_folds = []
    for fold in range(5):
        # Gather training data:
        ds_train = datasets[name]['train'][fold]
        target_att = ds_train.columns.tolist()[-1]
        X_train = ds_train.drop(labels=target_att, axis = 1)
        y_train = ds_train[target_att]

        # Gather test data:
        ds_test = datasets[name]['test'][fold]
        X_test = ds_test.drop(labels=target_att, axis = 1)
        y_test = ds_test[target_att]

        # Train model with the training data, we need y_score for calculating ROC-AUC
        y_score = model_percep.fit(X_train, y_train).decision_function(X_test)
        
        # Test model:
        y_pred = model_percep.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        g_mean = geometric_mean_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        
        # Store metrics for this fold
        acc_folds.append(acc)
        f1_folds.append(f1)
        g_mean_folds.append(g_mean)
        roc_auc_folds.append(roc_auc)
        
        #cnf_matrix = confusion_matrix(y_test, y_pred)
        #print('Dataset %s, Fold %d, Accuracy: %.2f, F1-score: %.2f, ROC-AUC %.2f, G-Mean %.2f' % (name, fold, acc, f1, roc_auc, g_mean))
        #print(cnf_matrix)
    
    # Calculate means and std devs for each metric 
    acc_mean = str(np.average(acc_folds))
    acc_std_dev = str(np.std(acc_folds))
    f1_mean = str(np.average(f1_folds))
    f1_std_dev = str(np.std(f1_folds))
    roc_auc_mean = str(np.average(roc_auc_folds))
    roc_auc_std_dev = str(np.std(roc_auc_folds))
    g_mean_mean = str(np.average(g_mean_folds))
    g_mean_std_dev = str(np.std(g_mean_folds))
    results_file.write(name+',Perceptron,1,'+acc_mean+','+acc_std_dev+','+f1_mean+','+f1_std_dev+','
                       +roc_auc_mean+','+roc_auc_std_dev+','+g_mean_mean+','+g_mean_std_dev+', ,\n')

results_file.close()

### Creating a generic function to run the ensembles with 5-fold cross val and save results:

In [13]:
def run_model(meta_model_name, meta_model, base_model, results_file):
    for n in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
        print('Running', meta_model_name, 'with %d base estimators'%n)
        meta_model.n_estimators=n
        for name in ds_names:
            acc_folds = []
            f1_folds = []
            g_mean_folds = []
            roc_auc_folds = []
            oracle_scores = []
            for fold in range(5):
                ## Gather training data:
                ds_train = datasets[name]['train'][fold]
                target_att = ds_train.columns.tolist()[-1]
                X_train = ds_train.drop(labels=target_att, axis = 1)
                y_train = ds_train[target_att]

                ## Gather test data:
                ds_test = datasets[name]['test'][fold]
                X_test = ds_test.drop(labels=target_att, axis = 1)
                y_test = ds_test[target_att]

                ## Train model with the training data:
                meta_model.fit(X_train,y_train)

                ## Save the model object to a file for later use
                #filename = 'saved_models/'+meta_model_name+'_n'+str(n)+'_dataset_'+name+'_fold_'+str(fold)+'.sav'
                #pickle.dump(meta_model, open(filename, 'wb'))
                ## Later load model with: loaded_model = pickle.load(open(filename, 'rb'))

                ## Test model:
                y_pred = meta_model.predict(X_test)
                
                ## Calculate metrics:
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, pos_label=1)
                g_mean = geometric_mean_score(y_test, y_pred)

                ## Store metrics for this fold:
                acc_folds.append(acc)
                f1_folds.append(f1)
                g_mean_folds.append(g_mean)
                #cnf_matrix = confusion_matrix(y_test, y_pred)
                #print(cnf_matrix)

                oracle = Oracle(meta_model).fit(X_train, y_train)
                oracle_score = oracle.score(X_test, y_test)
                oracle_scores.append(oracle_score)

            ## Calculate means and std devs for each metric 
            acc_mean = str(np.average(acc_folds))
            acc_std_dev = str(np.std(acc_folds))
            f1_mean = str(np.average(f1_folds))
            f1_std_dev = str(np.std(f1_folds))
            g_mean_mean = str(np.average(g_mean_folds))
            g_mean_std_dev = str(np.std(g_mean_folds))
            oracle_mean = str(np.average(oracle_scores))
            oracle_std_dev = str(np.std(oracle_scores))
            results_file.write(name+','+meta_model_name+','+str(n)+','+acc_mean+','+acc_std_dev+','+f1_mean+','
                               +f1_std_dev+','+g_mean_mean+','+g_mean_std_dev+','+oracle_mean+','+oracle_std_dev+'\n')


### Bagging

In [14]:
results_file = open('./results.csv', 'a')

#base_model = CalibratedClassifierCV(Perceptron(random_state=0))
base_model = Perceptron(random_state=0)
meta_model = BaggingClassifier(base_estimator=base_model, random_state=0, bootstrap=True,
                                bootstrap_features=False, max_features=1.0, n_jobs=-1)            
run_model('Bagging',meta_model, base_model, results_file)           
results_file.close()

Running Bagging with 10 base estimators
Running Bagging with 20 base estimators
Running Bagging with 30 base estimators
Running Bagging with 40 base estimators
Running Bagging with 50 base estimators
Running Bagging with 60 base estimators
Running Bagging with 70 base estimators
Running Bagging with 80 base estimators
Running Bagging with 90 base estimators
Running Bagging with 100 base estimators


### AdaBoost

In [15]:
results_file = open('./results.csv', 'a')

#base_model = CalibratedClassifierCV(Perceptron(random_state=0))
base_model = Perceptron(random_state=0)
meta_model = AdaBoostClassifier(base_estimator=base_model,
                                algorithm='SAMME', random_state=0)           
run_model('AdaBoost',meta_model, base_model, results_file)           
results_file.close()

Running AdaBoost with 10 base estimators
Running AdaBoost with 20 base estimators
Running AdaBoost with 30 base estimators
Running AdaBoost with 40 base estimators
Running AdaBoost with 50 base estimators
Running AdaBoost with 60 base estimators
Running AdaBoost with 70 base estimators
Running AdaBoost with 80 base estimators
Running AdaBoost with 90 base estimators
Running AdaBoost with 100 base estimators


### Random Linear Oracles

In [16]:
def run_model_rlo(meta_model_name, meta_model, base_model, results_file):
    for n in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
        print('Running', meta_model_name, 'with %d base estimators'%n)
        meta_model.n_estimators=n
        #for name in ['vowel0']:
        for name in ds_names:
            acc_folds = []
            f1_folds = []
            g_mean_folds = []
            roc_auc_folds = []
            oracle_scores = []
            for fold in range(5):
                ## Gather training data:
                ds_train = datasets[name]['train'][fold]
                target_att = ds_train.columns.tolist()[-1]
                X_train = ds_train.drop(labels=target_att, axis = 1).to_numpy()
                y_train = ds_train[target_att].to_numpy()

                ## Gather test data:
                ds_test = datasets[name]['test'][fold]
                X_test = ds_test.drop(labels=target_att, axis = 1).to_numpy()
                y_test = ds_test[target_att].to_numpy()
                while True:
                    try:
                        ## Train model with the training data:
                        meta_model.fit(X_train,y_train)
                        ## Test model:
                        #y_pred = meta_model.predict(X_test)
                        predictions, pred_ens, erro = meta_model.predict(X_test, y_test)
                        break
                    except:
                        continue
                    break
                
                ## Save the model object to a file for later use
                #filename = 'saved_models/'+meta_model_name+'_n'+str(n)+'_dataset_'+name+'_fold_'+str(fold)+'.sav'
                #pickle.dump(meta_model, open(filename, 'wb'))
                ## Later load model with: loaded_model = pickle.load(open(filename, 'rb'))
                y_pred = pred_ens.T.ravel()
                ## Calculate metrics:
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, pos_label=1)
                g_mean = geometric_mean_score(y_test, y_pred)

                ## Store metrics for this fold:
                acc_folds.append(acc)
                f1_folds.append(f1)
                g_mean_folds.append(g_mean)
                #cnf_matrix = confusion_matrix(y_test, y_pred)
                #print(cnf_matrix)

                oracle_score = meta_model.Oracle_score(X_test, y_test)
                oracle_scores.append(oracle_score)

            ## Calculate means and std devs for each metric 
            acc_mean = str(np.average(acc_folds))
            acc_std_dev = str(np.std(acc_folds))
            f1_mean = str(np.average(f1_folds))
            f1_std_dev = str(np.std(f1_folds))
            g_mean_mean = str(np.average(g_mean_folds))
            g_mean_std_dev = str(np.std(g_mean_folds))
            oracle_mean = str(np.average(oracle_scores))
            oracle_std_dev = str(np.std(oracle_scores))
            results_file.write(name+','+meta_model_name+','+str(n)+','+acc_mean+','+acc_std_dev+','+f1_mean+','
                               +f1_std_dev+','+g_mean_mean+','+g_mean_std_dev+','+oracle_mean+','+oracle_std_dev+'\n')


In [17]:
results_file = open('./results.csv', 'a')

#base_model = CalibratedClassifierCV(Perceptron(random_state=0))
base_model = Perceptron(random_state=0)
meta_model = RLO(base_estimator=base_model)

run_model_rlo('RLO',meta_model, base_model, results_file)           
results_file.close()

Built RLO
Running RLO with 10 base estimators
Running RLO with 20 base estimators
Running RLO with 30 base estimators
Running RLO with 40 base estimators
Running RLO with 50 base estimators
Running RLO with 60 base estimators
Running RLO with 70 base estimators
Running RLO with 80 base estimators
Running RLO with 90 base estimators
Running RLO with 100 base estimators


### Random Subspaces (Fix me!)

In [None]:
results_file = open('./results.csv', 'a')

#base_model = CalibratedClassifierCV(Perceptron(random_state=0))
base_model = Perceptron(random_state=0)
meta_model = BaggingClassifier(base_estimator=base_model, n_jobs=-1,
                                random_state=0, bootstrap=False,
                                bootstrap_features=False, max_features=0.5) 
#These three last parameters in the above class constructor call make it behave as the Random Subspace algoritm.

run_model('RandomSubspaces',meta_model, base_model, results_file)           
results_file.close()

In [50]:
#Test Random Subspaces

base_model = Perceptron(random_state=0)
meta_model = BaggingClassifier(base_estimator=base_model, n_jobs=-1,
                                random_state=0, bootstrap=False,
                                bootstrap_features=False, max_features=0.5) 
#These three last parameters in the above class constructor call make it behave as the Random Subspace algoritm.

ds_train = datasets['ecoli2']['train'][0]  #syntax: datasets[dataset name][train or test][fold number]
target_att = ds_train.columns.tolist()[-1]
X_train = ds_train.drop(labels=target_att, axis = 1)
y_train = ds_train[target_att]
ds_test = datasets['ecoli2]['test'][0]
X_test = ds_test.drop(labels=target_att, axis = 1)
y_test = ds_test[target_att]

meta_model.fit(X_train, y_train)
y_pred = meta_model.predict(X_test)

print('ensemble acc = ', accuracy_score(y_test, y_pred))
print('ensemble f1_score = ', f1_score(y_test, y_pred, pos_label=1))
print('ensemble g_mean = ', geometric_mean_score(y_test, y_pred))


base_models = meta_model.estimators_
base_models_feats = meta_model.estimators_features_

oracle_scores = []
base_models_preds = []
for i in range(len(base_models)):
    X_test_subspace = X_test.iloc[:,base_models_feats[i]] #selecting only the columns used for the ith base model.
    y_pred = base_models[i].predict(X_test_subspace)
    base_models_preds.append(y_pred)

oracle_hits = []
for i in range(len(y_test)):
    oracle_hit = 0
    for j in range(len(base_models_preds)):
        if base_models_preds[j][i] == y_test[i]:
            oracle_hit = 1
            break
    oracle_hits.append(oracle_hit)

oracle_score = np.sum(oracle_hits)/len(oracle_hits)
print('Oracle score = ', oracle_score)

ensemble acc =  0.8970588235294118
ensemble f1_score =  0.631578947368421
ensemble g_mean =  0.7254762501100117
Oracle score =  0.9852941176470589
