# Pipelines for classifiers using AUC

For each combination of dataset, classifier and number of folds:
- Robust scaling
- 2, 3, 5, 10-fold outer CV
- AUC as score

We will use folders *datasets2* and *results_AUC*.

In [None]:
# IPython magics: (re)load the autoreload extension, switch it to mode 2,
# and render matplotlib figures inline in the notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# silence DeprecationWarning messages so they do not clutter the cell outputs
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif
import os

In [None]:
# shell escape: list the files currently present in ./datasets2 (input datasets)
!ls ./datasets2/*

In [None]:
# shell escape: list the files currently present in ./results_AUC (output folder)
!ls ./results_AUC/*

In [None]:
# Get the dataset files in datasets2. Only names of the form ds.<output>.csv are
# kept (the pattern the calculation loop rebuilds the path from), so stray entries
# such as hidden or checkpoint files are not treated as datasets.
# os.listdir returns entries in arbitrary order; sort for reproducible runs.
dsList = sorted(
    fileName for fileName in os.listdir('./datasets2')
    if fileName.startswith('ds.') and fileName.endswith('.csv')
)
print('--> Found', len(dsList), 'dataset files')

In [None]:
# Create a list with all output variable names: strip the 'ds.' prefix and the
# '.csv' suffix from each dataset file name. Names that do not follow the
# ds.<output>.csv pattern are skipped instead of being sliced blindly.
outVars = [
    fileName[3:-4] for fileName in dsList
    if fileName.startswith('ds.') and fileName.endswith('.csv')
]

### Define script parameters

In [None]:
# random seed passed to the CV splitter and the classifiers
seed = 42

# suffix inserted in the per-dataset output file names
targetName = '_AUC'

# numbers of outer-CV folds to evaluate
foldTypes = [2, 3, 5, 10]

### Function definitions

In [None]:
def set_weights(y_data, option='balanced'):
    """Estimate class weights for an unbalanced dataset.

    Args:
        y_data: array-like with one class label per sample.
        option: forwarded to scikit-learn as ``class_weight``.
            If 'balanced', weights are n_samples / (n_classes * np.bincount(y)).
            If a dictionary is given, keys are classes and values are the
            corresponding class weights.
            If None, the class weights are uniform.

    Returns:
        dict mapping each unique label found in ``y_data`` to its weight.
    """
    classes = np.unique(y_data)
    # Keyword arguments are required: scikit-learn >= 1.0 no longer accepts the
    # positional form, while older releases accept these same keyword names.
    weights = class_weight.compute_class_weight(class_weight=option, classes=classes, y=y_data)
    return dict(zip(classes, weights))

In [None]:
def getDataFromDataset(sFile, OutVar):
    """Load a CSV dataset and split it into features and output column.

    Args:
        sFile: path of the CSV file to read.
        OutVar: name of the column holding the output (class) values.

    Returns:
        Tuple (X values array, Y values array, list of feature column names),
        where the features are every column except ``OutVar``.
    """
    print('\n-> Read dataset', sFile)
    frame = pd.read_csv(sFile)
    print('Shape', frame.shape)

    # everything except the output column is a feature
    feature_frame = frame.drop(OutVar, axis=1)
    x_values = feature_frame.values
    y_values = frame[OutVar].values

    print('Shape X data:', x_values.shape)
    print('Shape Y data:', y_values.shape)

    return (x_values, y_values, list(feature_frame.columns))

In [None]:
def Pipeline_OuterCV(Xdata, Ydata, label = 'my', class_weights = None, folds = 3, seed = 42):
    """Score several scaler+classifier pipelines with a stratified outer k-fold CV using ROC AUC.

    Every classifier is placed after a RobustScaler in a Pipeline and evaluated with
    cross_val_score(scoring='roc_auc'). The per-fold scores are written to
    ./results_AUC/<label><targetName>_Outer-<folds>-foldCV.csv, where ``targetName``
    is the module-level label defined with the script parameters.

    Args:
        Xdata: feature values.
        Ydata: class labels.
        label: text identifying the data, used in the output file name.
        class_weights: dict {class: weight} passed as ``class_weight`` to the
            classifiers that accept it (both SVCs, LR, DT, RF). None (default)
            leaves them unweighted, i.e. the same as 1:1 weights. KNN and XGB
            receive no class weights.
        folds: number of outer CV splits (default 3).
        seed: random seed for the CV splitter and the classifiers.

    Returns:
        DataFrame with one column per classifier and one row per fold (AUC scores).
    """
    # define classifiers; class_weights was previously accepted but never used,
    # so the weights computed by the caller had no effect on any model
    names = ['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']
    classifiers = [KNeighborsClassifier(3),
                   SVC(kernel="linear",random_state=seed,gamma='scale',class_weight=class_weights),
                   SVC(kernel = 'rbf', random_state=seed,gamma='auto',class_weight=class_weights),
                   LogisticRegression(solver='lbfgs',random_state=seed,class_weight=class_weights),
                   DecisionTreeClassifier(random_state = seed,class_weight=class_weights),
                   RandomForestClassifier(n_estimators=50,n_jobs=-1,random_state=seed,class_weight=class_weights),
                   XGBClassifier(n_jobs=-1,seed=seed)
                  ]
    # results dataframe: each column for a classifier
    df_res = pd.DataFrame(columns=names)

    # build each classifier
    print('* Building scaling+feature selection+outer '+str(folds)+'-fold CV for '+str(len(names))+' classifiers:', str(names))
    total = time.time()

    # the same stratified, shuffled splitter is used for all the classifiers
    outer_cv = StratifiedKFold(n_splits=folds,shuffle=True,random_state=seed)

    for name, clf in zip(names, classifiers):
        start = time.time()

        # pipeline: scaler + classifier
        model = Pipeline([('Scaler', RobustScaler()),
                          ('Classifier', clf)])

        # evaluate pipeline: one AUC value per outer fold
        scores = cross_val_score(model, Xdata, Ydata, cv=outer_cv, scoring='roc_auc', n_jobs=-1)
        df_res[name] = scores
        print('%s, MeanScore=%0.2f, Time:%0.1f mins' % (name, scores.mean(), (time.time() - start)/60))

    # save results for each ML and dataset; create the folder if it is missing so
    # the scores are not lost after the CV has already been computed
    os.makedirs('./results_AUC', exist_ok=True)
    resFile = './results_AUC/'+str(label)+str(targetName)+'_Outer-'+str(folds)+'-foldCV.csv'
    df_res.to_csv(resFile, index=False)
    print('* Scores saved', resFile)
    print('Total time:', (time.time() - total)/60, ' mins')

    # return AUC scores for all classifiers as dataframe (each column a classifier)
    return df_res

### Calculations

In [None]:
# Run the outer CV for every dataset and every number of folds.
# The per-run score frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop re-copies the accumulated frame each time.
fold_frames = []

for OutVar in outVars:
    sFile = './datasets2/ds.'+str(OutVar)+'.csv'

    # get data from file
    Xdata, Ydata, Features = getDataFromDataset(sFile,OutVar)

    # Calculate class weights
    class_weights = set_weights(Ydata)
    print("Class weights = ", class_weights)

    # try different folds for each subset -> box plots
    for folds in foldTypes:

        # calculate outer CV for different binary classifiers
        df_fold = Pipeline_OuterCV(Xdata, Ydata, label = OutVar, class_weights = class_weights, folds = folds, seed = seed)
        df_fold['Dataset'] = OutVar
        df_fold['folds'] = folds

        # keep each result for the summary dataframe
        fold_frames.append(df_fold)

# all results; stays None when no dataset was processed, as before
df_results = pd.concat(fold_frames) if fold_frames else None

In [None]:
# write every per-fold score (all datasets, all fold settings) to a single CSV
resFile = os.path.join('./results_AUC/', 'ML_Outer-n-foldCV.csv')
df_results.to_csv(resFile, index=False)

### Mean scores

In [None]:
# mean AUC of each classifier per (Dataset, folds) pair
df_means = (
    df_results
    .groupby(['Dataset','folds'], as_index = False)
    .mean()
    [['Dataset', 'folds','KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']]
)

In [None]:
# write the mean scores per dataset and fold setting
resFile_means = os.path.join('./results_AUC/', 'ML_Outer-n-foldCV_means.csv')
df_means.to_csv(resFile_means, index=False)

### Best ML results

In [None]:
# for every classifier, the index label of the row holding its highest mean score
ml_scores = df_means[['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']]
bestMLs = ml_scores.idxmax()
print(bestMLs)

In [None]:
# get the best score by ML method
# idxmax returns an index *label*, so the row is looked up with .loc; the
# positional .iloc only gave the same row while the index was 0..n-1
for ML in ['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']:
    best_row = df_means.loc[df_means[ML].idxmax()]
    print(ML, '\t', list(best_row[['Dataset', 'folds', ML]]))

In [None]:
# New column with the original output name: the first 2 characters of each Dataset value
getOutOrig = [dataset_name[:2] for dataset_name in df_means['Dataset']]
df_means['Output'] = getOutOrig
df_means

In [None]:
# write the mean scores again, using a separate file name
resFile_means2 = os.path.join('./results_AUC/', 'ML_Outer-n-foldCV_means2.csv')
df_means.to_csv(resFile_means2, index=False)

### Get the best ML for each type of output

In [None]:
# For each output, report the best (Dataset, folds, score) of every classifier.
# Outputs are visited in sorted order: iterating a plain set gives an order that
# can change between runs, which made the printed report non-reproducible.
for outName in sorted(set(df_means['Output'])):
    print('*********************')
    print('OUTPUT =', outName)
    df_sel = df_means[df_means['Output'] == outName].copy()
    for ML in ['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']:
        print(ML, '\t', list(df_sel.loc[df_sel[ML].idxmax(),:][['Dataset', 'folds', ML]]))

In [None]:
# show the best row for the df_sel / ML values left over from the loop that ran before this cell
df_sel.loc[df_sel[ML].idxmax()]

### Get the best ML for each type of output for 10-fold CV

In [None]:
# keep only the rows obtained with 10 folds
is_10fold = df_means['folds'] == 10
df_10fold = df_means[is_10fold].copy()
df_10fold.head()

In [None]:
# 10-fold results: per output, the overall best score and the best row of every classifier.
# Outputs are visited in sorted order so the report is identical between runs
# (iteration order of a plain set is not guaranteed).
for outName in sorted(set(df_10fold['Output'])):
    print('*********************')
    print('OUTPUT =', outName)

    df_sel = df_10fold[df_10fold['Output'] == outName].copy()
    print('MAX =',df_sel[['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']].max().max())

    for ML in ['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']:
        print(ML, '\t', list(df_sel.loc[df_sel[ML].idxmax(),:][['Dataset', 'folds', ML]]))

### 5-fold CV

In [None]:
# keep only the rows obtained with 5 folds
is_5fold = df_means['folds'] == 5
df_5fold = df_means[is_5fold].copy()
df_5fold.head()

In [None]:
# 5-fold results: per output, the overall best score and the best row of every classifier.
# Outputs are visited in sorted order so the report is identical between runs
# (iteration order of a plain set is not guaranteed).
for outName in sorted(set(df_5fold['Output'])):
    print('*********************')
    print('OUTPUT =', outName)

    df_sel = df_5fold[df_5fold['Output'] == outName].copy()
    print('MAX =',df_sel[['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']].max().max())

    for ML in ['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']:
        print(ML, '\t', list(df_sel.loc[df_sel[ML].idxmax(),:][['Dataset', 'folds', ML]]))

In [None]:
# Best score per output for the 5-fold and the 10-fold results.
# One loop serves both tables (the two original loops were copies of each other),
# and outputs are sorted so both listings come out in the same, reproducible order.
for cvTitle, df_cv in (('5-fold CV', df_5fold), ('10-fold CV', df_10fold)):
    print(cvTitle)
    for outName in sorted(set(df_cv['Output'])):
        df_sel = df_cv[df_cv['Output'] == outName].copy()
        print(outName,df_sel[['KNN', 'SVM linear', 'SVM', 'LR', 'DT', 'RF', 'XGB']].max().max())

**Conclusion**: using AUC as the score, we are able to obtain classification models with AUC > 0.70 for the majority of outputs, and for some of them even AUC > 0.85.