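In this notebook we evaluate each of the following classification models on eight datasets, reporting the micro-averaged F1 score on a held-out test split (note: the `CLASSIFIERS` configuration below currently defines only models 1, 3, 5, 6 and 7):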

1. Logistic regression (for classification)
2. Support vector classification
3. Decision tree classification
4. Random forest classification
5. k-nearest neighbours classification
6. AdaBoost classification
7. Gaussian naive Bayes classification
8. Neural network classification

**Run the code below** to import required packages.


In [2]:
import numpy as np
import os
import sklearn as sk
from sklearn import linear_model, tree, svm, ensemble, neighbors, naive_bayes, neural_network
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from scipy.io.arff import loadarff
import pandas as pd
import scipy


### Data and Classifier Parameters  

In [3]:
#Relative path root for datasets (the trailing slash is kept so the value can also be used as a plain prefix)
file_loc = os.path.join(os.getcwd(),'Datasets/')

# Dataset files and loader parameters, based on each dataset's description.
#   'file'        : file name(s) under file_loc; several files are concatenated row-wise by load_dataset
#   'load_params' : keyword arguments forwarded to the pandas reader for .xls and plain-text files
#                   (load_dataset does not forward them for .arff files)
#   'weighted'    : NOTE(review): not read by any code visible in this notebook
dataset_details = {
    'credit':
        {
            'file': ['default of credit card clients.xls'],
            'load_params': {
                'index_col': 0,
                'skiprows': 1
            }
        },
    'breast_cancer':
        {
            'file': ['breast-cancer-wisconsin.data'],
            'load_params': {
                'na_values': '?',
                'index_col': 0
            },
            'weighted': 'Y'
        },
    'statlog':
        {
            'file': ['german.data-numeric'],
            'load_params': {
                # Whitespace-delimited file. `sep=r'\s+'` is the documented equivalent of the
                # deprecated `delim_whitespace=True` (which was previously given as the string 'true').
                'sep': r'\s+'
            },
            'weighted': 'Y',
        },
    'adult':
        {
            # Pre-split train/test files; load_dataset concatenates them before re-splitting.
            'file': ['adult.data','adult.test'],
            'load_params': {
                'na_values': '?',
                'comment': '|'
            }
        },
    'yeast': {
        'file': ['yeast.data'],
        'load_params': {
            'index_col': 0,
            # Whitespace-delimited file (see the note on 'statlog').
            'sep': r'\s+'
        }
    },
    'thoracic': {
        'file': ['ThoraricSurgery.arff'],
        'load_params': {}
    },
    'seismic': {
        'file': ['seismic-bumps.arff'],
        'load_params': {}
    },
    'retinopathy': {
        'file': ['messidor_features.arff'],
        'load_params': {
            # NOTE(review): has no effect at present, because .arff files are loaded
            # without load_params.
            'comment': '@'
        }
    }
}

# Registry of the classifiers under evaluation. Each entry holds:
#   'clf'        : the estimator class (instantiated by train_clf)
#   'param_grid' : hyperparameter values explored by the grid search
#   'params'     : fixed keyword arguments passed to the estimator constructor
# Insertion order matters: find_all_hyperparams skips the first entry.
CLASSIFIERS = {
    'logreg': dict(
        clf=linear_model.LogisticRegression,
        param_grid={'C': [100, 10, 1.0, 0.1, 0.01]},
        params={'random_state': 0},
    ),
    'tree': dict(
        clf=tree.DecisionTreeClassifier,
        param_grid={
            'min_samples_split': [5, 10, 100, 500],
            'max_depth': [1, 5, 10, 50, 100],
        },
        params={'random_state': 0},
    ),
    'kneighbors': dict(
        clf=neighbors.KNeighborsClassifier,
        param_grid={
            'leaf_size': [1, 5, 10, 20, 50],
            'n_neighbors': [1, 5, 10, 20, 30],
            'p': [1, 2],
        },
        params={},
    ),
    'adaboost': dict(
        clf=ensemble.AdaBoostClassifier,
        param_grid={
            'n_estimators': [10, 50, 100, 500],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
        },
        params={'random_state': 0},
    ),
    # Gaussian naive Bayes is run with defaults only: nothing to search, nothing to fix.
    'nb': dict(
        clf=naive_bayes.GaussianNB,
        param_grid={},
        params={},
    ),
}

### Preprocessing the data

In [16]:
def preprocessor(dataset_details, file_loc):

    '''
    Loads and prepares every dataset described in dataset_details.

    For each dataset: load it as a dataframe, split it into train and test
    portions (train_test_split with random_state=0 and default proportions),
    fit the feature/label encoders on the training portion only, apply them to
    both portions, densify sparse results, and finally mean-impute any
    remaining NaN values using statistics from the training portion.

    Parameters:
        dataset_details: mapping of dataset name -> metadata accepted by load_dataset
        file_loc: location of the dataset files, passed through to load_dataset

    Returns:
        (train_data, test_data): two dicts keyed by dataset name, holding
        {'X_train', 'y_train'} and {'X_test', 'y_test'} respectively.
    '''

    train_data, test_data = {}, {}

    for name, details in dataset_details.items():
        features, labels = load_dataset(details, file_loc)
        X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0)
        # NOTE(review): an earlier workaround filled NaN with 10 in the 'breast_cancer'
        # split (10.0 values were reportedly turning into NaN); it is left disabled here.

        # Encode categorical values and scale numeric values (fitted on train only)
        feature_enc, label_enc = create_encoders(X_train, y_train)
        X_train, X_test = feature_enc.transform(X_train), feature_enc.transform(X_test)

        # The encoder may yield sparse matrices; downstream code works on dense arrays
        if scipy.sparse.issparse(X_train):
            X_train, X_test = X_train.toarray(), X_test.toarray()

        # Labels are only encoded when create_encoders supplied a label encoder
        if label_enc:
            y_train, y_test = label_enc.transform(y_train), label_enc.transform(y_test)

        # Impute missing values with the per-column training mean
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
        X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

        train_data[name] = {'X_train': X_train, 'y_train': y_train}
        test_data[name] = {'X_test': X_test, 'y_test': y_test}

    return train_data, test_data

def create_encoders(X, y):

    '''
    Builds and fits the encoders for a dataset's features and labels.

    Features are split by dtype into categorical (object) and numeric columns.
    Categorical columns are one-hot encoded (unknown categories are ignored at
    transform time) and numeric columns are standardised. When only one kind of
    column is present the corresponding encoder is fitted on the whole frame;
    otherwise both are combined in a ColumnTransformer.

    Parameters:
        X: dataframe of features
        y: labels (anything exposing a .dtype, e.g. a pandas Series)

    Returns:
        (X_enc, y_enc): the fitted feature transformer, and a fitted
        LabelEncoder when the labels have object dtype, else None.
    '''

    cat_enc = OneHotEncoder(handle_unknown='ignore')
    num_enc = StandardScaler()

    cat_features = X.select_dtypes(include=['object']).columns
    # Select every numeric dtype, not only int64/float64: in the mixed case the
    # ColumnTransformer drops unlisted columns, so e.g. int32 or float32
    # features would otherwise silently disappear.
    num_features = X.select_dtypes(include=['number']).columns

    if len(cat_features) == 0:
        X_enc = num_enc
    elif len(num_features) == 0:
        X_enc = cat_enc
    else:
        X_enc = ColumnTransformer(
            transformers=[
                ("num", num_enc, num_features),
                ("cat", cat_enc, cat_features)
            ]
        )

    X_enc.fit(X)

    # Only non-numeric (object) labels need to be mapped to integer classes
    y_enc = None
    if y.dtype == object:
        y_enc = LabelEncoder().fit(y)

    return X_enc, y_enc
    
def load_dataset(dataset, file_loc):
    '''
    Loads in a dataset according to file type and load_params.

    The loader is chosen from the file extension: .xls -> load_excel,
    .arff -> load_arff, anything else -> load_plaintext. load_params are
    forwarded to the Excel and plain-text loaders only.
    If test and train are pre-split across several files, they are combined
    (row-wise) for preprocessing.

    Parameters:
        dataset: metadata dict with keys 'file' (list of file names) and
                 'load_params' (keyword arguments for the pandas reader)
        file_loc: directory containing the dataset files

    Returns:
        (X, y): all columns but the last as the features, and the last column
        as the labels.

    Raises:
        ValueError: if the dataset lists no files.
    '''

    filenames = dataset['file']
    load_params = dataset['load_params']

    if not filenames:
        raise ValueError("dataset['file'] must list at least one file")

    dfs = []
    for filename in filenames:
        # Take the text after the LAST dot so names containing several dots
        # (or none at all) are classified correctly.
        extension = os.path.splitext(filename)[1].lstrip('.').lower()
        path = os.path.join(file_loc, filename)
        if extension == 'xls':
            df = load_excel(path, **load_params)
        elif extension == 'arff':
            df = load_arff(path)
        else:
            df = load_plaintext(path, **load_params)
        dfs.append(df)

    # Concatenate once, after every file has been read
    df = pd.concat(dfs)

    y = df.iloc[:, -1]
    X = df.iloc[:, :-1]

    return X, y

def load_excel(file,  **kwargs):
    '''
    Reads an Excel file into a dataframe using the xlrd engine.
    Extra keyword arguments are passed straight to pandas.read_excel.
    '''
    return pd.read_excel(file, dtype=None, engine='xlrd', **kwargs)

def load_arff(file):
    '''
    Reads an ARFF file into a dataframe.
    Only the data records returned by loadarff are used; the accompanying
    metadata is discarded.
    '''
    records, _metadata = loadarff(file)
    return pd.DataFrame(records)

def load_plaintext(file, **kwargs):
    '''
    Reads a headerless delimited text file into a dataframe.
    Columns are labelled by position (header=None); extra keyword arguments
    are passed straight to pandas.read_csv.
    '''
    return pd.read_csv(file, header=None, dtype=None, **kwargs)

# Load, split, encode and impute every configured dataset; the results are dicts keyed by dataset name.
train_data, test_data = preprocessor(dataset_details, file_loc)

### Train Data

In [5]:
def train_classifiers(data, CLASSIFIERS):

    '''
    Trains every classifier on every dataset.

    Parameters:
        data: dict keyed by dataset name, each value holding 'X_train' and 'y_train'
        CLASSIFIERS: dict keyed by classifier name, each value accepted by train_clf

    Returns:
        Nested dict models[classifier][dataset] with keys 'model', 'X_enc',
        'y_enc' and 'X_imp'. The three transformer entries are None when
        train_clf returns only the fitted model, as it does for data that has
        already been encoded and imputed by preprocessor.
    '''

    models = {}
    for clf in CLASSIFIERS:
        models[clf] = {}
        for dataset in data:
            print("Training ",clf," on ",dataset)
            result = train_clf(CLASSIFIERS[clf], data[dataset]['X_train'], data[dataset]['y_train'])
            # train_clf returns just the fitted model; unconditionally unpacking
            # four values from it raised a TypeError. A 4-tuple
            # (model, X_enc, y_enc, X_imp) is still accepted.
            if isinstance(result, tuple):
                model, X_enc, y_enc, X_imp = result
            else:
                model, X_enc, y_enc, X_imp = result, None, None, None
            models[clf][dataset] = {
                'model': model,
                'X_enc': X_enc,
                'y_enc': y_enc,
                'X_imp': X_imp
            }

    return models

def train_clf(clf_data, X, y):

    '''
    Instantiates one classifier with its fixed parameters and fits it.

    Parameters:
        clf_data: dict with 'clf' (the estimator class) and 'params'
                  (keyword arguments for its constructor)
        X, y: training features and labels

    Returns:
        Whatever the estimator's fit(X, y) call returns.
    '''
    estimator_cls = clf_data['clf']
    fixed_params = clf_data['params']
    estimator = estimator_cls(**fixed_params)

    return estimator.fit(X, y)


# Fit every configured classifier on every preprocessed training set; later cells read `models`.
models = train_classifiers(train_data, CLASSIFIERS)


Training  tree  on  credit
Training  tree  on  breast_cancer
Training  tree  on  statlog
Training  tree  on  adult
Training  tree  on  yeast
Training  tree  on  thoracic
Training  tree  on  seismic
Training  tree  on  retinopathy
Training  kneighbors  on  credit
Training  kneighbors  on  breast_cancer
Training  kneighbors  on  statlog
Training  kneighbors  on  adult
Training  kneighbors  on  yeast
Training  kneighbors  on  thoracic
Training  kneighbors  on  seismic
Training  kneighbors  on  retinopathy
Training  adaboost  on  credit
Training  adaboost  on  breast_cancer
Training  adaboost  on  statlog
Training  adaboost  on  adult
Training  adaboost  on  yeast
Training  adaboost  on  thoracic
Training  adaboost  on  seismic
Training  adaboost  on  retinopathy
Training  nb  on  credit
Training  nb  on  breast_cancer
Training  nb  on  statlog
Training  nb  on  adult
Training  nb  on  yeast
Training  nb  on  thoracic
Training  nb  on  seismic
Training  nb  on  retinopathy


### Choose Hyperparameters

In [1]:
def find_hyperparams(model, classifier_details, X, y):
    '''
    Grid-searches one estimator's hyperparameters with 5-fold cross-validation.

    Parameters:
        model: the estimator to tune
        classifier_details: dict whose 'param_grid' entry lists the values to try
        X, y: data the search is fitted on

    Returns:
        The best parameter combination found (also printed).
    '''
    folds = KFold(n_splits=5)
    grid = GridSearchCV(model, classifier_details['param_grid'], cv=folds, n_jobs=-1)

    best_params = grid.fit(X, y).best_params_
    print (best_params)
    return best_params

def find_all_hyperparams(data, models, classifiers):
    '''
    Runs find_hyperparams over the trained models and stores each result in
    place under models[clf][dataset]['params'].

    The first classifier in `models` and the 'adult' dataset are skipped.
    NOTE(review): both skips look like time-saving shortcuts - confirm they are intended.
    '''
    for position, clf in enumerate(models):
        if position == 0:
            # First classifier is deliberately left out of the search
            continue
        for dataset, entry in models[clf].items():
            if dataset == 'adult':
                continue
            entry['params'] = find_hyperparams(
                entry['model'],
                classifiers[clf],
                data[dataset]['X_train'],
                data[dataset]['y_train'],
            )


# The full hyperparameter search is currently disabled; uncomment the next line to run it.
# It needs `train_data`, `models` and `CLASSIFIERS` from the earlier cells.
#find_all_hyperparams(train_data,models,CLASSIFIERS)

# Earlier single-dataset experiment, kept for reference only.
# NOTE(review): RepeatedStratifiedKFold is not imported and neither param_grid nor
# classifier_details is defined at top level in the visible notebook, so these
# lines would not run as written.
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
#search = GridSearchCV(models['tree']['adult']['model'], param_grid, cv=cv, n_jobs=-1)

#result = search.fit(train_data['adult']['X_train'], train_data['adult']['y_train'])
#classifier_details['params']=result.best_params_
#print (result.best_params_)
# Requires `models` from the training cell: running this cell in a fresh kernel
# raises NameError (as in the recorded output).
print(models)

NameError: name 'models' is not defined

### Test Models

In [88]:
def test_classifiers(data, models):
    '''
    Scores every trained model on its dataset's test split and prints the
    results as a table (one row per classifier, one column per dataset).

    Parameters:
        data: dict keyed by dataset name, each value holding 'X_test' and 'y_test'
        models: nested dict models[classifier][dataset] as built by train_classifiers

    Returns:
        None; the table is printed.
    '''
    results = {}
    dataset_names = []

    for clf in models:
        results[clf] = {}
        for dataset in models[clf]:
            results[clf][dataset] = test_clf(models[clf][dataset], data[dataset]['X_test'], data[dataset]['y_test'])
            if dataset not in dataset_names:
                dataset_names.append(dataset)

    # Column labels come from the datasets actually scored rather than from a
    # global, so they cannot drift out of step with the scores; a dataset
    # missing for some classifier shows up as NaN.
    scores = pd.DataFrame(
        [[results[clf].get(name, np.nan) for name in dataset_names] for clf in results],
        index=list(results),
        columns=dataset_names,
    )

    # Widen the printed table through the public pandas option; numpy's private
    # arrayprint line width does not affect how a DataFrame is rendered.
    with pd.option_context('display.width', 100):
        print(scores)
            

def test_clf(models, X, y):
    '''
    Scores one trained model on test data.

    Parameters:
        models: dict with the fitted estimator under 'model' and, optionally,
                fitted transformers under 'X_enc', 'X_imp' and 'y_enc'
        X, y: test features and labels

    Returns:
        The micro-averaged F1 score, rounded to two decimals.

    Each transformer is applied only when it is present and not None. The
    training code in this notebook does not produce them, and the data coming
    out of preprocessor is already encoded and imputed, so indexing
    models['X_enc'] unconditionally failed.
    NOTE(review): if an entry does carry transformers, X and y are assumed to
    be raw (not yet encoded) - confirm against the caller.
    '''
    model = models['model']

    X_enc = models.get('X_enc')
    if X_enc is not None:
        X = X_enc.transform(X)

    X_imp = models.get('X_imp')
    if X_imp is not None:
        X = X_imp.transform(X)

    if scipy.sparse.issparse(X):
        X = X.toarray()

    y_enc = models.get('y_enc')
    if y_enc is not None:
        y = y_enc.transform(y)

    y_pred = model.predict(X)
    f1 = f1_score(y,y_pred,average='micro')

    return round(f1,2)

# Print the table of test-set scores (classifiers as rows, datasets as columns).
test_classifiers(test_data, models)

            credit  breast_cancer  statlog  adult  yeast  thoracic  seismic  \
logreg        0.81           0.97     0.76   0.57   0.56      0.81     0.95   
tree          0.73           0.94     0.70   0.46   0.50      0.81     0.88   
kneighbors    0.79           0.98     0.69   0.50   0.55      0.82     0.94   
adaboost      0.82           0.96     0.78   0.57   0.45      0.81     0.94   
nb            0.64           0.95     0.69   0.18   0.11      0.19     0.41   

            retinopathy  
logreg             0.66  
tree               0.64  
kneighbors         0.64  
adaboost           0.67  
nb                 0.63  


### Decision Gridsearch (Novelty Component)

The aim of a "decision gridsearch" is to analyze the decisions made at every step of preprocessing, training and testing, and to determine whether the choices that seemed sensible to us as students actually produce the highest scores. We cannot try every decision, or every option for a given decision, but the goal is to give some insight into which kinds of decisions deserve more thought and which have minimal impact on a given type of dataset.

In [None]:
example_dataset = 'yeast' #Chose a dataset that had particularly low scores to begin with 

# Alternatives to compare for each pipeline decision.
decisions = {
    # SimpleImputer strategies; the valid spelling is 'most_frequent' (with an
    # underscore) - 'most frequent' is rejected by SimpleImputer.
    'strategy': ['mean','most_frequent'],
    'scaler': [MinMaxScaler, StandardScaler],
    'gridsearch': [GridSearchCV, RandomizedSearchCV],
    # NOTE(review): the code consuming these values is not visible here. If they
    # are meant for a search's `scoring` argument, neither the bare f1_score
    # function nor 'score' is likely to be accepted as-is - confirm.
    'scoring': [f1_score, 'score']
}


