In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import neural_network
from sklearn.model_selection import GridSearchCV
%config InlineBackend.figure_format = 'retina'

In [134]:
# Config
# Seed used by shuffle() and passed as random_state to every classifier below
# whose training procedure is stochastic, so that re-running the notebook
# reproduces the same models.
RND_SEED        = 0

# Classifier code -> unfitted estimator prototype.
CLF_DICT        = {'svm': svm.SVC(),
                   'dt':  tree.DecisionTreeClassifier(random_state=RND_SEED),
                   'rf':  ensemble.RandomForestClassifier(random_state=RND_SEED),
                   'knn': neighbors.KNeighborsClassifier(),
                   'ann': neural_network.MLPClassifier(random_state=RND_SEED)}

# Classifier code -> hyperparameter grid explored during cross-validated tuning.
CLF_PARAM_DICT = {'svm': {'kernel': ['linear'],
                          'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]},
                  'dt':  {'criterion': ['entropy'],
                          'max_depth': [1, 2, 3, 4, 5]},
                  'rf':  {'max_features': [1, 2, 4]},
                  'knn': {'n_neighbors': [1, 2, 3, 4, 5]},
                  'ann': {'hidden_layer_sizes': [(1,), (2,), (4,), (8,), (32,), (128,)]}}

In [116]:
def split_train_test(data, ratio):
    """
    Splits dataset into training and test set of specified size.

        data:  The data to be split. Must expose `.shape[0]` and support
               slicing of the form `data[:k]` / `data[k:]`.

        ratio: Ratio of first (training) subset; must lie in [0, 1].

    Returns two datasets: training and test set. The first
    np.round(ratio * data.shape[0]) entries form the training set and the
    remaining entries form the test set; the order of entries is not changed.

    Raises ValueError if ratio is not within [0, 1].
    """

    # A ratio outside [0, 1] would otherwise be accepted silently: a negative
    # value becomes a negative slice bound and yields a wrong split.
    # NaN also fails this comparison and is rejected here.
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got {!r}'.format(ratio))

    train_num  = int(np.round(ratio*data.shape[0]))
    data_train = data[:train_num]
    data_test  = data[train_num:]
    return(data_train, data_test)

In [135]:
def shuffle(df):
    """
    Returns all rows of a dataset in random order.

    The order is drawn from a new RandomState seeded with RND_SEED
    (see config part above) on every call.

        df:  Dataset to be shuffled; must provide a pandas-style `.sample`.

    Returns shuffled dataset.
    """

    rng = np.random.RandomState(seed=RND_SEED)
    # frac=1 samples every row exactly once, i.e. a permutation of the rows
    shuffled = df.sample(frac=1, random_state=rng)
    return shuffled

In [136]:
# Breast cancer (WDBC) data: read the raw file, drop the ID column, shuffle.
# Once the ID is dropped, the first remaining column is y and the rest is X.
wdbc_X_and_y = shuffle(pd.read_csv('data/wdbc.data', header=None).iloc[:, 1:])
wdbc_X = wdbc_X_and_y.iloc[:, 1:]

# Recode y from (B, M) to (-1, 1)
wdbc_y = wdbc_X_and_y.iloc[:, 0].map({'B': -1, 'M': 1})

# 80% training / 20% test; X and y are cut at the same row position
wdbc_X_train, wdbc_X_test = split_train_test(wdbc_X, 0.8)
wdbc_y_train, wdbc_y_test = split_train_test(wdbc_y, 0.8)

In [141]:
# Load, prepare, and shuffle adult income data
income_X_and_y = pd.read_csv('data/adult.data', header=None)
income_X_and_y.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                         'marital-status', 'occupation', 'relationship',
                         'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                         'native-country', 'income']
income_X_and_y = shuffle(income_X_and_y)

# one-hot encode categorical variables
# A single get_dummies call over the categorical columns replaces growing a
# DataFrame with pd.concat inside a loop. The resulting columns are named
# '<variable>_<level>' and appear in the order of income_categorical_vars.
income_categorical_vars = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
income_X_and_y_onehot = pd.get_dummies(income_X_and_y[income_categorical_vars],
                                       columns=income_categorical_vars)

# add remaining (non-categorical) columns to one-hot encoded df;
# both frames carry the same shuffled index, so rows line up one-to-one
income_X_and_y = pd.concat([income_X_and_y_onehot,
                            income_X_and_y.drop(income_categorical_vars, axis=1)],
                           axis=1)

income_y = income_X_and_y.loc[:, 'income']
income_X = income_X_and_y.drop('income', axis=1)

# Transform y from (<=50K, >50K) to (-1, 1)
# (the keys start with a space on purpose; they are matched against the raw values as read)
income_y = income_y.map({' <=50K': -1, ' >50K': 1})

# Split to 80% training and 20% test set
income_X_train, income_X_test = split_train_test(income_X, 0.8)
income_y_train, income_y_test = split_train_test(income_y, 0.8)

In [78]:
def init_clf(clf_code):
    """
    Returns a new, unfitted classifier for a classifier code.

    The estimators stored in CLF_DICT (see config part above) serve as
    prototypes only: each call returns a clone, so that setting parameters
    on or fitting the returned object never changes the prototype or any
    object returned by another call.

        clf_code:  classifier code, i.e. a key of CLF_DICT (e.g., 'svm')

    Raises ValueError if clf_code is not a key of CLF_DICT.
    """
    # Local import keeps this function self-contained within the notebook.
    from sklearn.base import clone

    if clf_code not in CLF_DICT:
        raise ValueError('You are probably trying to use a classifier that you haven\'t implemented yet!')

    return(clone(CLF_DICT[clf_code]))

In [79]:
def get_clf_paramgrid(clf_code):
    """
    Looks up the hyperparameter grid registered for a classifier code.

        clf_code:  classifier code, i.e. a key of CLF_PARAM_DICT (e.g., 'svm')

    Returns the CLF_PARAM_DICT entry for clf_code.
    Raises ValueError if no grid is registered for clf_code.
    """
    if clf_code not in CLF_PARAM_DICT:
        raise ValueError("I don't know the param grid for your classifier yet")

    param_grid = CLF_PARAM_DICT[clf_code]
    return param_grid

In [80]:
def learn_hyperparameters(data_X_train, data_y_train, clf_code_list, cv_fold, n_jobs=4):
    """
    Returns tuned hyperparameters for a list of classifiers.
    Classifier objects and hyperparameters to be tuned are specified
      in CLF_DICT and CLF_PARAM_DICT (see config part above).
    
        data_X_train:  training data X; shape: (N, k)
        
        data_y_train:  training data y; shape: (N, )
        
        clf_code_list: list of classifier codes (e.g., ['svm', 'dt', 'rf']);
                       a single code given as a string (e.g., 'svm') is
                       treated as a one-element list, and the string 'all'
                       selects every classifier in CLF_DICT
        
        cv_fold:       fold parameter for cross-validation
                       (passed to GridSearchCV as `cv`)
        
        n_jobs:        number of parallel jobs used by the grid search
                       (passed to GridSearchCV as `n_jobs`); defaults to 4
    
    Returns a dict mapping each classifier code to the best parameter
    combination found by the grid search (GridSearchCV's `best_params_`).
    
    Raises ValueError (via init_clf / get_clf_paramgrid) for a classifier
    code that has no classifier or no parameter grid configured.
    """
    
    # Normalize the string shortcuts to a list of classifier codes
    if isinstance(clf_code_list, str):
        if clf_code_list == 'all':
            clf_code_list = list(CLF_DICT.keys())
        else:
            clf_code_list = [clf_code_list]
    
    learned_hyperparam_dict = dict()
    
    for clf_code in clf_code_list:
        gs_clf = GridSearchCV(init_clf(clf_code), get_clf_paramgrid(clf_code), cv=cv_fold, n_jobs=n_jobs)
        gs_clf.fit(data_X_train, data_y_train)
        learned_hyperparam_dict[clf_code] = gs_clf.best_params_
    
    return(learned_hyperparam_dict)

In [81]:
def compute_accuracies(data_X_train, data_y_train, data_X_test, data_y_test, clf_code_list, learned_hyperparams_dict):
    """
    Fits each classifier with its tuned hyperparameters and returns its
    training and test accuracy.
    
        data_X_train:             training data X
        
        data_y_train:             training data y
        
        data_X_test:              test data X
        
        data_y_test:              test data y
        
        clf_code_list:            list of classifier codes (e.g., ['svm', 'dt', 'rf']);
                                  a single code given as a string is treated as a
                                  one-element list, and the string 'all' selects
                                  every classifier in CLF_DICT
        
        learned_hyperparams_dict: dict mapping classifier code to a dict of
                                  parameter name -> value, in the form returned
                                  by learn_hyperparameters
    
    Returns a dict mapping each classifier code to a tuple
    (training accuracy, test accuracy), each the fraction of correct predictions.
    
    Raises KeyError if learned_hyperparams_dict has no entry for a requested
    classifier code, and ValueError if a parameter name is not a valid
    parameter of the classifier.
    """
    # Local import keeps this function self-contained within the notebook.
    from sklearn.base import clone
    
    # Accept the same string shortcuts as learn_hyperparameters
    if isinstance(clf_code_list, str):
        if clf_code_list == 'all':
            clf_code_list = list(CLF_DICT.keys())
        else:
            clf_code_list = [clf_code_list]
    
    accuracy_dict = dict()
    
    for clf_code in clf_code_list:
        # Work on a private copy so that setting parameters and fitting never
        # alter an estimator object that other code may also hold.
        clf = clone(init_clf(clf_code))
        
        # set_params rejects unknown parameter names, whereas plain attribute
        # assignment would silently create a new, ignored attribute.
        clf.set_params(**learned_hyperparams_dict[clf_code])

        clf.fit(data_X_train, data_y_train)

        train_preds = clf.predict(data_X_train)
        test_preds  = clf.predict(data_X_test)

        # Compare position by position; the mean of the boolean matches is the accuracy
        train_acc   = np.mean(np.asarray(train_preds) == np.asarray(data_y_train))
        test_acc    = np.mean(np.asarray(test_preds) == np.asarray(data_y_test))
        
        accuracy_dict[clf_code] = train_acc, test_acc
    
    return(accuracy_dict)

In [143]:
# Tune all five classifiers on the breast cancer training set with 5-fold CV
learned_hyperparams_wdbc = learn_hyperparameters(
    data_X_train=wdbc_X_train,
    data_y_train=wdbc_y_train,
    clf_code_list=['svm', 'dt', 'rf', 'knn', 'ann'],
    cv_fold=5,
)
learned_hyperparams_wdbc



{'ann': {'hidden_layer_sizes': (2,)},
 'dt': {'criterion': 'entropy', 'max_depth': 4},
 'knn': {'n_neighbors': 5},
 'rf': {'max_features': 4},
 'svm': {'C': 0.01, 'kernel': 'linear'}}

In [None]:
# Tune every configured classifier on the adult income training set with 5-fold CV
learned_hyperparams_income = learn_hyperparameters(
    data_X_train=income_X_train,
    data_y_train=income_y_train,
    clf_code_list='all',
    cv_fold=5,
)
learned_hyperparams_income

In [10]:
# Training/test accuracy on the breast cancer data, using the hyperparameters
# tuned for that dataset (learned_hyperparams_wdbc). The previously referenced
# name `learned_hyperparams` is not defined at this point on a fresh
# top-to-bottom run of the notebook.
compute_accuracies(wdbc_X_train, wdbc_y_train, wdbc_X_test, wdbc_y_test,
                   ['svm', 'dt', 'rf', 'knn', 'ann'], learned_hyperparams_wdbc)

{'ann': (0.93626373626373627, 0.92105263157894735),
 'dt': (0.97802197802197799, 0.92105263157894735),
 'knn': (0.94285714285714284, 0.93859649122807021),
 'rf': (1.0, 0.94736842105263153),
 'svm': (0.95824175824175828, 0.96491228070175439)}

In [11]:
# Tune only the neural network ('ann') on the breast cancer training set with 5-fold CV
learned_hyperparams = learn_hyperparameters(
    data_X_train=wdbc_X_train,
    data_y_train=wdbc_y_train,
    clf_code_list='ann',
    cv_fold=5,
)

ann


