In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import neighbors
from sklearn import neural_network
from sklearn import svm
from sklearn import tree
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
%config InlineBackend.figure_format = 'retina'

In [2]:
# Config
RND_SEED        = 0

# Prototype classifiers, keyed by the short codes used throughout the notebook.
# The estimators that draw random numbers while fitting (decision tree, random
# forest, MLP) are seeded with RND_SEED so that a fresh Restart & Run All
# reproduces the same fits.
CLF_DICT        = {'svm': svm.SVC(),
                   'dt':  tree.DecisionTreeClassifier(random_state=RND_SEED),
                   'rf':  ensemble.RandomForestClassifier(random_state=RND_SEED),
                   'knn': neighbors.KNeighborsClassifier(),
                   'ann': neural_network.MLPClassifier(random_state=RND_SEED)}

In [3]:
def split_train_test(data, ratio):
    """
    Cuts a dataset into a leading (training) part and a trailing (test) part.

        data:  The data to be split; rows are taken in their current order.

        ratio: Fraction of the rows that goes into the first (training) part.

    Returns a tuple (training set, test set).
    """

    n_rows = data.shape[0]
    # Row position at which the training part ends and the test part begins
    cut = int(np.round(n_rows * ratio))
    head, tail = data[:cut], data[cut:]
    return head, tail

In [4]:
def shuffle(df):
    """
    Returns the rows of a dataset in random order.

        df:  Dataset to be shuffled.

    The random generator is created from RND_SEED (see config part above) on
    every call, so the resulting order is the same each time.
    """

    rng = np.random.RandomState(seed=RND_SEED)
    # frac=1 samples every row exactly once, i.e. a permutation of the rows
    shuffled = df.sample(frac=1, random_state=rng)
    return shuffled

In [5]:
# Breast cancer data: the first file column (ID) is dropped, so that y is the
# first remaining column and X is everything after it; rows are then shuffled
wdbc_X_and_y = shuffle(pd.read_csv('data/wdbc.data', header=None).iloc[:, 1:])

# y is recoded from (B, M) to (-1, 1)
wdbc_y = wdbc_X_and_y.iloc[:, 0].map({'B': -1, 'M': 1})
wdbc_X = wdbc_X_and_y.iloc[:, 1:]

# First 80% of the shuffled rows form the training set, the remaining 20% the test set
(wdbc_X_train, wdbc_X_test), (wdbc_y_train, wdbc_y_test) = (
    split_train_test(part, 0.8) for part in (wdbc_X, wdbc_y))

In [6]:
# Load, prepare, and shuffle adult income data
income_X_and_y = pd.read_csv('data/adult.data', header=None)
income_X_and_y.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                         'marital-status', 'occupation', 'relationship',
                         'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                         'native-country', 'income']
income_X_and_y = shuffle(income_X_and_y)

# one-hot encode categorical variables: one group of dummy columns per variable,
# joined in a single concat instead of growing a frame inside a loop
income_categorical_vars = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
income_X_and_y_onehot = pd.concat(
    [pd.get_dummies(income_X_and_y[var], prefix=var) for var in income_categorical_vars],
    axis=1)

# add remaining (non-categorical) columns, in their original order, after the
# one-hot encoded ones
income_X_and_y = pd.concat([income_X_and_y_onehot,
                            income_X_and_y.drop(income_categorical_vars, axis=1)],
                           axis=1)

income_y = income_X_and_y.loc[:, 'income']
income_X = income_X_and_y.drop('income', axis=1)

# Transform y from (<=50K, >50K) to (-1, 1); the keys carry the leading space
# exactly as written in the mapping below
income_y = income_y.map({' <=50K': -1, ' >50K': 1})
# map() turns every label that is not a key into NaN -- catch that here instead
# of at model-fitting time
assert not income_y.isnull().any(), 'income column contains labels other than " <=50K" / " >50K"'

# Split to 80% training and 20% test set
income_X_train, income_X_test = split_train_test(income_X, 0.8)
income_y_train, income_y_test = split_train_test(income_y, 0.8)

In [7]:
def init_clf(clf_code):
    """
    Returns a fresh, unfitted classifier for a classifier code.

        clf_code: Key of CLF_DICT (see config part above).

    The prototype stored in CLF_DICT is cloned, so hyperparameters set on the
    returned object (or a fit performed with it) never change CLF_DICT itself.

    Raises ValueError if clf_code is not a key of CLF_DICT.
    """

    if clf_code not in CLF_DICT:
        raise ValueError('You are probably trying to use a classifier that you haven\'t implemented yet!')
    return clone(CLF_DICT[clf_code])

In [16]:
class hyperparam_learner:
    """
    Learns classifier hyperparameters by cross-validated grid search.

    Usage: call fit() to attach the training data and settings, then
    learn_hyperparams() to run the search. The result is available through
    get_hyperparams(); the whole learner can be pickled with save() and
    restored with load().
    """

    # Class-level defaults; fit() and learn_hyperparams() set the instance values.
    data_X_train  = None
    data_y_train  = None
    data_desc     = None
    clf_code_list = None
    cv_fold       = None

    hyperparam_dict = None

    is_fitted     = False   # becomes True in fit()
    has_learned   = False   # becomes True at the end of learn_hyperparams()

    def _require_fitted(self):
        # Guard shared by save() and learn_hyperparams()
        if not self.is_fitted:
            raise RuntimeError('I am not fitted yet!')

    def _require_learned(self):
        # Guard shared by save() and get_hyperparams()
        if not self.has_learned:
            raise RuntimeError('I have not learned any hyperparameters yet!')

    @classmethod
    def load(cls, filename):
        """
        Loads an object previously written by save().

            filename: File name without the '.pkl' extension.

        Returns the unpickled object.
        """
        # NOTE: unpickling can execute arbitrary code -- only load files that
        # you have written yourself.
        with open(filename + '.pkl', 'rb') as f:
            return pickle.load(f)

    def save(self, filename):
        """
        Pickles this learner to '<filename>.pkl'.

            filename: File name without the '.pkl' extension.

        Raises an exception if fit() or learn_hyperparams() has not been
        called yet.
        """
        self._require_fitted()
        self._require_learned()

        with open(filename + '.pkl', 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def fit(self, data_X_train, data_y_train, data_desc, clf_code_list, cv_fold):
        """
        Stores the training data and the search settings; no model is trained here.

            data_X_train:  Training features.

            data_y_train:  Training labels.

            data_desc:     Free-text description of the data, copied into the
                           result of learn_hyperparams().

            clf_code_list: List of classifier codes, a single code, or 'all'
                           for every key of CLF_DICT.

            cv_fold:       Passed as cv to GridSearchCV.
        """
        self.data_X_train   = data_X_train
        self.data_y_train   = data_y_train
        self.data_desc      = data_desc
        self.clf_code_list  = clf_code_list
        self.cv_fold        = cv_fold
        self.is_fitted      = True

    def learn_hyperparams(self, clf_param_grid, n_jobs=4):
        """
        Runs one grid search per classifier code and stores the best parameters.

            clf_param_grid: Maps each classifier code to the parameter grid
                            that GridSearchCV should consider for it.

            n_jobs:         Passed as n_jobs to GridSearchCV.

        Returns a dict with the keys 'learned_hyperparams',
        'considered_hyperparams' and 'data_desc'.

        Raises an exception if fit() has not been called, and KeyError if a
        requested classifier code has no entry in clf_param_grid.
        """
        self._require_fitted()

        # Normalise a single code / the 'all' shortcut to a list of codes
        if isinstance(self.clf_code_list, str):
            if self.clf_code_list == 'all':
                self.clf_code_list = list(CLF_DICT.keys())
            else:
                self.clf_code_list = [self.clf_code_list]

        # Check all grids up front, so a missing entry is reported before any
        # (potentially long) grid search has been run
        missing = [code for code in self.clf_code_list if code not in clf_param_grid]
        if missing:
            raise KeyError('No hyperparameter grid given for: ' + ', '.join(map(str, missing)))

        learned_hyperparam_dict = dict()
        considered_hyperparam_dict = dict()

        for clf_code in self.clf_code_list:
            considered_hyperparam_dict[clf_code] = clf_param_grid[clf_code]
            gs_clf = GridSearchCV(init_clf(clf_code), clf_param_grid[clf_code], cv=self.cv_fold, n_jobs=n_jobs)
            gs_clf.fit(self.data_X_train, self.data_y_train)
            learned_hyperparam_dict[clf_code] = gs_clf.best_params_

        hyperparam_dict = {'learned_hyperparams'    : learned_hyperparam_dict,
                           'considered_hyperparams' : considered_hyperparam_dict,
                           'data_desc'              : self.data_desc}

        self.hyperparam_dict = hyperparam_dict
        self.has_learned = True
        return(self.hyperparam_dict)

    def get_hyperparams(self):
        """
        Returns the dict produced by learn_hyperparams().

        Raises an exception if learn_hyperparams() has not completed yet.
        """
        self._require_learned()
        return(self.hyperparam_dict)

In [9]:
def compute_accuracies(data_X_train, data_y_train, data_X_test, data_y_test, clf_code_list, learned_hyperparams_dict):
    """
    Fits each requested classifier with its learned hyperparameters and
    measures its accuracy on the training and on the test set.

        data_X_train, data_y_train: Training features and labels.

        data_X_test, data_y_test:   Test features and labels.

        clf_code_list:              Classifier codes to evaluate (keys of CLF_DICT).

        learned_hyperparams_dict:   Maps each classifier code to a dict of
                                    hyperparameter name -> value.

    Returns a dict mapping each classifier code to the tuple
    (training accuracy, test accuracy).
    """

    accuracy_dict = dict()

    for clf_code in clf_code_list:
        # Work on a clone: init_clf may hand out an object that is shared via
        # CLF_DICT, and neither the hyperparameters nor the fitted state should
        # end up on that shared object.
        clf = clone(init_clf(clf_code))
        # set_params (unlike setattr) rejects names that are not parameters of
        # the estimator, so a typo in a hyperparameter name is not silently ignored.
        clf.set_params(**learned_hyperparams_dict[clf_code])

        clf.fit(data_X_train, data_y_train)

        train_preds = clf.predict(data_X_train)
        test_preds  = clf.predict(data_X_test)

        # Accuracy = fraction of predictions equal to the true label
        train_acc   = np.mean(train_preds == np.asarray(data_y_train))
        test_acc    = np.mean(test_preds == np.asarray(data_y_test))

        accuracy_dict[clf_code] = train_acc, test_acc

    return(accuracy_dict)

# Let's go!

In [14]:
# Hyperparameter grid to be searched, one entry per classifier code
clf_param_grid = dict(
    svm=dict(kernel=['linear'],
             C=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]),
    dt=dict(criterion=['entropy'],
            max_depth=[1, 2, 3, 4, 5]),
    rf=dict(max_features=[1, 2, 4]),
    knn=dict(n_neighbors=[1, 2, 3, 4, 5]),
    ann=dict(hidden_layer_sizes=[(1,), (2,), (4,), (8,), (32,), (128,)]),
)

In [17]:
# Grid-search the hyperparameters of every classifier on the breast cancer training set
wdbc_learner = hyperparam_learner()
wdbc_learner.fit(wdbc_X_train, wdbc_y_train,
                 data_desc='wdbc_80',   # 80% training
                 clf_code_list='all',
                 cv_fold=5)
wdbc_learner.learn_hyperparams(clf_param_grid)



{'considered_hyperparams': {'ann': {'hidden_layer_sizes': [(1,),
    (2,),
    (4,),
    (8,),
    (32,),
    (128,)]},
  'dt': {'criterion': ['entropy'], 'max_depth': [1, 2, 3, 4, 5]},
  'knn': {'n_neighbors': [1, 2, 3, 4, 5]},
  'rf': {'max_features': [1, 2, 4]},
  'svm': {'C': [1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1],
   'kernel': ['linear']}},
 'data_desc': 'wdbc_80',
 'learned_hyperparams': {'ann': {'hidden_layer_sizes': (4,)},
  'dt': {'criterion': 'entropy', 'max_depth': 5},
  'knn': {'n_neighbors': 5},
  'rf': {'max_features': 1},
  'svm': {'C': 0.01, 'kernel': 'linear'}}}

In [18]:
# Pickle the whole learner object (save() appends '.pkl' to the given name)
wdbc_learner.save('wdbc_learner')

In [23]:
# Round-trip check: reload the pickled learner and read back the C value
# that was learned for the SVM
test = hyperparam_learner.load('wdbc_learner')
test.get_hyperparams()['learned_hyperparams']['svm']['C']

0.01