# Preamble

In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import neural_network
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from hyperparams import hyperparam_explorer # my package
%config InlineBackend.figure_format = 'retina'

# Data preparation

In [2]:
# Config
# Single seed used for the data shuffling and for every estimator below that
# draws random numbers while fitting, so a fresh kernel reproduces the results.
RND_SEED        = 0
# Classifier prototypes keyed by short name. The tree, forest and MLP are
# stochastic and are therefore seeded explicitly; KNN takes no seed, and SVC is
# left at its defaults.
CLF_DICT        = {'svm': svm.SVC(),
                   'dt':  tree.DecisionTreeClassifier(random_state=RND_SEED),
                   'rf':  ensemble.RandomForestClassifier(random_state=RND_SEED),
                   'knn': neighbors.KNeighborsClassifier(),
                   'ann': neural_network.MLPClassifier(random_state=RND_SEED)}

In [3]:
def split_train_test(data, ratio):
    """
    Splits dataset into training and test set of specified size.
    
    The split is purely positional and does not shuffle: the first
    round(ratio * n) rows become the training set, the remaining rows
    the test set (n = data.shape[0]).
    
        data:  The data to be split. Must expose .shape and support slicing
               (e.g. pandas DataFrame/Series or numpy array).
        
        ratio: Ratio of first (training) subset, must lie in [0, 1].
        
    Returns two datasets: training and test set.
    
    Raises ValueError if ratio is not within [0, 1].
    """
    
    # A negative ratio would yield a negative slice bound (cutting from the
    # end) and a ratio above 1 an always-empty test set; reject both instead
    # of silently returning a nonsensical split. NaN fails this check as well.
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be within [0, 1], got {!r}'.format(ratio))
    
    train_num = int(np.round(ratio*data.shape[0]))
    
    # For pandas objects slice through .iloc so the cut is explicitly
    # positional and never depends on the index labels; other containers
    # (e.g. numpy arrays) are sliced directly.
    rows = data.iloc if hasattr(data, 'iloc') else data
    data_train = rows[:train_num]
    data_test  = rows[train_num:]
    return data_train, data_test

In [4]:
def shuffle(df):
    """
    Returns the rows of a dataset in random order.
    
    A fresh random generator seeded with RND_SEED (see config part above) is
    created on every call, so the same input is always permuted the same way
    as long as RND_SEED is unchanged.
    
        df:  Dataset to be shuffled.
        
    Returns shuffled dataset.
    """
    
    rng = np.random.RandomState(seed=RND_SEED)
    # frac=1 samples every row exactly once, i.e. a full permutation
    permuted = df.sample(frac=1, random_state=rng)
    return permuted

In [5]:
# Breast cancer (WDBC) data: read the raw file, discard the ID column (col 0)
# and shuffle the rows; afterwards the first remaining column holds the label
wdbc_X_and_y = shuffle(pd.read_csv('data/wdbc.data', header=None).iloc[:, 1:])

# Features are all columns after the label; labels are recoded B -> -1, M -> 1
wdbc_X = wdbc_X_and_y.iloc[:, 1:]
wdbc_y = wdbc_X_and_y.iloc[:, 0].map({'B': -1, 'M': 1})

# First 80% of the shuffled rows for training, remaining 20% for testing
wdbc_X_train, wdbc_X_test = split_train_test(wdbc_X, 0.8)
wdbc_y_train, wdbc_y_test = split_train_test(wdbc_y, 0.8)

In [6]:
# Load, prepare, and shuffle adult income data
income_X_and_y = pd.read_csv('data/adult.data', header=None)
income_X_and_y.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                         'marital-status', 'occupation', 'relationship',
                         'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                         'native-country', 'income']
income_X_and_y = shuffle(income_X_and_y)

# one-hot encode categorical variables
income_categorical_vars = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
# Build one dummy-coded frame per categorical variable and concatenate them in
# a single call. All frames share the shuffled index, so no index alignment is
# involved (unlike growing the result from an empty DataFrame inside the loop).
income_X_and_y_onehot = pd.concat(
    [pd.get_dummies(income_X_and_y[var], prefix=var) for var in income_categorical_vars],
    axis=1)

# add remaining (non-categorical) columns, including 'income', to one-hot
# encoded df; drop() keeps them in their original order
income_X_and_y = pd.concat([income_X_and_y_onehot,
                            income_X_and_y.drop(income_categorical_vars, axis=1)],
                           axis=1)

income_y = income_X_and_y.loc[:, 'income']
income_X = income_X_and_y.drop('income', axis=1)

# Transform y from (<=50K, >50K) to (-1, 1); note that the keys include a
# leading space, and any label not listed here is mapped to NaN
income_y = income_y.map({' <=50K': -1, ' >50K': 1})

# Split to 80% training and 20% test set
income_X_train, income_X_test = split_train_test(income_X, 0.8)
income_y_train, income_y_test = split_train_test(income_y, 0.8)

# Hyperparameter study

In [7]:
# Training portions of both datasets, keyed by dataset name; each entry is [X, y]
data_dict = dict(wdbc=[wdbc_X_train, wdbc_y_train],
                 income=[income_X_train, income_y_train])

# Values to explore per classifier key: for KNN, n_neighbors = 1, 2, ..., 50
knn_neighbor_grid = {'n_neighbors': np.arange(1, 51)}
hyperparam_dict = {'knn': knn_neighbor_grid}

In [8]:
# Set up the exploration over the datasets, hyperparameter values and
# classifiers configured above. cv=5 is presumably the number of
# cross-validation folds — confirm in the hyperparams package.
hp_exp1 = hyperparam_explorer(data_dict, hyperparam_dict, CLF_DICT, cv=5)
# Run the exploration. Judging by the cell output, this reports progress as
# "Working on part i of N..." and yields per-dataset, per-classifier tables
# with the columns n_neighbors, scores_mean and scores_sd.
hp_exp1.work()

Working on part 1 of 100...
Working on part 2 of 100...
Working on part 3 of 100...
Working on part 4 of 100...
Working on part 5 of 100...
Working on part 6 of 100...
Working on part 7 of 100...
Working on part 8 of 100...
Working on part 9 of 100...
Working on part 10 of 100...
Working on part 11 of 100...
Working on part 12 of 100...
Working on part 13 of 100...
Working on part 14 of 100...
Working on part 15 of 100...
Working on part 16 of 100...
Working on part 17 of 100...
Working on part 18 of 100...
Working on part 19 of 100...
Working on part 20 of 100...
Working on part 21 of 100...
Working on part 22 of 100...
Working on part 23 of 100...
Working on part 24 of 100...
Working on part 25 of 100...
Working on part 26 of 100...
Working on part 27 of 100...
Working on part 28 of 100...
Working on part 29 of 100...
Working on part 30 of 100...
Working on part 31 of 100...
Working on part 32 of 100...
Working on part 33 of 100...
Working on part 34 of 100...
Working on part 35 of 1

{'income': {'knn':     n_neighbors  scores_mean  scores_sd
  0           1.0     0.723828   0.004849
  1           2.0     0.782218   0.002868
  2           3.0     0.752236   0.002545
  3           4.0     0.783792   0.003735
  4           5.0     0.772659   0.004268
  5           6.0     0.788744   0.003078
  6           7.0     0.780567   0.004144
  7           8.0     0.791431   0.003210
  8           9.0     0.786249   0.003340
  9          10.0     0.794387   0.001353
  10         11.0     0.790779   0.002665
  11         12.0     0.794426   0.002463
  12         13.0     0.793812   0.002208
  13         14.0     0.796038   0.001513
  14         15.0     0.795117   0.002223
  15         16.0     0.797689   0.001792
  16         17.0     0.796998   0.002379
  17         18.0     0.796883   0.002610
  18         19.0     0.797228   0.002764
  19         20.0     0.797152   0.002643
  20         21.0     0.797497   0.002954
  21         22.0     0.796537   0.002604
  22         23.0

In [9]:
# Store the explorer under the given name (datasets, classifier and k range
# encoded in it). NOTE(review): storage format and location are defined by the
# hyperparams package and are not visible here — confirm before relying on them.
hp_exp1.save('wdbc_income_knn_k_1_50')