# Preamble

In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools as it
from sklearn import datasets
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import neural_network
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from hyperparams import hyperparam_explorer # my package
from hyperparams import init_clf
%config InlineBackend.figure_format = 'retina'

# Config

In [2]:
# Config
# RND_SEED is the single seed used across the notebook for reproducibility.
RND_SEED        = 0
# Classifier prototypes keyed by short name; these keys are what the hyperparameter
# dictionaries refer to. The estimators that draw random numbers while fitting
# (decision tree, random forest, MLP) are seeded so repeated runs give the same scores.
CLF_DICT        = {'svm': svm.SVC(),
                   'dt':  tree.DecisionTreeClassifier(random_state=RND_SEED),
                   'rf':  ensemble.RandomForestClassifier(random_state=RND_SEED),
                   'knn': neighbors.KNeighborsClassifier(),
                   'ann': neural_network.MLPClassifier(random_state=RND_SEED)}

# Methods

In [3]:
def shuffle(df):
    """
    Return the rows of a dataset in a seeded random order.

        df:  Dataset (pandas object) whose rows are to be permuted.

    A fresh random state is built from RND_SEED (see config part above) on
    every call, so the same input always comes back in the same order.

    Returns the shuffled dataset; the original index labels are kept.
    """

    rng = np.random.RandomState(seed=RND_SEED)
    # frac=1 samples every row exactly once, i.e. a full permutation
    shuffled = df.sample(frac=1, random_state=rng)
    return shuffled

# Data preparation

## wdbc: Wisconsin Breast Cancer data

In [4]:
# Load, prepare, and shuffle breast cancer data.
# The file is read without a header row; column 0 (the ID) is dropped, which leaves the
# label as the first remaining column and the features after it.
wdbc_X_and_y = pd.read_csv('data/wdbc.data', header = None).iloc[:, 1:]
wdbc_X_and_y = shuffle(wdbc_X_and_y)
wdbc_y = wdbc_X_and_y.iloc[:, 0]
wdbc_X = wdbc_X_and_y.iloc[:, 1:]

# Transform y from (B, M) to (-1, 1)
wdbc_y = wdbc_y.map({'B': -1, 'M': 1})

# Split to 80% training and 20% test set.
# train_test_split shuffles on its own, so it has to be seeded as well; without
# random_state the train/test membership changes on every run.
wdbc_X_train, wdbc_X_test, wdbc_y_train, wdbc_y_test = \
    model_selection.train_test_split(wdbc_X, wdbc_y, test_size=0.2,
                                     random_state=RND_SEED)

## income: US Census Income data

In [5]:
# Load, prepare, and shuffle adult income data (the file has no header row, so the
# column names are assigned explicitly)
income_X_and_y = pd.read_csv('data/adult.data', header=None)
income_X_and_y.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                         'marital-status', 'occupation', 'relationship',
                         'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                         'native-country', 'income']
income_X_and_y = shuffle(income_X_and_y)

# one-hot encode categorical variables: build every dummy block first and concatenate
# once, instead of growing a DataFrame inside the loop from an empty frame
income_categorical_vars = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
income_X_and_y_onehot = pd.concat(
    [pd.get_dummies(income_X_and_y[var], prefix=var) for var in income_categorical_vars],
    axis=1)

# add remaining (non-categorical) columns to one-hot encoded df; column order is the
# dummy blocks first, then the untouched columns in their original order
income_X_and_y = pd.concat([income_X_and_y_onehot,
                            income_X_and_y.drop(income_categorical_vars, axis=1)],
                           axis=1)

income_y = income_X_and_y.loc[:, 'income']
income_X = income_X_and_y.drop('income', axis=1)

# Transform y from (<=50K, >50K) to (-1, 1).
# The keys carry a leading space: values are matched verbatim as read by read_csv
# (no skipinitialspace). Any label not listed here would become NaN.
income_y = income_y.map({' <=50K': -1, ' >50K': 1})

# Split to 80% training and 20% test set; seeded so the split is the same on every run

income_X_train, income_X_test, income_y_train, income_y_test = \
    model_selection.train_test_split(income_X, income_y, test_size = 0.2,
                                     random_state=RND_SEED)

## Iris data

In [6]:
# Load the iris features and targets in a single call (the dataset was previously
# loaded twice, once per array)
iris_X, iris_y = datasets.load_iris(return_X_y=True)

# Split to 80% training and 20% test set; seeded so the split is the same on every run
iris_X_train, iris_X_test, iris_y_train, iris_y_test = \
    model_selection.train_test_split(iris_X, iris_y, test_size=0.2,
                                     random_state=RND_SEED)

# Hyperparameter study

In [9]:
# Hyperparameter grid per classifier key: for kNN, every neighbour count from 1 to 50
hyperparam_dict = dict(knn=dict(n_neighbors=np.arange(1, 51)))

# Training portion of each dataset, keyed by dataset name, as [features, labels];
# the held-out test splits are not passed to the explorer
data_dict = dict(
    wdbc=[wdbc_X_train, wdbc_y_train],
    income=[income_X_train, income_y_train],
    iris=[iris_X_train, iris_y_train],
)

# Build the explorer and run it; whatever explore() returns is displayed as the
# cell output
expl = hyperparam_explorer(data_dict, hyperparam_dict, CLF_DICT, cv=5)
expl.explore()

Working on part 1 of 150...
Working on part 2 of 150...
Working on part 3 of 150...
Working on part 4 of 150...
Working on part 5 of 150...
Working on part 6 of 150...
Working on part 7 of 150...
Working on part 8 of 150...
Working on part 9 of 150...
Working on part 10 of 150...
Working on part 11 of 150...
Working on part 12 of 150...
Working on part 13 of 150...
Working on part 14 of 150...
Working on part 15 of 150...
Working on part 16 of 150...
Working on part 17 of 150...
Working on part 18 of 150...
Working on part 19 of 150...
Working on part 20 of 150...
Working on part 21 of 150...
Working on part 22 of 150...
Working on part 23 of 150...
Working on part 24 of 150...
Working on part 25 of 150...
Working on part 26 of 150...
Working on part 27 of 150...
Working on part 28 of 150...
Working on part 29 of 150...
Working on part 30 of 150...
Working on part 31 of 150...
Working on part 32 of 150...
Working on part 33 of 150...
Working on part 34 of 150...
Working on part 35 of 1

{'income': {'knn':     n_neighbors  scores_mean  scores_sd
  0           1.0     0.727311   0.003458
  1           2.0     0.785511   0.001809
  2           3.0     0.758177   0.002397
  3           4.0     0.789235   0.001323
  4           5.0     0.776912   0.001357
  5           6.0     0.792307   0.000543
  6           7.0     0.784552   0.000530
  7           8.0     0.794418   0.001304
  8           9.0     0.789427   0.002497
  9          10.0     0.796146   0.002074
  10         11.0     0.794111   0.001876
  11         12.0     0.796990   0.001659
  12         13.0     0.795992   0.002208
  13         14.0     0.797259   0.001556
  14         15.0     0.797067   0.002110
  15         16.0     0.797182   0.002001
  16         17.0     0.796453   0.001469
  17         18.0     0.797950   0.001291
  18         19.0     0.798104   0.001052
  19         20.0     0.798065   0.000726
  20         21.0     0.798411   0.001254
  21         22.0     0.798142   0.000851
  22         23.0

In [13]:
# Fetch the results held by the explorer and show them as the cell output.
# NOTE(review): the output shown here matches what expl.explore() displayed above,
# so get_results() presumably returns that same structure — confirm in the
# hyperparams package.
result_dict = expl.get_results()
result_dict

{'income': {'knn':     n_neighbors  scores_mean  scores_sd
  0           1.0     0.727311   0.003458
  1           2.0     0.785511   0.001809
  2           3.0     0.758177   0.002397
  3           4.0     0.789235   0.001323
  4           5.0     0.776912   0.001357
  5           6.0     0.792307   0.000543
  6           7.0     0.784552   0.000530
  7           8.0     0.794418   0.001304
  8           9.0     0.789427   0.002497
  9          10.0     0.796146   0.002074
  10         11.0     0.794111   0.001876
  11         12.0     0.796990   0.001659
  12         13.0     0.795992   0.002208
  13         14.0     0.797259   0.001556
  14         15.0     0.797067   0.002110
  15         16.0     0.797182   0.002001
  16         17.0     0.796453   0.001469
  17         18.0     0.797950   0.001291
  18         19.0     0.798104   0.001052
  19         20.0     0.798065   0.000726
  20         21.0     0.798411   0.001254
  21         22.0     0.798142   0.000851
  22         23.0

In [14]:
# Save the explorer under a name that spells out the datasets, classifier, and k range.
# NOTE(review): file location and format are decided by the hyperparams package — confirm.
expl.save('wdbc_income_iris_knn_k_1_50')

# Package dev