_Questions_
- Large and small datasets, what is better: same relative or absolute train size?
  (or choose subset of data a priori)

In [2]:
### Imports

import copy
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools as it
import string
from sklearn import datasets
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import neural_network
from sklearn import model_selection
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
%config InlineBackend.figure_format = 'retina'

In [3]:
### Config

TRAIN_SIZE = 0.8

MAX_DATA_SIZE = 1000

RND_SEED = 1

CLF_DICT       = {'logreg': linear_model.LogisticRegression(),
                  'knn':    neighbors.KNeighborsClassifier(),
                  'rf':     ensemble.RandomForestClassifier(),
                  'svm':    svm.SVC()}

In [191]:
### Methods and classes

def size_info():
    ### Size info
    print("Data sizes:")
    for data_name, data_tuple in all_data_dict.items():
        print("\n{}:\nX: {}\ny: {}".format(data_name, data_tuple[0].shape, data_tuple[1].shape))

def shuffle(df):
    """
    Shuffles dataset using seed specified in RND_SEED (see config part above).
    
        df:  Dataset to be shuffled.
        
    Returns shuffled dataset.
    """
    
    return(df.sample(frac=1, random_state=np.random.RandomState(seed=RND_SEED)))

def init_clf(clf_name, clf_dict=CLF_DICT):
    return(copy.deepcopy(clf_dict[clf_name]))

class MagicSearcher:
    
    """
    Finds hyperparams for set of datasets and set of classifiers with specified hyperparam grids
    """
    
    data_dict       = None
    clf_param_dict  = None
    cv              = None
    n_jobs          = None
    verbose         = None
    method          = None
    
    # Randomized Search
    n_iter = None
    
    # Results
    searcher_obj_dict = None
    best_params_dict  = None
    scores_df         = None # Used for plotting
    
    def __init__(self, clf_param_dict, data_dict=None, cv=5, n_jobs=4, verbose=False, method='grid_search'):
        self.data_dict      = data_dict
        self.clf_param_dict = clf_param_dict
        self.cv             = cv
        self.n_jobs         = n_jobs
        self.verbose        = verbose
        self.method         = method
    
    @classmethod
    def load(cls, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    
    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
            
    def create_scores_df(self):
        # Prepare score df dict
        clf_scores_df_dict = dict()
        for clf_name, param_dict in self.clf_param_dict.items():
            columns = ['data_name']
            columns.extend(list(self.clf_param_dict[clf_name].keys()))
            columns.extend(['scores_mean', 'scores_sd'])

            clf_scores_df_dict[clf_name] = pd.DataFrame(columns=columns)

        for data_name, clf_searcher_obj_dict in self.searcher_obj_dict.items():
            for clf_name, searcher_obj in clf_searcher_obj_dict.items():
                grid_scores = searcher_obj.skl_search_obj.grid_scores_

                for grid_score in grid_scores:
                    param_comb_dict = grid_score[0]
                    scores_mean = np.mean(grid_score[2])
                    scores_sd = np.std(grid_score[2])

                    row = {'data_name': data_name,
                           'scores_mean': scores_mean,
                           'scores_sd': scores_sd}

                    for param_name, param_val in param_comb_dict.items():
                        row[param_name] = param_val

                    clf_scores_df_dict[clf_name] = clf_scores_df_dict[clf_name].append(row, ignore_index = True)
        self.scores_df = clf_scores_df_dict
        
    def search(self, data_dict=None, n_iter=None):
        if self.method == 'randomized_search' and n_iter is None:
            raise Exception('You need to specify n_iter for randomized search')
        self.n_iter = n_iter
            
        if (self.data_dict is None) and (data_dict is None):
            raise Exception('You need to specify data!') 
        
        searcher_obj_dict = dict()
        best_params_dict = dict()
        for data_name, data_tuple in self.data_dict.items():
            print("Working on dataset {} ...".format(data_name))
            
            X = data_tuple[0]
            y = data_tuple[1]
            
            searcher_obj_dict[data_name] = dict()
            best_params_dict[data_name] = dict()
            for clf_name, param_dict in clf_param_dict.items():
                print("  Doing {} magic ...".format(clf_name))
                searcher_obj = ParamSearcher(X, y, clf_name, param_dict, self.method, self.n_jobs, self.cv, self.verbose)
                searcher_obj.search()
                searcher_obj_dict[data_name][clf_name] = searcher_obj
                
                best_params_dict[data_name][clf_name] = searcher_obj.best_params_
        self.searcher_obj_dict = searcher_obj_dict
        self.best_params_dict = best_params_dict
        
        self.create_scores_df()

class ParamSearcher:
    
    """
    Finds hyperparams for one dataset and one classifier
    """
    
    X = None
    y = None
    clf_name = None
    param_dict = None
    method = None
    verbose = None
    n_jobs = None
    cv = None
    
    # Randomized Search
    n_iter = None
    
    # Results
    skl_search_obj = None
    best_params_ = None
    
    def __init__(self, X, y, clf_name, param_dict, method='grid_search', n_jobs=4, cv=5, verbose=False):
        self.X = X
        self.y = y
        self.clf_name = clf_name
        self.param_dict = param_dict
        self.method = method
        self.n_jobs = n_jobs
        self.cv = cv
        self.verbose = verbose
        
    def check_params(self):
        # Check for exceeded hard limits of some params
        if self.clf_name == 'knn':
            if 'n_neighbors' in self.param_dict:
                max_n_neighbors = int(np.floor(self.X.shape[0]/self.cv)-1)
                if np.any(self.param_dict['n_neighbors'] > max_n_neighbors):
                    print('ParamSearcher: knn: some n_neighbors > n_samples/cv-1. Restricting range to n_samples/cv-1.')
                    self.param_dict['n_neighbors'][self.param_dict['n_neighbors'] > max_n_neighbors] = max_n_neighbors
                    
                    # Remove duplicates
                    self.param_dict['n_neighbors'] = np.unique(self.param_dict['n_neighbors'])
        elif self.clf_name == 'rf':
            if 'max_features' in self.param_dict:
                if np.any(self.param_dict['max_features'] > self.X.shape[1]):
                    print('ParamSearcher: rf: some max_features > n_features. Restricting range to max_features.')
                    self.param_dict['max_features'][self.param_dict['max_features'] > self.X.shape[1]] = self.X.shape[1]
                    
                    # Remove duplicates
                    self.param_dict['max_features'] = np.unique(self.param_dict['max_features'])
                    
                    
    def search(self, n_iter=None):
        self.check_params()
        
        if self.method == 'grid_search':
            skl_search_obj = GridSearchCV(estimator  = init_clf(self.clf_name),
                                          param_grid = self.param_dict,
                                          n_jobs     = self.n_jobs,
                                          cv         = self.cv,
                                          verbose    = self.verbose)
        elif self.method == 'randomized_search':
            if n_iter is None:
                raise Exception('You need to specify n_iter for randomized search')
            self.n_iter = n_iter
            
            skl_search_obj = RandomizedSearchCV(estimator           = init_clf(self.clf_name),
                                                 param_distributions = self.param_dict,
                                                 n_iter              = n_iter,
                                                 n_jobs              = self.n_jobs,
                                                 cv                  = self.cv,
                                                 verbose             = self.verbose)
        skl_search_obj.fit(self.X, self.y)
        self.skl_search_obj = skl_search_obj
        self.best_params_ = skl_search_obj.best_params_

In [5]:
### Load data
## iris
iris_X = pd.DataFrame(datasets.load_iris()['data'])
iris_y = pd.Series(datasets.load_iris()['target'])


## wdbc
wdbc_X_and_y = pd.read_csv('data/wdbc.data', header = None).iloc[:, 1:] # drop ID, then first col = y
wdbc_y = wdbc_X_and_y.iloc[:, 0]
wdbc_X = wdbc_X_and_y.iloc[:, 1:]

wdbc_y = wdbc_y.map({'B': -1, 'M': 1}) # Transform y from (B, M) to (-1, 1)


## income
# Load, prepare, and shuffle adult income data
income_X_and_y = pd.read_csv('data/adult.data', header=None)
income_X_and_y.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                         'marital-status', 'occupation', 'relationship',
                         'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                         'native-country', 'income']

# one-hot encode categorical variables
income_categorical_vars = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'native-country']
income_X_and_y_onehot = pd.DataFrame()
for var in income_categorical_vars:
    dummy_coded_var_df = pd.get_dummies(income_X_and_y[var], prefix=var)
    income_X_and_y_onehot = pd.concat([income_X_and_y_onehot, dummy_coded_var_df], axis=1)

# add remaining columns to one-hot encoded df
income_X_and_y = pd.concat([income_X_and_y_onehot,
                            income_X_and_y.loc[:, income_X_and_y.columns[
                                np.logical_not(np.in1d(income_X_and_y.columns, income_categorical_vars))]]],
                           axis=1)

income_y = income_X_and_y.loc[:, 'income']
income_X = income_X_and_y.drop('income', axis=1)

# Transform y from (<=50K, >50K) to (-1, 1)
income_y = income_y.map({' <=50K': -1, ' >50K': 1})


## Letter
letter_X_and_y = pd.read_csv('data/letter.data', header=None)
letter_X = letter_X_and_y.iloc[:, 1:]
letter_y = letter_X_and_y.iloc[:, 0]

# Transform y from A:M -> -1 and N:Z -> 1
def alph_to_cat(letter):
    if str.upper(letter) in list(string.ascii_uppercase[:13]):
        return(1)
    elif str.upper(letter) in list(string.ascii_uppercase[13:]):
        return(-1)
    
letter_y = letter_y.map(alph_to_cat)

## covtype
covtype_X_and_y = pd.read_csv('data/covtype.data')
covtype_X = covtype_X_and_y.iloc[:, :-1]
covtype_y = covtype_X_and_y.iloc[:, -1]

covtype_y = covtype_y.map({7:1}).fillna(0)



all_data_dict = {'wdbc':      (wdbc_X, wdbc_y),
                 'income':    (income_X, income_y),
                 'iris':      (iris_X, iris_y),
                 'covtype':   (covtype_X, covtype_y),
                 'letter':    (letter_X, letter_y)}

### Shuffle
for data_name, data_tuple in all_data_dict.items():
    X = data_tuple[0]
    y = data_tuple[1]
    
    X = shuffle(X)
    y = shuffle(y)
    
    all_data_dict[data_name] = (X, y)

size_info()

Data sizes:

wdbc:
X: (569, 30)
y: (569,)

income:
X: (32561, 108)
y: (32561,)

iris:
X: (150, 4)
y: (150,)

covtype:
X: (581011, 54)
y: (581011,)

letter:
X: (20000, 16)
y: (20000,)


In [6]:
### Limit dataset sizes
for data_name, data_tuple in all_data_dict.items():
    X = data_tuple[0]
    y = data_tuple[1]
    
    assert X.shape[0] == y.shape[0]
    
    if y.shape[0] > MAX_DATA_SIZE:
        X = X.sample(MAX_DATA_SIZE, random_state=RND_SEED)
        y = y.sample(MAX_DATA_SIZE, random_state=RND_SEED)

        all_data_dict[data_name] = (X, y)

size_info()

Data sizes:

wdbc:
X: (569, 30)
y: (569,)

income:
X: (1000, 108)
y: (1000,)

iris:
X: (150, 4)
y: (150,)

covtype:
X: (1000, 54)
y: (1000,)

letter:
X: (1000, 16)
y: (1000,)


In [192]:
### Go!
# clf_param_dict = {'knn':    {'n_neighbors': np.arange(1, 51)},
#                   'logreg': {'C': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]},
#                   'rf':     {'n_estimators': [1024],
#                              'max_features': [1, 2, 4, 6, 8, 12, 16, 20]}}
# clf_param_dict = {'knn':    {'n_neighbors':  np.arange(1, 51)},
#                   'rf':     {'n_estimators': np.array([1024]),
#                              'max_features': np.array([1, 2, 4, 6, 8, 12, 16, 20])},
#                   'svm':    {'kernel':       ['rbf', 'linear']}}
clf_param_dict = {'knn':    {'n_neighbors':  np.arange(1, 51)},
                  'rf':     {'n_estimators': np.array([256]),
                             'max_features': np.array([1, 2, 4, 6])}}


everything = MagicSearcher(clf_param_dict, all_data_dict, cv=2, n_jobs=None, verbose=False, method='grid_search')
everything.search()

Working on dataset wdbc ...
  Doing knn magic ...
  Doing rf magic ...
Working on dataset income ...
  Doing knn magic ...
  Doing rf magic ...
Working on dataset iris ...
  Doing knn magic ...
  Doing rf magic ...
ParamSearcher: rf: some max_features > n_features. Restricting range to max_features.
Working on dataset covtype ...
  Doing knn magic ...
  Doing rf magic ...
Working on dataset letter ...
  Doing knn magic ...
  Doing rf magic ...


In [193]:
everything.scores_df

{'knn':     data_name n_neighbors  scores_mean  scores_sd
 0        wdbc           1     0.919193   0.020948
 1        wdbc           2     0.910434   0.036749
 2        wdbc           3     0.920948   0.019193
 3        wdbc           4     0.912157   0.017420
 4        wdbc           5     0.919181   0.013918
 5        wdbc           6     0.920935   0.012163
 6        wdbc           7     0.922702   0.017439
 7        wdbc           8     0.919181   0.013918
 8        wdbc           9     0.926211   0.013930
 9        wdbc          10     0.927965   0.012176
 10       wdbc          11     0.926205   0.010415
 11       wdbc          12     0.924450   0.012170
 12       wdbc          13     0.927965   0.012176
 13       wdbc          14     0.922683   0.006894
 14       wdbc          15     0.922690   0.010409
 15       wdbc          16     0.917420   0.012157
 16       wdbc          17     0.917420   0.012157
 17       wdbc          18     0.917420   0.012157
 18       wdbc          

In [13]:
everything.save('knn_rf.pkl')

In [15]:
loaded = MagicSearcher.load('./knn_rf.pkl')

In [17]:
bla = loaded.searcher_obj_dict['iris']['knn']
bla.skl_search_obj.grid_scores_

[mean: 0.95333, std: 0.02000, params: {'n_neighbors': 1},
 mean: 0.94667, std: 0.00000, params: {'n_neighbors': 2},
 mean: 0.96000, std: 0.00000, params: {'n_neighbors': 3},
 mean: 0.94667, std: 0.01333, params: {'n_neighbors': 4},
 mean: 0.96000, std: 0.01333, params: {'n_neighbors': 5},
 mean: 0.95333, std: 0.00667, params: {'n_neighbors': 6},
 mean: 0.94667, std: 0.01333, params: {'n_neighbors': 7},
 mean: 0.95333, std: 0.02000, params: {'n_neighbors': 8},
 mean: 0.96000, std: 0.01333, params: {'n_neighbors': 9},
 mean: 0.96000, std: 0.01333, params: {'n_neighbors': 10},
 mean: 0.96667, std: 0.02000, params: {'n_neighbors': 11},
 mean: 0.96667, std: 0.02000, params: {'n_neighbors': 12},
 mean: 0.97333, std: 0.01333, params: {'n_neighbors': 13},
 mean: 0.96667, std: 0.02000, params: {'n_neighbors': 14},
 mean: 0.96667, std: 0.02000, params: {'n_neighbors': 15},
 mean: 0.95333, std: 0.00667, params: {'n_neighbors': 16},
 mean: 0.95333, std: 0.02000, params: {'n_neighbors': 17},
 mean:

In [185]:
ms = MagicSearcher.load('./knn_rf.pkl')

# Prepare score df dict
clf_scores_df_dict = dict()
for clf_name, param_dict in ms.clf_param_dict.items():
    columns = ['data_name']
    columns.extend(list(ms.clf_param_dict[clf_name].keys()))
    columns.extend(['scores_mean', 'scores_sd'])
    
    clf_scores_df_dict[clf_name] = pd.DataFrame(columns=columns)

for data_name, clf_searcher_obj_dict in ms.searcher_obj_dict.items():
    for clf_name, searcher_obj in clf_searcher_obj_dict.items():
        grid_scores = searcher_obj.skl_search_obj.grid_scores_
        
        for grid_score in grid_scores:
            param_comb_dict = grid_score[0]
            scores_mean = np.mean(grid_score[2])
            scores_sd = np.std(grid_score[2])
            
            row = {'data_name': data_name,
                   'scores_mean': scores_mean,
                   'scores_sd': scores_sd}
            
            for param_name, param_val in param_comb_dict.items():
                row[param_name] = param_val
                
            clf_scores_df_dict[clf_name] = clf_scores_df_dict[clf_name].append(row, ignore_index = True)

In [186]:
clf_scores_df_dict

{'knn':     data_name n_neighbors  scores_mean  scores_sd
 0        wdbc           1     0.919193   0.020948
 1        wdbc           2     0.910434   0.036749
 2        wdbc           3     0.920948   0.019193
 3        wdbc           4     0.912157   0.017420
 4        wdbc           5     0.919181   0.013918
 5        wdbc           6     0.920935   0.012163
 6        wdbc           7     0.922702   0.017439
 7        wdbc           8     0.919181   0.013918
 8        wdbc           9     0.926211   0.013930
 9        wdbc          10     0.927965   0.012176
 10       wdbc          11     0.926205   0.010415
 11       wdbc          12     0.924450   0.012170
 12       wdbc          13     0.927965   0.012176
 13       wdbc          14     0.922683   0.006894
 14       wdbc          15     0.922690   0.010409
 15       wdbc          16     0.917420   0.012157
 16       wdbc          17     0.917420   0.012157
 17       wdbc          18     0.917420   0.012157
 18       wdbc          

In [109]:
clf_scores_df_dict

{'data_name': 'letter', 'knn': Empty DataFrame
 Columns: [data_name, scores_mean, scores_sd]
 Index: [], 'max_features': 4, 'n_estimators': 256, 'n_neighbors': 50, 'rf': Empty DataFrame
 Columns: [data_name, scores_mean, scores_sd]
 Index: [], 'scores_mean': 0.83099999999999996, 'scores_sd': 0.030999999999999972}

In [79]:
q = searcher_obj.skl_search_obj.grid_scores_
q

[mean: 0.82300, std: 0.02900, params: {'max_features': 1, 'n_estimators': 256},
 mean: 0.83200, std: 0.01800, params: {'max_features': 2, 'n_estimators': 256},
 mean: 0.83100, std: 0.03100, params: {'max_features': 4, 'n_estimators': 256}]

In [65]:
o

mean: 0.83100, std: 0.03100, params: {'max_features': 4, 'n_estimators': 256}

In [66]:
o[0]

{'max_features': 4, 'n_estimators': 256}

In [68]:
o[2]

array([ 0.8  ,  0.862])