# Feature Exploration

In this notebook, [prevously prepared](./feature-extraction.ipynb) small datasets with different features will be explored for the accuracy.

In [1]:
import pickle
import numpy as np
import timeit
import glob
import os
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Classifier

In [2]:
# SVM
# support vector regressor instance
svr = svm.SVC(kernel='linear')
# parameters for gridsearch
parameters = {'C':[0.01, 0.1, 1, 10], 'gamma':[0.01, 0.1, 1, 10, 'auto']}
# the classifier
clf = GridSearchCV(svr, parameters)

## Explore

In [3]:
# a helper function
def explore_datasets(dataset_types):
    for dataset_paths in dataset_types:
        for dataset_path in dataset_paths:
            with open(dataset_path, mode='rb') as f:
                dataset = pickle.load(f)
                # load features and labels
                X = dataset['features']
                y = dataset['labels']
                # shuffle and split dataset (for shuffle=False sklearn should be v0.19)
                X_train, X_test, y_train, y_test =\
                    train_test_split(X, y, test_size=0.33, shuffle=False)
                # normalize features
                X_scaler = StandardScaler().fit(X_train)
                scaled_X_train = X_scaler.transform(X_train)
                # Train the classifier
                start_time = timeit.default_timer()
                print(
                    'fit linear kernel for dataset: '
                    + dataset_path.split('/')[-2]
                    + '/'
                    + dataset_path.split('/')[-1])
                print('Number of features: ' + str(X.shape[1]))
                print('Number of examples: ' + str(X.shape[0]))
                clf.fit(scaled_X_train, y_train)
                print('Best params: ' + str(clf.best_params_))
                end_time = timeit.default_timer()
                print('Run time: %.4f minutes' %((end_time-start_time)/60))
                # evaluate
                # normalize test features
                scaled_X_test = X_scaler.transform(X_test)
                # predict
                y_pred = clf.predict(scaled_X_test)
                acc = accuracy_score(y_test, y_pred)
                # or just use clf.score(scaled_X_test, y_test)
                print('Accuracy: ' + str(acc))
                print('')

In [4]:
# Explore Datasets
grays = glob.glob('./datasets/gray-tiny/*.pkl')
grays.sort(key=os.path.getmtime)
rgbs = glob.glob('./datasets/rgb-tiny/*.pkl')
rgbs.sort(key=os.path.getmtime)
hsvs = glob.glob('./datasets/hsv-tiny/*.pkl')
hsvs.sort(key=os.path.getmtime)

dataset_types = [grays, rgbs, hsvs]

explore_datasets(dataset_types)

fit linear kernel for dataset: gray-tiny/dataset-pix-8-8-cell-1-1-histeq-1.pkl
Number of features: 576
Number of examples: 4000
Best params: {'C': 0.01, 'gamma': 0.01}
Run time: 1.3430 minutes
Accuracy: 0.916666666667

fit linear kernel for dataset: gray-tiny/dataset-pix-8-8-cell-1-1-histeq-0.pkl
Number of features: 576
Number of examples: 4000
Best params: {'C': 0.01, 'gamma': 0.01}
Run time: 1.4368 minutes
Accuracy: 0.89696969697

fit linear kernel for dataset: gray-tiny/dataset-pix-8-8-cell-2-2-histeq-1.pkl
Number of features: 1764
Number of examples: 4000
Best params: {'C': 0.01, 'gamma': 0.01}
Run time: 2.7847 minutes
Accuracy: 0.950757575758

fit linear kernel for dataset: gray-tiny/dataset-pix-8-8-cell-2-2-histeq-0.pkl
Number of features: 1764
Number of examples: 4000
Best params: {'C': 0.01, 'gamma': 0.01}
Run time: 3.1181 minutes
Accuracy: 0.940151515152

fit linear kernel for dataset: gray-tiny/dataset-pix-16-16-cell-1-1-histeq-1.pkl
Number of features: 144
Number of examples

---