In [1]:
# %load_ext autoreload
# %autoreload 1

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage import io
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

# %aimport fit_and_classify
import fit_and_classify

In [2]:
def extract_features(path, filenames):
    """Build a matrix of HOG descriptors, one row per image file.

    Every image is read with ``io.imread(..., plugin='matplotlib')`` and passed to
    ``fit_and_classify.extract_hog``; the resulting vectors are stacked row by row.

    Parameters
    ----------
    path : str
        Directory holding the images; it is joined to each file name with ``'/'``.
    filenames : sequence of str
        Image file names relative to ``path``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(filenames), hog_length)``, where ``hog_length`` is the
        length of the descriptor of the first image. For an empty ``filenames`` an
        array of shape ``(0, 0)`` is returned instead of raising ``IndexError``.
    """
    n_files = len(filenames)
    if n_files == 0:
        # Nothing to read, so the descriptor length is unknown.
        return np.zeros((0, 0))

    def _hog(name):
        # Read one image and compute its HOG descriptor.
        return fit_and_classify.extract_hog(io.imread(path + '/' + name, plugin='matplotlib'))

    # The first descriptor fixes the row width; keep it instead of recomputing it in the loop.
    first = _hog(filenames[0])
    data = np.zeros((n_files, len(first)))
    data[0, :] = first
    for i in range(1, n_files):
        data[i, :] = _hog(filenames[i])
    return data

In [3]:
# Ground-truth table of the training set; the columns used are `filename` and `class_id`.
gt = pd.read_csv('../data/train/gt.csv')
X_fnames, y = gt['filename'].values, gt['class_id'].values

### Cross-validation

In [42]:
# Split file names and labels into train/test parts. The fixed seed makes the
# partition (and every score computed from it) reproducible across kernel restarts.
X_tr_fnames, X_te_fnames, y_tr, y_te = train_test_split(X_fnames, y, random_state=42)
# Only the first 2000 training images are featurized for the grid search.
X_tr = extract_features("../data/train/", X_tr_fnames[:2000])
# X_te = extract_features("../data/train/", X_te_fnames[:1000])

In [5]:
# n_splits is pinned to 3 (the recorded results hold three scores per setting) rather
# than relying on the library default, and the shuffle is seeded for reproducibility.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Grid of SVC hyper-parameters explored below: 5 values of C x 5 values of gamma.
params = {'C': np.arange(100, 1001, 200), 'gamma': np.linspace(0.01, 0.001, 5)}
# Take exactly as many labels as there are feature rows so X_tr and y_tr_ stay aligned.
y_tr_ = y_tr[:len(X_tr)]

In [6]:
%%time
# Exhaustive grid search: every (C, gamma) pair is fitted and scored on each fold of `kf`.
# results[C][gamma] ends up holding the list of per-fold accuracies.
results = {}
size = len(params['C']) * len(params['gamma'])  # total number of (C, gamma) pairs
progress = 0  # pairs finished so far
for c in params['C']:
    t_start = time.time()
    per_gamma = {}
    results[c] = per_gamma
    print('C = ', c, end='; gamma = ')
    for gamma in params['gamma']:
        print(gamma, end=', ')
        fold_scores = []
        per_gamma[gamma] = fold_scores
        clf = OneVsRestClassifier(SVC(C=c, gamma=gamma, cache_size=5000), n_jobs=-1)
        for train_idx, test_idx in kf.split(X_tr, y_tr_):
            clf.fit(X_tr[train_idx], y_tr_[train_idx])
            predicted = clf.predict(X_tr[test_idx])
            fold_scores.append(accuracy_score(y_tr_[test_idx], predicted))
        progress += 1

    # One summary line per C value: share of the grid done and time spent on this C.
    elapsed = time.time() - t_start
    print('P = {:.2f}%; {:.2f}sec'.format(100 * progress / size, elapsed))

C =  100; gamma = 0.01, 0.00775, 0.0055, 0.00325, 0.001, P = 20.00%; 215.32sec
C =  300; gamma = 0.01, 0.00775, 0.0055, 0.00325, 0.001, P = 40.00%; 213.56sec
C =  500; gamma = 0.01, 0.00775, 0.0055, 0.00325, 0.001, P = 60.00%; 213.22sec
C =  700; gamma = 0.01, 0.00775, 0.0055, 0.00325, 0.001, P = 80.00%; 209.41sec
C =  900; gamma = 0.01, 0.00775, 0.0055, 0.00325, 0.001, P = 100.00%; 207.16sec
CPU times: user 15min 16s, sys: 9.64 s, total: 15min 26s
Wall time: 17min 38s


In [40]:
# Show the raw grid-search scores: results[C][gamma] is the list of per-fold accuracies.
results

{100: {0.001: [0.9019033674963397, 0.92792792792792789, 0.91705069124423966],
  0.0032499999999999994: [0.91361639824304541,
   0.92042042042042038,
   0.92933947772657455],
  0.0054999999999999997: [0.92532942898975112,
   0.91741741741741745,
   0.91397849462365588],
  0.0077499999999999999: [0.92386530014641288,
   0.92192192192192191,
   0.92933947772657455],
  0.01: [0.91947291361639827, 0.89789789789789787, 0.9339477726574501]},
 300: {0.001: [0.91068814055636893, 0.92642642642642647, 0.92012288786482332],
  0.0032499999999999994: [0.92093704245973651,
   0.91291291291291288,
   0.9124423963133641],
  0.0054999999999999997: [0.92386530014641288,
   0.91291291291291288,
   0.91397849462365588],
  0.0077499999999999999: [0.91800878477306003,
   0.92042042042042038,
   0.92012288786482332],
  0.01: [0.92240117130307464, 0.92042042042042038, 0.93548387096774188]},
 500: {0.001: [0.9019033674963397, 0.93093093093093093, 0.90476190476190477],
  0.0032499999999999994: [0.916544655929721

#### Check on a larger hold-out split

In [4]:
# Re-split the data for the final check. The fixed seed keeps the hold-out set
# reproducible, so the accuracies printed below can be regenerated.
X_tr_fnames, X_te_fnames, y_tr, y_te = train_test_split(X_fnames, y, random_state=42)
# Featurize the first 10000 training and the first 5000 test images.
X_tr = extract_features("../data/train/", X_tr_fnames[:10000])
X_te = extract_features("../data/train/", X_te_fnames[:5000])

In [10]:
# Candidate 1: the (C, gamma) pair with the best mean accuracy over the CV folds.
clf = OneVsRestClassifier(
    SVC(C=300, gamma=0.01, cache_size=5000),
    n_jobs=-1,
)

In [11]:
%%time
# Slice the labels to the number of feature rows actually extracted, so features and
# labels stay aligned even if the sample sizes in the extraction cell are changed.
clf.fit(X_tr, y_tr[:len(X_tr)])
print(accuracy_score(y_te[:len(X_te)], clf.predict(X_te)))

0.9844
CPU times: user 3min 45s, sys: 270 ms, total: 3min 45s
Wall time: 5min 16s


In [12]:
# Candidate 2: the (C, gamma) pair that reached the best single-fold accuracy in CV.
clf = OneVsRestClassifier(
    SVC(C=700, gamma=0.01, cache_size=5000),
    n_jobs=-1,
)

In [13]:
%%time
# Slice the labels to the number of feature rows actually extracted, so features and
# labels stay aligned even if the sample sizes in the extraction cell are changed.
clf.fit(X_tr, y_tr[:len(X_tr)])
print(accuracy_score(y_te[:len(X_te)], clf.predict(X_te)))

0.9844
CPU times: user 3min 46s, sys: 267 ms, total: 3min 46s
Wall time: 5min 5s
