In [1]:
from os.path import join
from fit_and_classify import fit_and_classify, extract_hog
from glob import glob
from numpy import zeros
from os.path import basename, join
from skimage.io import imread
from tqdm import tqdm
from sklearn.model_selection import cross_validate, GroupKFold, GridSearchCV
from sklearn import svm
from notify import notify

def read_gt(gt_dir):
    fgt = open(join(gt_dir, 'gt.csv'))
    next(fgt)
    lines = fgt.readlines()

    filenames = []
    labels = zeros(len(lines))
    groups = zeros(len(lines))
    for i, line in enumerate(lines):
        filename, label, group = line.rstrip('\n').split(',')
        filenames.append(filename)
        labels[i] = int(label)
        groups[i] = int(group)

    return filenames, labels, groups

def extract_features(path, filenames):
    hog_length = len(extract_hog(imread(join(path, filenames[0]))))
    data = zeros((len(filenames), hog_length))
    for i in tqdm(range(0, len(filenames))):
        filename = join(path, filenames[i])
        data[i, :] = extract_hog(imread(filename))
    return data

def cross_validate_clf(clf, n_splits=3):
    group_kfold = GroupKFold(n_splits=n_splits)
    result = cross_validate(
        estimator=clf,
        X=features,
        y=labels,
        groups=groups,
        cv=group_kfold,
        scoring='accuracy',
        return_train_score=True,
        n_jobs=3
    )
    return {
        'train_score': result['train_score'],
        'test_score': result['test_score']
    }

In [15]:
from skimage.io import imread, imsave, imshow
from skimage.filters import sobel_h, sobel_v
from skimage.color import rgb2gray
from skimage.transform import resize
from sklearn import svm
import numpy as np

def grad_magn_orient(img):
    dx = sobel_h(img)
    dy = sobel_v(img)
    return np.hypot(dx, dy), np.pi + np.arctan2(dx, dy)

def hog(img):
    bin_count = 8
    N_SEGMENTS = 14
    seg_h = (img.shape[0] + N_SEGMENTS - 1) // N_SEGMENTS
    seg_w = (img.shape[1] + N_SEGMENTS - 1) // N_SEGMENTS
    indent = 3
    hist = np.zeros((N_SEGMENTS-2*indent, N_SEGMENTS-2*indent, bin_count))
    magn, orient = grad_magn_orient(img)
    for i in range(indent, N_SEGMENTS - indent):
        for j in range(indent, N_SEGMENTS - indent):
            orient_seg = orient[i*seg_h : (i+1)*seg_h, j*seg_w : (j+1)*seg_w]
            magn_seg = magn[i*seg_h : (i+1)*seg_h, j*seg_w : (j+1)*seg_w]
            result = np.histogram(
                orient_seg,
                bins=bin_count,
                range=(-np.pi, np.pi),
                weights=magn_seg
            )[0]
            result /= (np.linalg.norm(result) + 1e-6)
            hist[i - indent, j - indent] = result
    return hist.flatten()

def extract_hog(img):
    image_resized = resize(img, (140, 140), anti_aliasing=True)
    return hog(rgb2gray(image_resized))

In [16]:
train_dir = 'public_tests/00_test_img_input/train/'
filenames, labels, groups = read_gt(train_dir)

In [17]:
features = extract_features(train_dir, filenames)
features.shape

100%|██████████| 39209/39209 [11:51<00:00, 55.09it/s]


(39209, 512)

In [18]:
%%time
clf = svm.SVC(gamma="scale", kernel='rbf', C=16)
result = cross_validate_clf(clf)
notify(f'14x14, indent=3, rbf, C=16, {result}')

14x14, indent=3, rbf, C=16, {'train_score': array([1., 1., 1.]), 'test_score': array([0.92729358, 0.9379922 , 0.93655172])}
CPU times: user 384 ms, sys: 197 ms, total: 581 ms
Wall time: 12min 3s


In [12]:
%%time
clf = svm.LinearSVC(dual=False)
cross_validate_clf(clf)

CPU times: user 71.7 ms, sys: 144 ms, total: 216 ms
Wall time: 2min 53s


{'fit_time': array([169.94207025, 171.51208878, 171.65262055]),
 'score_time': array([0.08732867, 0.0675683 , 0.04119205]),
 'test_score': array([0.75749235, 0.76886612, 0.75862069]),
 'train_score': array([0.91009989, 0.90899349, 0.91047058])}

In [13]:
%%time
clf = svm.LinearSVC(dual=True)
cross_validate_clf(clf)

CPU times: user 46.4 ms, sys: 52.1 ms, total: 98.5 ms
Wall time: 2min 58s


{'fit_time': array([168.45046496, 178.55216527, 176.55208707]),
 'score_time': array([0.07305455, 0.03891444, 0.04365897]),
 'test_score': array([0.75603976, 0.76741341, 0.7594636 ]),
 'train_score': array([0.90983199, 0.91017987, 0.91112046])}

In [21]:
%%time
parameters = {'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.25, 0.5, 1]}
svc = svm.SVC(gamma="scale")
group_kfold = GroupKFold(n_splits=3)
clf = GridSearchCV(
    svc,
    parameters,
    cv=group_kfold,
    n_jobs=3
)
clf.fit(
    X=features,
    y=labels,
    groups=groups
)
notify(f'I have finished. Best params is {clf.best_params_}. Best score is {clf.best_score_}. CV results is {clf.cv_results_}')

I have finished. Best params is {'C': 1, 'kernel': 'rbf'}. Best score is 0.8178734474227856. CV results is {'mean_fit_time': array([  67.79227742,  212.91076612,  246.69034362,   75.66905355,
       1944.91685462,  171.68165127,   75.26056973,  152.15376774,
        154.02126463]), 'std_fit_time': array([1.91259267e+00, 8.25753689e+00, 6.38447837e+00, 1.94837566e+00,
       2.49480853e+03, 4.89821411e+00, 2.51969454e+00, 9.73268255e+00,
       2.22478805e+00]), 'mean_score_time': array([  96.3252008 ,  177.04769063,  188.5040772 ,  102.77892057,
       3669.81173539,  169.50102687,   97.67173735,  163.05978672,
        162.33038036]), 'std_score_time': array([1.78599121e+00, 6.33499482e+00, 6.19682129e+00, 3.28895923e+00,
       2.49489388e+03, 7.94303161e+00, 7.38631362e+00, 4.07875636e+00,
       1.15574685e+01]), 'param_C': masked_array(data=[0.25, 0.25, 0.25, 0.5, 0.5, 0.5, 1, 1, 1],
             mask=[False, False, False, False, False, False, False, False,
                   False

In [22]:
%%time
parameters = {'C':[0.1, 0.3, 0.5, 1, 2]}
svc = svm.SVC(gamma="scale")
group_kfold = GroupKFold(n_splits=3)
clf = GridSearchCV(
    svc,
    parameters,
    cv=group_kfold,
    n_jobs=3
)
clf.fit(
    X=features,
    y=labels,
    groups=groups
)
notify(f'I have finished. Best params is {clf.best_params_}')

CPU times: user 7min 22s, sys: 387 ms, total: 7min 22s
Wall time: 1h 11min 6s


GridSearchCV(cv=GroupKFold(n_splits=3), error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=3, param_grid={'C': [0.1, 0.3, 0.5, 1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [31]:
clf.best_params_

{'C': 2}

In [5]:
%%time
parameters = {'C':[0.1, 0.3, 0.5, 1, 2, 4, 8]}
svc = svm.LinearSVC()
group_kfold = GroupKFold(n_splits=3)
clf = GridSearchCV(
    svc,
    parameters,
    cv=group_kfold,
    n_jobs=3
)
clf.fit(
    X=features,
    y=labels,
    groups=groups
)
notify(f'I have finished. Best params is {clf.best_params_}')



I have finished. Best params is {'C': 0.1}
CPU times: user 2min 12s, sys: 251 ms, total: 2min 13s
Wall time: 22min 36s


In [7]:
clf.best_score_

0.7719401157897422

In [8]:
clf.cv_results_

{'mean_fit_time': array([112.49994715, 130.68060025, 189.55095522, 175.23486733,
        183.21680911, 201.01593757, 216.3277417 ]),
 'std_fit_time': array([ 3.96266838,  2.74050432,  3.72693469,  2.60450198,  1.20119837,
        12.05550314, 13.66163109]),
 'mean_score_time': array([0.0774289 , 0.06504027, 0.06626654, 0.05795519, 0.05435793,
        0.06494141, 0.04405673]),
 'std_score_time': array([0.00520845, 0.00031973, 0.0099168 , 0.00815267, 0.00969629,
        0.00031717, 0.01138722]),
 'param_C': masked_array(data=[0.1, 0.3, 0.5, 1, 2, 4, 8],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1},
  {'C': 0.3},
  {'C': 0.5},
  {'C': 1},
  {'C': 2},
  {'C': 4},
  {'C': 8}],
 'split0_test_score': array([0.76659021, 0.76414373, 0.76192661, 0.75535168, 0.75022936,
        0.73998471, 0.72782875]),
 'split1_test_score': array([0.78109947, 0.77704717, 0.77353009, 0.7677957 , 0.76213778,
       

In [6]:
%%time
parameters = {'C':[0.05, 0.1, 0.3, 1.0]}
svc = svm.LinearSVC()
group_kfold = GroupKFold(n_splits=3)
clf = GridSearchCV(
    svc,
    parameters,
    cv=group_kfold,
    n_jobs=3
)
clf.fit(
    X=features,
    y=labels,
    groups=groups
)
notify(f'I have finished. Best params is {clf.best_params_}. Best score is {clf.best_score_}.')

I have finished. Best params is {'C': 0.1}. Best score is 0.7609222372414497.
CPU times: user 18.1 s, sys: 172 ms, total: 18.3 s
Wall time: 2min 43s


In [8]:
clf.cv_results_

{'mean_fit_time': array([11.7760156 , 16.40993396, 32.53084254, 75.76127736]),
 'std_fit_time': array([0.87303238, 0.94474288, 1.66640147, 2.00775077]),
 'mean_score_time': array([0.08418282, 0.04998048, 0.05277809, 0.03559152]),
 'std_score_time': array([0.01608375, 0.00893657, 0.01062165, 0.00231066]),
 'param_C': masked_array(data=[0.05, 0.1, 0.3, 1.0],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.05}, {'C': 0.1}, {'C': 0.3}, {'C': 1.0}],
 'split0_test_score': array([0.75542813, 0.75359327, 0.75405199, 0.75053517]),
 'split1_test_score': array([0.76733695, 0.76924841, 0.76840737, 0.76443153]),
 'split2_test_score': array([0.75969349, 0.75992337, 0.75931034, 0.75609195]),
 'mean_test_score': array([0.76082022, 0.76092224, 0.76059068, 0.75702007]),
 'std_test_score': array([0.00492826, 0.0064324 , 0.0059321 , 0.00571303]),
 'rank_test_score': array([2, 1, 3, 4], dtype=int32)}

In [30]:
%%time
parameters = {'C':[12, 14, 16, 18]}
svc = svm.SVC(gamma='scale', kernel='rbf')
group_kfold = GroupKFold(n_splits=3)
clf = GridSearchCV(
    svc,
    parameters,
    cv=group_kfold,
    n_jobs=3
)
clf.fit(
    X=features,
    y=labels,
    groups=groups
)
notify(f'I have finished. Best params is {clf.best_params_}. Best score is {clf.best_score_}. CV results is {clf.cv_results_}')

I have finished. Best params is {'C': 14}. Best score is 0.8851029100461628. CV results is {'mean_fit_time': array([101.66486883, 101.31647984, 102.67569645, 111.41642006]), 'std_fit_time': array([2.00742793, 1.49671578, 0.99503831, 2.10547507]), 'mean_score_time': array([143.69837523, 142.53930306, 142.5631125 , 198.484833  ]), 'std_score_time': array([0.92019416, 1.41108557, 1.33918453, 2.43057248]), 'param_C': masked_array(data=[12, 14, 16, 18],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 12}, {'C': 14}, {'C': 16}, {'C': 18}], 'split0_test_score': array([0.87385321, 0.87400612, 0.87362385, 0.87400612]), 'split1_test_score': array([0.89525193, 0.89525193, 0.89509901, 0.89502256]), 'split2_test_score': array([0.88597701, 0.88605364, 0.88559387, 0.88582375]), 'mean_test_score': array([0.8850264 , 0.88510291, 0.88477135, 0.88494988]), 'std_test_score': array([0.00876503, 0.00870278, 0.00878971, 0.00860532]), 'rank_te

In [25]:
clf.cv_results_

{'mean_fit_time': array([119.18861985, 117.54446514, 113.83335567, 113.62335499]),
 'std_fit_time': array([2.39207686, 0.16834283, 1.36316436, 0.72590079]),
 'mean_score_time': array([161.97537041, 157.79711445, 156.68563199, 153.22638822]),
 'std_score_time': array([2.8898718 , 1.36166758, 1.28903877, 1.04988127]),
 'param_C': masked_array(data=[0.8, 1.0, 1.2, 1.4],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.8}, {'C': 1.0}, {'C': 1.2}, {'C': 1.4}],
 'split0_test_score': array([0.85879205, 0.86062691, 0.86284404, 0.86437309]),
 'split1_test_score': array([0.88309504, 0.8868415 , 0.88798838, 0.88928817]),
 'split2_test_score': array([0.87386973, 0.87586207, 0.87808429, 0.87954023]),
 'mean_test_score': array([0.87191716, 0.87444209, 0.87630391, 0.87773215]),
 'std_test_score': array([0.01002074, 0.01075296, 0.0103457 , 0.01025519]),
 'rank_test_score': array([4, 3, 2, 1], dtype=int32)}