# HOG + SVM

In [1]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt

### Read in Data

In [2]:
def read_and_extract_hog(path, hog, print_intvl=1000):
    """Compute one HOG feature row for every ``.jpg`` image in a directory.

    Each image is read with ``cv2.imread``, resized to ``hog.winSize`` and
    passed to ``hog.compute``; the descriptor is flattened into a single row.

    Parameters
    ----------
    path : str
        Directory that is scanned (non-recursively) for image files.
    hog : object
        Descriptor exposing ``winSize`` and ``compute(img)``, e.g. a
        ``cv2.HOGDescriptor``.
    print_intvl : int, optional
        Print the running image count every ``print_intvl`` images. A falsy
        value (``0`` or ``None``) disables the progress output.

    Returns
    -------
    (list of str, numpy.ndarray)
        The file names without their extension, and the feature matrix with
        one row per image, both in the same (sorted-by-file-name) order.

    Raises
    ------
    ValueError
        If ``path`` contains no readable ``.jpg`` image.
    """
    filenames = []
    X = []
    skipped = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the returned matrix reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        stem, ext = os.path.splitext(file)
        if ext.lower() != '.jpg':
            continue
        img = cv2.imread(os.path.join(path, file))
        if img is None:
            # imread reports an unreadable/corrupt file by returning None
            # instead of raising; skip it rather than crash inside resize.
            skipped.append(file)
            continue
        img_resize = cv2.resize(img, hog.winSize)
        X.append(hog.compute(img_resize).reshape((1, -1)))
        filenames.append(stem)
        if print_intvl and len(X) % print_intvl == 0:
            print(len(X), end=' ')
    print()
    if skipped:
        print('Skipped %d unreadable image(s): %s' % (len(skipped), ', '.join(skipped)))
    if not X:
        raise ValueError('No readable .jpg images found in %r' % (path,))
    return (filenames, np.concatenate(X, axis=0))

In [3]:
# HOG descriptor geometry
winSize = (64, 64)       # read_and_extract_hog resizes every image to this size
cellSize = (8, 8)
blockSize = (16, 16)
blockStride = (8, 8)
nbins = 9
hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins)

# Training images: cats get label 0, dogs get label 1
train_cat_path = '../data/train/cat'
train_dog_path = '../data/train/dog'
_, X_train_cat = read_and_extract_hog(train_cat_path, hog)
_, X_train_dog = read_and_extract_hog(train_dog_path, hog)
y_train_cat = np.zeros(len(X_train_cat))
y_train_dog = np.ones(len(X_train_dog))

# Stack both classes: cat rows first, then dog rows
X_train = np.vstack([X_train_cat, X_train_dog])
y_train = np.hstack([y_train_cat, y_train_dog])

# Test images: keep the file names so predictions can be matched to them
test_path = '../data/test'
test_ids, X_test = read_and_extract_hog(test_path, hog)

1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 
1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 
1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 


### Specify prediction pipeline and tune hyperparameters

In [6]:
# Function for parameter selection
from sklearn.model_selection import GridSearchCV
def grid_search_param_selection(X, y, nfolds, model, param_grid):
    """Run a cross-validated grid search and return the best parameter setting.

    Parameters
    ----------
    X, y : array-like
        Training features and targets, passed to ``GridSearchCV.fit``.
    nfolds : int
        Forwarded to ``GridSearchCV`` as ``cv``.
    model : estimator
        The estimator whose hyperparameters are searched.
    param_grid : dict
        Forwarded to ``GridSearchCV`` as the parameter grid.

    Returns
    -------
    dict
        The ``best_params_`` attribute of the fitted search.
    """
    # n_jobs=-1 / verbose=1: run in parallel and log progress while fitting.
    grid_search = GridSearchCV(model, param_grid, cv=nfolds, n_jobs=-1, verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Seed for the classifier. It has to be assigned before SGDClassifier is built
# below; without it this cell raises NameError on a fresh kernel.
random_state = 2018

# Specify hyperparameter range and grid-search
l1_ratios = [0.1, 0.2, 0.4, 0.6, 0.8]
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]
param_grid = {'l1_ratio': l1_ratios, 'alpha': alphas}
nfolds = 5
# NOTE(review): per the scikit-learn docs, l1_ratio is only used when
# penalty='elasticnet'. The classifier below keeps the default penalty, so the
# l1_ratio axis of the grid probably has no effect - confirm before relying on it.
# NOTE(review): the scaler is fit on the full training set before the CV folds
# are formed, so validation folds influence the scaling statistics.
X_train_scaled = StandardScaler().fit_transform(X_train)
best_params = grid_search_param_selection(X_train_scaled, y_train, nfolds,
                                          SGDClassifier(random_state=random_state), param_grid)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  1.2min finished


In [14]:
best_params

{'alpha': 0.1, 'l1_ratio': 0.1}

### Train on full training set and predict test set

In [15]:
random_state = 2018

# Standardise the HOG features, then fit the SGD classifier on the scaled data.
# Disabled optional first step: PCA(n_components=0.95, random_state=random_state)
pipe = make_pipeline(
    StandardScaler(),
    SGDClassifier(alpha=0.1, l1_ratio=0.1, random_state=random_state),
)

# Fit on the complete training set, then label the test images.
pipe.fit(X_train, y_train)
yhat_test = pipe.predict(X_test)



In [16]:
import pandas as pd
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../output', exist_ok=True)

# One row per test image: file name (without extension) and predicted label.
df = pd.DataFrame({
    'id': test_ids,
    'label': yhat_test
})
df.to_csv('../output/predictions_hog_svm_no_pca.csv', index=False)

### Cross-validation on the training set (out-of-fold predictions)

In [17]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions from 5-fold cross-validation of the whole pipeline
cv_predict = cross_val_predict(
    pipe, X_train, y_train,
    cv=5, n_jobs=-1, verbose=1,
)
# True labels are passed first, cross-validated predictions second
confusion_matrix(y_train, cv_predict)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.5s finished


array([[9084, 3416],
       [3080, 9420]])

In [18]:
# Fraction of training samples whose cross-validated prediction matches the label
np.mean(y_train == cv_predict)

0.74016