# HOG + XGBoost

In [10]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt

### Read in Data

In [3]:
def read_and_extract_hog(path, hog, print_intvl=1000):
    """Compute one HOG feature row for every ``.jpg`` image in a directory.

    Each image is read with ``cv2.imread``, resized to ``hog.winSize`` and
    passed to ``hog.compute``; the result is flattened into a single row.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).
    hog : object
        Descriptor exposing ``winSize`` and ``compute(img)``, e.g. a
        ``cv2.HOGDescriptor``.
    print_intvl : int
        A running count is printed every ``print_intvl`` processed images.

    Returns
    -------
    (filenames, X) : (list of str, numpy.ndarray)
        ``filenames[i]`` is the file name without its extension and ``X[i]``
        is the matching feature row. Rows follow sorted file-name order.

    Raises
    ------
    ValueError
        If the directory yields no readable ``.jpg`` image.
    """
    import warnings  # function-scope import keeps the notebook's import cell untouched

    filenames = []
    X = []
    count = 0
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and therefore the id/feature pairing on disk) reproducible.
    for file in sorted(os.listdir(path)):
        stem, ext = os.path.splitext(file)
        if ext.lower() != '.jpg':
            continue
        filepath = os.path.join(path, file)
        img = cv2.imread(filepath)
        if img is None:
            # cv2.imread reports an unreadable/corrupt file by returning None
            # instead of raising; without this guard cv2.resize would fail
            # with an error that does not name the offending file.
            warnings.warn('Skipping unreadable image: %s' % filepath)
            continue
        img_resize = cv2.resize(img, hog.winSize)
        hog_values = hog.compute(img_resize).reshape((1, -1))
        filenames.append(stem)
        X.append(hog_values)
        count += 1
        if count % print_intvl == 0: print(count, end=' ')
    print()
    if not X:
        # np.concatenate([]) would raise an opaque "need at least one array".
        raise ValueError('No readable .jpg images found in %r' % (path,))
    return (filenames, np.concatenate(X, axis=0))

In [4]:
# HOG geometry: 64x64 detection window, 16x16 blocks moved in 8-pixel steps,
# 8x8 cells and 9 orientation bins per cell histogram.
winSize, blockSize = (64, 64), (16, 16)
blockStride, cellSize = (8, 8), (8, 8)
nbins = 9
hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins)

# Train: extract features per class directory, then label cats 0 and dogs 1.
train_cat_path, train_dog_path = '../data/train/cat', '../data/train/dog'
_, X_train_cat = read_and_extract_hog(train_cat_path, hog)
_, X_train_dog = read_and_extract_hog(train_dog_path, hog)
y_train_cat = np.zeros(len(X_train_cat))
y_train_dog = np.ones(len(X_train_dog))

# Stack cats first, dogs second, so the rows of X_train line up with y_train.
X_train = np.vstack((X_train_cat, X_train_dog))
y_train = np.hstack((y_train_cat, y_train_dog))

# Test: file names (without extension) are kept as the submission ids.
test_path = '../data/test'
test_ids, X_test = read_and_extract_hog(test_path, hog)

1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 
1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 
1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 


### Hyperparameter Tuning

[This tutorial](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/) gives the following guidelines on how to select the best parameters:

1. Start with a relatively high learning rate - 0.1 is usual, 0.05-0.3 is fine. The optimum number of trees for this learning rate will pop up as we run the cv function
2. Tune tree-specific parameters (max_depth, min_child_weight, gamma, subsample, colsample_bytree) for the chosen learning rate and number of trees
3. Tune regularization parameters (lambda, alpha)
4. Lower the learning rate and decide the optimal parameters

### Step 1: Find optimum number of trees

In [16]:
import pandas as pd
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``(X, y)`` and print metrics measured on that same data.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with the estimator's
    booster parameters, ``cv_folds`` folds, the ``auc`` and ``logloss``
    metrics and ``early_stopping_rounds``; the estimator's current
    ``n_estimators`` is the cap on boosting rounds. The number of rows in the
    returned CV table then replaces ``n_estimators`` on the estimator.

    The estimator is modified in place and also returned. The printed
    accuracy, AUC and log loss are computed from predictions on ``X`` itself,
    i.e. on the data the model was fitted on.
    """
    if useTrainCV:
        booster_params = alg.get_xgb_params()
        dtrain = xgb.DMatrix(X, label=y)
        max_rounds = alg.get_params()['n_estimators']
        cv_history = xgb.cv(
            booster_params,
            dtrain,
            num_boost_round=max_rounds,
            nfold=cv_folds,
            metrics=['auc', 'logloss'],
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept by xgb.cv.
        alg.set_params(n_estimators=cv_history.shape[0])

    # Fit on the full data set passed in.
    alg.fit(X, y, eval_metric='auc')

    # Hard labels and positive-class probabilities on the fitting data.
    y_pred = alg.predict(X)
    y_prob = alg.predict_proba(X)[:, 1]

    print("\nModel Report")
    report_lines = (
        ("Accuracy : %.4g", metrics.accuracy_score, y_pred),
        ("AUC Score (Train): %f", metrics.roc_auc_score, y_prob),
        ("Log Loss : %f", metrics.log_loss, y_prob),
    )
    for template, scorer, estimate in report_lines:
        print(template % scorer(y, estimate))

    return alg

In [None]:
from xgboost import XGBClassifier
# Step-1 baseline: fixed learning rate with a generous cap on the number of
# trees; modelfit() trims n_estimators via cross-validated early stopping.
xgb1 = XGBClassifier(
    # Boosting schedule
    learning_rate=0.3,
    n_estimators=1000,      # upper bound only; replaced inside modelfit()
    # Tree shape (tuned in later steps)
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    # Row / column sampling (left at common starting values)
    subsample=0.8,
    colsample_bytree=0.8,
    # Task and runtime settings
    objective='binary:logistic',
    scale_pos_weight=1,     # classes are balanced, so no re-weighting
    nthread=4,
    seed=2018,
)
alg = modelfit(xgb1, X_train, y_train)

Model Report

Accuracy : 0.9284

AUC Score (Train): 0.979140

Log Loss : 0.255223

In [15]:
# Display the estimator's parameters; modelfit() replaced n_estimators through
# set_params(), so the value shown is the cross-validated round count.
xgb1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=76,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=2018,
       silent=True, subsample=0.8)

Best n_estimators for learning rate 0.3: 76

Best n_estimators for learning rate 0.1: 492

### Step 2: Find optimal max_depth and min_child_weight

In [None]:
from sklearn.model_selection import GridSearchCV

# Coarse grid over the two tree-complexity parameters.
param_test1 = {
    'max_depth': range(3, 10, 2),       # 3, 5, 7, 9
    'min_child_weight': range(1, 6, 2)  # 1, 3, 5
}

xgb1 = XGBClassifier(
    learning_rate=0.3, # Kept high so each grid point trains quickly
    n_estimators=76,   # Round count found by modelfit() for learning_rate=0.3
    max_depth=5,       # Placeholder; overridden by the grid
    min_child_weight=1,# Placeholder; overridden by the grid
    gamma=0,           # Tuned in a later step
    subsample=0.8,     # Commonly used start value, 0.5-0.9 is fine
    colsample_bytree=0.8, # Same as above
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1, # Our dataset is not imbalanced, but leave default
    seed=2018
)

# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where supplying it raises TypeError. With equally sized
# folds the fold-averaged score is the same either way.
gsearch1 = GridSearchCV(
    estimator=xgb1,
    param_grid=param_test1,
    scoring='neg_log_loss',
    n_jobs=-1,
    cv=5
)
gsearch1.fit(X_train, y_train)

In [30]:
# Summary of the first grid search. Scores are negative log loss, so values
# closer to zero are better.
print("Best parameters set and log loss found on development set:", end="\n\n")
print(gsearch1.best_params_, gsearch1.best_score_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# One line per grid point: mean score and twice the standard deviation across folds.
means = gsearch1.cv_results_['mean_test_score']
stds = gsearch1.cv_results_['std_test_score']
for params, mean, std in zip(gsearch1.cv_results_['params'], means, stds):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))
print()

Best parameters set and log loss found on development set:

{'max_depth': 5, 'min_child_weight': 3} -0.5109441752548004

Grid scores on development set:

-0.512 (+/-0.015) for {'max_depth': 3, 'min_child_weight': 1}
-0.513 (+/-0.018) for {'max_depth': 3, 'min_child_weight': 3}
-0.511 (+/-0.019) for {'max_depth': 3, 'min_child_weight': 5}
-0.511 (+/-0.017) for {'max_depth': 5, 'min_child_weight': 1}
-0.511 (+/-0.018) for {'max_depth': 5, 'min_child_weight': 3}
-0.512 (+/-0.020) for {'max_depth': 5, 'min_child_weight': 5}
-0.541 (+/-0.022) for {'max_depth': 7, 'min_child_weight': 1}
-0.535 (+/-0.014) for {'max_depth': 7, 'min_child_weight': 3}
-0.530 (+/-0.018) for {'max_depth': 7, 'min_child_weight': 5}
-0.567 (+/-0.018) for {'max_depth': 9, 'min_child_weight': 1}
-0.555 (+/-0.024) for {'max_depth': 9, 'min_child_weight': 3}
-0.543 (+/-0.026) for {'max_depth': 9, 'min_child_weight': 5}



Dig a bit deeper by searching a finer range of values around the best parameters found above.

In [29]:
# Finer grid centred on the best coarse result (max_depth=5, min_child_weight=3).
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [2, 3, 4]
}

xgb2 = XGBClassifier(
    learning_rate=0.3, # Selected to speed up training
    n_estimators=76,   # Optimal for the chosen learning rate
    max_depth=5,       # Best value from the coarse grid; overridden by this grid
    min_child_weight=3,# Best value from the coarse grid; overridden by this grid
    gamma=0,           # Tuned in a later step
    subsample=0.8,     # Commonly used start value, 0.5-0.9 is fine
    colsample_bytree=0.8, # Same as above
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1, # Our dataset is not imbalanced, but leave default
    seed=2018
)

# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where supplying it raises TypeError. With equally sized
# folds the fold-averaged score is the same either way.
gsearch2 = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test2,
    scoring='neg_log_loss',
    n_jobs=-1,
    cv=5
)
gsearch2.fit(X_train, y_train)

# Scores are negative log loss, so values closer to zero are better.
print("Best parameters set and log loss found on development set:")
print()
print(gsearch2.best_params_, gsearch2.best_score_)
print()
print("Grid scores on development set:")
print()
means = gsearch2.cv_results_['mean_test_score']
stds = gsearch2.cv_results_['std_test_score']
# One line per grid point: mean score and twice the std across folds.
for mean, std, params in zip(means, stds, gsearch2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

Best parameters set and log loss found on development set:

{'max_depth': 4, 'min_child_weight': 3} -0.5087565969602718

Grid scores on development set:

-0.512 (+/-0.014) for {'max_depth': 4, 'min_child_weight': 2}
-0.509 (+/-0.014) for {'max_depth': 4, 'min_child_weight': 3}
-0.510 (+/-0.012) for {'max_depth': 4, 'min_child_weight': 4}
-0.511 (+/-0.022) for {'max_depth': 5, 'min_child_weight': 2}
-0.511 (+/-0.018) for {'max_depth': 5, 'min_child_weight': 3}
-0.511 (+/-0.017) for {'max_depth': 5, 'min_child_weight': 4}
-0.528 (+/-0.022) for {'max_depth': 6, 'min_child_weight': 2}
-0.529 (+/-0.019) for {'max_depth': 6, 'min_child_weight': 3}
-0.520 (+/-0.023) for {'max_depth': 6, 'min_child_weight': 4}



### Step 3: Tune gamma, based on parameters tuned above

In [31]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)]
}

xgb3 = XGBClassifier(
    learning_rate=0.3, # Selected to speed up training
    n_estimators=76,   # Optimal for the chosen learning rate
    max_depth=4,       # Chosen through grid search
    min_child_weight=3,# Chosen through grid search
    gamma=0,           # Placeholder; overridden by the grid
    subsample=0.8,     # Commonly used start value, 0.5-0.9 is fine
    colsample_bytree=0.8, # Same as above
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1, # Our dataset is not imbalanced, but leave default
    seed=2018
)

# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where supplying it raises TypeError. With equally sized
# folds the fold-averaged score is the same either way.
gsearch3 = GridSearchCV(
    estimator=xgb3,
    param_grid=param_test3,
    scoring='neg_log_loss',
    n_jobs=-1,
    cv=5
)
gsearch3.fit(X_train, y_train)

# Scores are negative log loss, so values closer to zero are better.
print("Best parameters set and log loss found on development set:")
print()
print(gsearch3.best_params_, gsearch3.best_score_)
print()
print("Grid scores on development set:")
print()
means = gsearch3.cv_results_['mean_test_score']
stds = gsearch3.cv_results_['std_test_score']
# One line per grid point: mean score and twice the std across folds.
for mean, std, params in zip(means, stds, gsearch3.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

Best parameters set and log loss found on development set:

{'gamma': 0.4} -0.508565354911047

Grid scores on development set:

-0.509 (+/-0.014) for {'gamma': 0.0}
-0.509 (+/-0.013) for {'gamma': 0.1}
-0.509 (+/-0.013) for {'gamma': 0.2}
-0.509 (+/-0.013) for {'gamma': 0.3}
-0.509 (+/-0.014) for {'gamma': 0.4}



Because 0.4 is at the upper edge of our search range, we extend the range to see whether larger gamma values improve the score further.

In [32]:
# Extend the gamma search past the previous upper bound of 0.4.
param_test4 = {
    'gamma': [0.4, 0.5, 0.6]
}

# Reuses the xgb3 estimator from the previous gamma search; only gamma varies.
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where supplying it raises TypeError. With equally sized
# folds the fold-averaged score is the same either way.
gsearch4 = GridSearchCV(
    estimator=xgb3,
    param_grid=param_test4,
    scoring='neg_log_loss',
    n_jobs=-1,
    cv=5
)
gsearch4.fit(X_train, y_train)

# Scores are negative log loss, so values closer to zero are better.
print("Best parameters set and log loss found on development set:")
print()
print(gsearch4.best_params_, gsearch4.best_score_)
print()
print("Grid scores on development set:")
print()
means = gsearch4.cv_results_['mean_test_score']
stds = gsearch4.cv_results_['std_test_score']
# One line per grid point: mean score and twice the std across folds.
for mean, std, params in zip(means, stds, gsearch4.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

Best parameters set and log loss found on development set:

{'gamma': 0.4} -0.508565354911047

Grid scores on development set:

-0.509 (+/-0.014) for {'gamma': 0.4}
-0.509 (+/-0.014) for {'gamma': 0.5}
-0.509 (+/-0.014) for {'gamma': 0.6}



Recalibrate the number of boosting rounds, which might have changed now that the tree parameters and gamma have been tuned.

In [33]:
# Re-run the round-count search with the tuned tree parameters; modelfit()
# replaces n_estimators with the cross-validated value and refits.
xgb5 = XGBClassifier(
    # Boosting schedule
    learning_rate=0.3,
    n_estimators=1000,      # upper bound only; replaced inside modelfit()
    # Tree shape, all chosen through the grid searches above
    max_depth=4,
    min_child_weight=3,
    gamma=0.4,
    # Row / column sampling (not tuned)
    subsample=0.8,
    colsample_bytree=0.8,
    # Task and runtime settings
    objective='binary:logistic',
    scale_pos_weight=1,     # classes are balanced, so no re-weighting
    nthread=4,
    seed=2018,
)
alg = modelfit(xgb5, X_train, y_train)


Model Report
Accuracy : 0.9357
AUC Score (Train): 0.983136
Log Loss : 0.235981


While it is possible to further tune *subsample* and *colsample_bytree*, the improvement would be marginal, so I stop here and produce the deliverables for the Kaggle competition page.

In [35]:
# Class probabilities for the test images from the model returned by modelfit();
# column 1 is what gets written out as the submission 'label'.
yhat_test = alg.predict_proba(X_test)

In [41]:
# Submission table: one row per test image, with the file name (without
# extension) as 'id' and the column-1 probability as 'label'.
submission_path = '../output/predictions_hog_xgboost.csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df = pd.DataFrame({
    'id': test_ids,
    'label': yhat_test[:, 1]
})
df.to_csv(submission_path, index=False)

### In-sample CV to get a sense of actual accuracy

In [42]:
from sklearn.base import clone
# Unfitted copy of `alg` carrying the same parameters.
# NOTE(review): no later cell visible in this notebook references alg_clone;
# confirm whether the cross-validation below was meant to use it.
alg_clone = clone(alg)

In [45]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions on the training set to estimate real accuracy.
# NOTE(review): this configuration (learning_rate=0.1, 1000 trees, no early
# stopping) differs from the fitted `alg` used for the submission - confirm
# that evaluating a different model here is intended.
xgb6 = XGBClassifier(
    # Boosting schedule
    learning_rate=0.1,
    n_estimators=1000,      # used as-is; nothing trims it in this cell
    # Tree shape, all chosen through the grid searches above
    max_depth=4,
    min_child_weight=3,
    gamma=0.4,
    # Row / column sampling (not tuned)
    subsample=0.8,
    colsample_bytree=0.8,
    # Task and runtime settings
    objective='binary:logistic',
    scale_pos_weight=1,     # classes are balanced, so no re-weighting
    nthread=4,
    seed=2018,
)
cv_predict = cross_val_predict(xgb6, X_train, y_train, cv=5, n_jobs=-1, verbose=1)
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_train, cv_predict)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 80.1min finished


array([[9904, 2596],
       [2640, 9860]])

In [46]:
# Accuracy of the out-of-fold predictions produced by cross_val_predict above.
metrics.accuracy_score(y_train, cv_predict)

0.79056