## PART 1:  Extract HOG features from images using scikit-image

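## Summary:

Each training and test photo is converted to grayscale, resized to 256x256 pixels, and described by a 2048-dimensional HOG feature vector (8 orientations, 16x16-pixel cells, 1x1-cell blocks). The feature vectors and the corresponding image paths are saved to `train_image_HOGfeatures.h5` and `test_image_HOGfeatures.h5`.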

Reference: http://scikit-image.org/docs/dev/auto_examples/plot_hog.html

In [1]:
# Root folder that holds the Yelp CSV files and photo folders used below.
# Keep the trailing slash: later cells build paths by plain string
# concatenation (data_root + 'name'), not os.path.join.
# NOTE(review): this is a machine-specific absolute path; edit it before re-running.
data_root = '/Volumes/My Passport/yelp/'

import os
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from skimage import io, data, color, exposure
from skimage.feature import hog
from skimage.transform import resize

### Extract features from training images

In [2]:
def extract_hog_features(image_path, image_size=(256, 256), orientations=8,
                         pixels_per_cell=(16, 16), cells_per_block=(1, 1)):
    """Compute a HOG descriptor for the image stored at ``image_path``.

    The image is read, converted to grayscale, resized to ``image_size`` and
    passed to ``skimage.feature.hog``.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    image_size : tuple of int, optional
        (rows, cols) the grayscale image is resized to before HOG.
    orientations : int, optional
        Number of orientation bins, forwarded to ``hog``.
    pixels_per_cell : tuple of int, optional
        Cell size in pixels, forwarded to ``hog``.
    cells_per_block : tuple of int, optional
        Block size in cells, forwarded to ``hog``.

    Returns
    -------
    The 1-D feature vector returned by ``hog``. With the default arguments
    it has (256/16) * (256/16) * 8 = 2048 entries.
    """
    image = io.imread(image_path)
    if image.ndim == 3:
        # Keep only the first three channels (drops e.g. an alpha channel) so
        # rgb2gray always receives an (M, N, 3) array.
        image = color.rgb2gray(image[..., :3])
    # A 2-D array is already single-channel; recent scikit-image releases raise
    # when rgb2gray is handed one, so it is passed through unchanged.
    image_resized = resize(image, image_size)
    return hog(image_resized, orientations=orientations,
               pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block)

In [3]:
# Extract HOG features for every training image and save them to an .h5 file
# with two datasets: 'photo_id' (full image path) and 'feature' (HOG vector).

train_h5_path = data_root + 'train_image_HOGfeatures.h5'
# Length of one HOG vector with extract_hog_features' defaults:
# (256/16) * (256/16) cells * 8 orientations * (1*1) cells per block.
feature_dim = 2048

train_photos = pd.read_csv(data_root + 'train_photo_to_biz_ids.csv')
train_folder = data_root + 'train_photos/'
train_images = [os.path.join(train_folder, str(x) + '.jpg') for x in train_photos['photo_id']]  # get full filename

num_train = len(train_images)
print("Number of training images:  %d" % num_train)

# 'photo_id' is a fixed-width byte string, and values longer than the width are
# cut off silently. Keep the historical width of 54 as a minimum and widen it
# when a path is longer (assumes ASCII paths, so characters == bytes).
id_width = max([54] + [len(p) for p in train_images])

tic = time.time()

# Training Images
num_done = 0
# Open the file once: reopening it and growing both datasets by a single row
# for every image is pure overhead when the final row count is already known.
with h5py.File(train_h5_path, 'w') as f:
    photo_ids = f.create_dataset('photo_id', (num_train,), maxshape=(None,),
                                 dtype='|S%d' % id_width)
    features = f.create_dataset('feature', (num_train, feature_dim),
                                maxshape=(None, feature_dim))
    try:
        for i, image_path in enumerate(train_images):
            # Raises if the vector length differs from feature_dim.
            features[i, :] = extract_hog_features(image_path)
            photo_ids[i] = image_path
            num_done = i + 1
            if num_done % 10000 == 0 or num_done == num_train:
                f.flush()  # push finished rows to disk at every checkpoint
                print("Train images processed:  %d" % num_done)
    finally:
        # If the run stopped early, trim both datasets so the file only
        # contains rows that were actually written.
        if num_done < num_train:
            photo_ids.resize((num_done,))
            features.resize((num_done, feature_dim))

toc = time.time()
print('\nFeatures extracted in %fs' % (toc - tic))

Number of training images:  234842
Train images processed:  10000
Train images processed:  20000
Train images processed:  30000
Train images processed:  40000
Train images processed:  50000
Train images processed:  60000
Train images processed:  70000
Train images processed:  80000
Train images processed:  90000
Train images processed:  100000
Train images processed:  110000
Train images processed:  120000
Train images processed:  130000
Train images processed:  140000
Train images processed:  150000
Train images processed:  160000
Train images processed:  170000
Train images processed:  180000
Train images processed:  190000
Train images processed:  200000
Train images processed:  210000
Train images processed:  220000
Train images processed:  230000
Train images processed:  234842

Features extracted in 14866.624218s


In [4]:
### Check the file content

# Open read-only inside a `with` block so the handle is released even if one
# of the lookups below raises; a leaked handle would block rewriting the file.
with h5py.File(data_root + 'train_image_HOGfeatures.h5', 'r') as f:
    print('train_image_features.h5:')
    # List every dataset together with its shape.
    for key in f.keys():
        print("%s %s" % (key, f[key].shape))

    # Show the first stored path and the start of its feature vector.
    print("\nA photo: %s" % (f['photo_id'][0],))
    print("Its feature vector (first 10-dim):  %s  ..." % (f['feature'][0, :10],))

train_image_features.h5:
feature (234842, 2048)
photo_id (234842,)

A photo: /Volumes/My Passport/yelp/train_photos/204149.jpg
Its feature vector (first 10-dim):  [ 0.34791261  0.0733768   0.01276903  0.00645388  0.15348932  0.09106192
  0.04581008  0.26656917  0.04787623  0.01552287]  ...


### Extract features from test images

In [6]:
# Extract HOG features for every test image and save them to an .h5 file
# with two datasets: 'photo_id' (full image path) and 'feature' (HOG vector).

test_h5_path = data_root + 'test_image_HOGfeatures.h5'
# Length of one HOG vector with extract_hog_features' defaults:
# (256/16) * (256/16) cells * 8 orientations * (1*1) cells per block.
feature_dim = 2048

test_photos = pd.read_csv(data_root + 'test_photo_to_biz.csv')
test_folder = data_root + 'test_photos/'
# .unique() keeps each photo id once even if the CSV lists it several times.
test_images = [os.path.join(test_folder, str(x) + '.jpg') for x in test_photos['photo_id'].unique()]

num_test = len(test_images)
print("Number of test images:  %d" % num_test)

# 'photo_id' is a fixed-width byte string, and values longer than the width are
# cut off silently. Keep the historical width of 54 as a minimum and widen it
# when a path is longer (assumes ASCII paths, so characters == bytes).
id_width = max([54] + [len(p) for p in test_images])

tic = time.time()

# Test Images
num_done = 0
# Open the file once: reopening it and growing both datasets by a single row
# for every image is pure overhead when the final row count is already known.
with h5py.File(test_h5_path, 'w') as f:
    photo_ids = f.create_dataset('photo_id', (num_test,), maxshape=(None,),
                                 dtype='|S%d' % id_width)
    features = f.create_dataset('feature', (num_test, feature_dim),
                                maxshape=(None, feature_dim))
    try:
        for i, image_path in enumerate(test_images):
            # Raises if the vector length differs from feature_dim.
            features[i, :] = extract_hog_features(image_path)
            photo_ids[i] = image_path
            num_done = i + 1
            if num_done % 20000 == 0 or num_done == num_test:
                f.flush()  # push finished rows to disk at every checkpoint
                print("Test images processed:  %d" % num_done)
    finally:
        # If the run stopped early, trim both datasets so the file only
        # contains rows that were actually written.
        if num_done < num_test:
            photo_ids.resize((num_done,))
            features.resize((num_done, feature_dim))

toc = time.time()
print('\nFeatures extracted in %fs' % (toc - tic))

Number of test images:  237152
Test images processed:  20000
Test images processed:  40000
Test images processed:  60000
Test images processed:  80000
Test images processed:  100000
Test images processed:  120000
Test images processed:  140000
Test images processed:  160000
Test images processed:  180000
Test images processed:  200000
Test images processed:  220000
Test images processed:  237152

Features extracted in 15702.819823s


In [None]:
### Check the file content

# Open read-only inside a `with` block so the handle is released even if one
# of the lookups below raises; a leaked handle would block rewriting the file.
with h5py.File(data_root + 'test_image_HOGfeatures.h5', 'r') as f:
    print('test_image_features.h5:')
    # List every dataset together with its shape.
    for key in f.keys():
        print("%s %s" % (key, f[key].shape))

    # Show the first stored path and the start of its feature vector.
    print("\nA photo: %s" % (f['photo_id'][0],))
    print("Its feature vector (first 10-dim):  %s  ..." % (f['feature'][0, :10],))