# Feature Extraction
Simon Schellaert

This notebook is an adaptation of the provided notebook `Creating a visual bag-of-words.ipynb`. The logic is largely unchanged; the major differences are that we extract only the SIFT and DAISY features, and that we extract 3000 instead of 500 features.

## 0. Required dependencies ##

In [1]:
# standard packages used to handle files
import sys
import os 
import glob
import time

# commonly used library for data manipulation
import pandas as pd

# numerical
import numpy as np

# handle images - opencv
import cv2

# machine learning library
import sklearn
import sklearn.preprocessing

#used to serialize python objects to disk and load them back to memory
import pickle

#plotting
import matplotlib.pyplot as plt

# helper functions kindly provided for you by Matthias 
import helpers
# specific helper functions for feature extraction
import features

# tell matplotlib that we plot in a notebook
%matplotlib notebook


# root locations: where the images are read from and where results are written to
DATA_BASE_PATH = './'
OUTPUT_PATH = './'

# input image folders
DATA_TRAIN_PATH = os.path.join(DATA_BASE_PATH, 'train')
DATA_TEST_PATH = os.path.join(DATA_BASE_PATH, 'test')

# output folders
FEATURE_BASE_PATH = os.path.join(OUTPUT_PATH, 'features')
FEATURE_TRAIN_PATH = os.path.join(FEATURE_BASE_PATH, 'train')
FEATURE_TEST_PATH = os.path.join(FEATURE_BASE_PATH, 'test')
PREDICTION_PATH = os.path.join(OUTPUT_PATH, 'predictions')

# file name templates for the pickled features; '{}' is replaced by the descriptor key
FILEPATTERN_DESCRIPTOR_TRAIN = os.path.join(FEATURE_TRAIN_PATH, 'train_features_{}.pkl')
FILEPATTERN_DESCRIPTOR_TEST = os.path.join(FEATURE_TEST_PATH, 'test_features_{}.pkl')

# create the output folders in case they don't exist yet
for _output_dir in (FEATURE_BASE_PATH, FEATURE_TRAIN_PATH, FEATURE_TEST_PATH, PREDICTION_PATH):
    helpers.createPath(_output_dir)

## 1. Loading the data and the labels

In [2]:
# every entry directly below the train folder is treated as one class
folder_paths = glob.glob(os.path.join(DATA_TRAIN_PATH, '*'))

# class names are the entry names, kept in sorted order
class_names = [os.path.basename(folder_path) for folder_path in folder_paths]
label_strings = np.sort(np.array(class_names))
num_classes = len(label_strings)

# map each class name to the image paths found in its folder
train_paths = {
    label: helpers.getImgPaths(os.path.join(DATA_TRAIN_PATH, label))
    for label in label_strings
}

## 2. Extracting the features

As noted before, we only use the SIFT and DAISY features.

In [None]:
# descriptor key -> extraction callback; the key is also used in the output file names
descriptor_dict = dict(
    daisy=features.extractDAISYCallback,
    sift=features.extractSIFTCallback,
)

Now that the dictionary above lists all the features we want, we extract them from the train data and write them to the hard drive with the pickle module, so they can be loaded again later.

In [None]:
# Keep only the descriptors whose train pickle does not exist yet, so that
# re-running this cell does not extract (and overwrite) them again.
train_descriptor_dict = {
    key: callback
    for key, callback in descriptor_dict.items()
    if not os.path.isfile(FILEPATTERN_DESCRIPTOR_TRAIN.format(key))
}

if train_descriptor_dict:
    # one list per descriptor, filled class by class in the order of label_strings
    train_features_by_descriptor = {key: [] for key in train_descriptor_dict}

    for label_string in label_strings:
        print('extracting train features for class {} :'.format(label_string))

        extracted_features = features.extractFeatures(train_paths[label_string], train_descriptor_dict, label_string)

        # append the descriptors of this class to the matching descriptor list
        for key in train_features_by_descriptor:
            train_features_by_descriptor[key] += extracted_features[key]

    # write one pickle file per descriptor
    for descriptor_key, descriptor_features in train_features_by_descriptor.items():
        with open(FILEPATTERN_DESCRIPTOR_TRAIN.format(descriptor_key), 'wb') as pkl_file_train:
            pickle.dump(descriptor_features, pkl_file_train, protocol=pickle.HIGHEST_PROTOCOL)

Next, we do the same for the test set.

In [None]:
test_paths = helpers.getImgPaths(DATA_TEST_PATH)

# Keep only the descriptors whose test pickle does not exist yet, mirroring the
# train cell, so that re-running this cell does not extract them again.
test_descriptor_dict = {
    key: callback
    for key, callback in descriptor_dict.items()
    if not os.path.isfile(FILEPATTERN_DESCRIPTOR_TEST.format(key))
}

if test_descriptor_dict:
    print('extracting test features:')
    # the test images have no class, hence None is passed as the label
    test_features_by_descriptor = features.extractFeatures(test_paths, test_descriptor_dict, None)

    # write one pickle file per descriptor
    for descriptor_key, descriptor_features in test_features_by_descriptor.items():
        with open(FILEPATTERN_DESCRIPTOR_TEST.format(descriptor_key), 'wb') as pkl_file_test:
            pickle.dump(descriptor_features, pkl_file_test, protocol=pickle.HIGHEST_PROTOCOL)