In [2]:
# standard packages used to handle files
import sys
import os
import glob
import time

# commonly used library for data manipulation
import pandas as pd

# numerical
import numpy as np

# handle images - opencv
import cv2

# machine learning library
import sklearn
import sklearn.preprocessing

#used to serialize python objects to disk and load them back to memory
import pickle

#plotting
import matplotlib.pyplot as plt

# helper functions kindly provided for you by Matthias 
import helpers
# specific helper functions for feature extraction
import features

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import cross_val_score



In [3]:
# Root locations: where the image data is read from and where all
# generated artefacts (features, predictions) are written to.
DATA_BASE_PATH = './'
OUTPUT_PATH = './'

# Input images, one sub-folder per split.
DATA_TRAIN_PATH = os.path.join(DATA_BASE_PATH, 'train')
DATA_TEST_PATH = os.path.join(DATA_BASE_PATH, 'test')

# Extracted descriptors are stored per split below OUTPUT_PATH/features.
FEATURE_BASE_PATH = os.path.join(OUTPUT_PATH, 'features')
FEATURE_TRAIN_PATH = os.path.join(FEATURE_BASE_PATH, 'train')
FEATURE_TEST_PATH = os.path.join(FEATURE_BASE_PATH, 'test')

PREDICTION_PATH = os.path.join(OUTPUT_PATH, 'predictions')

# File name templates for the pickled features; '{}' is filled with the
# descriptor name via str.format.
FILEPATTERN_DESCRIPTOR_TRAIN = os.path.join(FEATURE_TRAIN_PATH, 'train_features_{}.pkl')
FILEPATTERN_DESCRIPTOR_TEST = os.path.join(FEATURE_TEST_PATH, 'test_features_{}.pkl')

# Make sure every output directory exists before anything is written
# (the parent 'features' folder is handled first, then its children).
for output_dir in (FEATURE_BASE_PATH, FEATURE_TRAIN_PATH, FEATURE_TEST_PATH, PREDICTION_PATH):
    helpers.createPath(output_dir)



# Experiment settings, kept together in one dict so they can be printed with the results.
# Disabled alternative: read the settings from a YAML file passed on the command line.
#import yaml
#with open(sys.argv[1]) as config_file:
#    config = yaml.load(config_file)

config = dict(
    codebook_size=1500,
    feature_count=500,
    regularization=2,
)
print(config)


# Every sub-directory of the training folder is treated as one class and its
# folder name is used as the label. Only directories are kept so that stray
# files in that folder (e.g. '.DS_Store' or a README) are not mistaken for classes.
folder_paths = [
    path
    for path in glob.glob(os.path.join(DATA_TRAIN_PATH, '*'))
    if os.path.isdir(path)
]
# Sort the names so the label order does not depend on the order glob returns.
label_strings = np.sort(np.array([os.path.basename(path) for path in folder_paths]))
num_classes = label_strings.shape[0]
print(label_strings)
print('Number of classes:', num_classes)



{'codebook_size': 1500, 'feature_count': 500, 'regularization': 2}
['bobcat' 'chihuahua' 'collie' 'dalmatian' 'german_shepherd' 'leopard'
 'lion' 'persian_cat' 'siamese_cat' 'tiger' 'wolf']
Number of classes: 11


In [12]:
# Load the previously extracted training descriptors for the chosen feature type.
descriptor_desired = 'sift_3000'
sift_train_pickle_path = FILEPATTERN_DESCRIPTOR_TRAIN.format(descriptor_desired)

# NOTE: unpickling can execute arbitrary code; only open feature files from a trusted source.
with open(sift_train_pickle_path, 'rb') as pkl_file_train:
    train_features_from_pkl_sift = pickle.load(pkl_file_train)

# One entry per training image, so the length is the number of images loaded.
num_train_images = len(train_features_from_pkl_sift)
print('Number of encoded train images: {}'.format(num_train_images))

Number of encoded train images: 3556


In [13]:

# Build the codebook from the SIFT descriptors of all training images.
# This is the slow step of the notebook (the recorded run took roughly ten minutes).
# NOTE(review): the size is hard-coded and differs from config['codebook_size'] (1500) — confirm which is intended.
sift_codebook_size = 2000
clustered_codebook = helpers.createCodebook(
    train_features_from_pkl_sift,
    codebook_size=sift_codebook_size,
)

print('Created SIFT codebook')


training took 625.105090379715 seconds
Created SIFT codebook


In [14]:
# Sanity check: shape of the descriptor array stored for the first training image.
first_image_descriptors = train_features_from_pkl_sift[0].data
print(first_image_descriptors.shape)

(3000, 128)


In [15]:

# Encode all train images: each image's SIFT descriptors are turned into one
# feature vector with the SIFT codebook, and its string label is collected in
# the same order so that row k of the data matches label k.
train_data_sift = [
    helpers.encodeImage(image_features.data, clustered_codebook)
    for image_features in train_features_from_pkl_sift
]
train_labels_old = [image_features.label for image_features in train_features_from_pkl_sift]

In [16]:
# Convert the class-name strings into integer ids. The encoder is fitted on the
# class names discovered in the training folder, then applied to the per-image labels.
label_encoder = sklearn.preprocessing.LabelEncoder().fit(label_strings)

# Show a few raw labels, then the encoded result, to eyeball the mapping.
print(train_labels_old[:10])
train_labels = label_encoder.transform(train_labels_old)
print(train_labels)

['bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat', 'bobcat']
[ 0  0  0 ... 10 10 10]


In [17]:
# Compare the SIFT-only baseline (feature_name None) against SIFT combined with a
# second descriptor type. Each variant is scored with 5-fold cross-validation
# using the negated log loss (values closer to 0 are better).
for feature_name in [None, 'daisy', 'freak']:
    print('Loading data for', feature_name)

    if feature_name is None:
        # Baseline: no second descriptor. A zero-width block keeps the SIFT
        # features unchanged after concatenation; padding with all-zero columns
        # (as before) only doubled the matrix size without adding information
        # for the linear kernel used below.
        train_data_second = np.asarray(train_data_sift)[:, :0]
    else:
        # NOTE: unpickling can execute arbitrary code; only open feature files from a trusted source.
        with open(FILEPATTERN_DESCRIPTOR_TRAIN.format(feature_name), 'rb') as pkl_file_train:
            train_features_from_pkl_second = pickle.load(pkl_file_train)

        # Keep this codebook under its own name: reusing `clustered_codebook`
        # would overwrite the SIFT codebook that other cells rely on.
        clustered_codebook_second = helpers.createCodebook(train_features_from_pkl_second, codebook_size=500)
        print('Created codebook for', feature_name)

        # Encode all train images with the second descriptor's codebook.
        # NOTE(review): assumes the images in this pickle are stored in the same
        # order as in the SIFT pickle, since rows are paired by position — confirm.
        train_data_second = [
            helpers.encodeImage(image_features.data, clustered_codebook_second)
            for image_features in train_features_from_pkl_second
        ]

    print('Encoded all feature vectors')

    # Place the second descriptor's features next to the SIFT features, per image.
    train_data = np.concatenate([train_data_sift, train_data_second], axis=1)

    # probability=True is required because the log-loss scorer needs class probabilities.
    simple_pipe = make_pipeline(SVC(random_state=0, probability=True, kernel='linear', C=2, verbose=True))

    scores1 = cross_val_score(simple_pipe, train_data, train_labels, cv=5, scoring='neg_log_loss')
    print(feature_name)
    print(scores1)
    # The scores are negated log losses, not accuracies, so label them accordingly.
    print("Average validation neg. log loss: ", scores1.mean(), ", stdev: ", scores1.std())

Loading data for None
Encoded all feature vectors
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]None
[-1.34088801 -1.27669807 -1.33705193 -1.30252009 -1.40936519]
Average validation accuracy:  -1.333304661312353 , stdev:  0.04474091876406463
Loading data for daisy
training took 28.593923091888428 seconds
Created codebook for daisy
Encoded all feature vectors
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]daisy
[-1.32093443 -1.22392106 -1.28655728 -1.28849392 -1.30669476]
Average validation accuracy:  -1.2853202911778214 , stdev:  0.03318645320324449
Loading data for freak
training took 23.360713958740234 seconds
Created codebook for freak
Encoded all feature vectors
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]freak
[-1.35627179 -1.29948971 -1.33668092 -1.30495668 -1.37990088]
Average validation accuracy:  -1.335459999040274 , stdev:  0.030443149264361938
