In [1]:
# Chapter 4 - Improving the models
# Classify cat and dog pictures with SVM
import os
import numpy as np
import sklearn as sk
from sklearn.svm import SVC

In [2]:
# PIL compatibility layer with Pillow-PIL
from PIL import Image
from PIL import ImageFilter

In [3]:
# Point at the training and testing image folders and list the files in each.
dataset_training_dir = 'datasets/cats_vs_dogs/train'
dataset_test_dir = 'datasets/cats_vs_dogs/test1'
# os.listdir already returns a fresh list of entry names (in arbitrary order).
train_images = list(os.listdir(dataset_training_dir))
test_images = list(os.listdir(dataset_test_dir))
# Report how many files were found and show the first entry of each folder.
print(
    "#{} training images, and #{} testing images.\nSample training file '{}'\nSample testing file '{}'".format(
        len(train_images),
        len(test_images),
        train_images[0],
        test_images[0],
    )
)

#25000 training images, and #12500 testing images.
Sample training file 'dog.8011.jpg'
Sample testing file '9733.jpg'


In [None]:
# Build one label per training file, in the same order as train_images:
# 1 when the file name contains 'dog', 0 otherwise.
labels = [1 if 'dog' in file_name else 0 for file_name in train_images]

In [None]:
# Show the first ten labels as a quick sanity check
label_sample = labels[:10]
print("Labeling Sample: {}".format(label_sample))

In [None]:
# Containers for the Pillow-based feature extraction:
# pixel_value holds the most recently computed value, features collects one entry per image.
pixel_value = 0
features = list()

In [None]:
# Feature extraction: for every training image, compute the pixel-value
# histogram of its 64x64 grayscale edge map and store it in `features`.
#
# `features` is reset here so the cell is idempotent: re-running it must not
# append a second copy of every histogram, which would leave `features`
# longer than (and misaligned with) the per-file `labels` list.
features = []
# Target size is the same for every image, so define it once outside the loop.
size = 64, 64
for item in train_images:
    print("---> Feature extraction for image '{}'".format(item))
    print("\tConvert with mysterious 'L' parameter...")
    # Mode 'L' is Pillow's 8-bit single-channel grayscale mode. convert()
    # returns a new image, so the source file can be closed right away.
    with Image.open(os.path.join(dataset_training_dir, item)) as source_image:
        im = source_image.convert('L')
    print("\tResize '{}'".format(size))
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # resampling filter and is available in old and new Pillow releases.
    im = im.resize(size, Image.LANCZOS)
    print("\tFind Edges")
    im = im.filter(ImageFilter.FIND_EDGES)
    print("\tCompute histogram of image")
    # histogram() returns a list of pixel counts, one per pixel value.
    pixel_value = im.histogram()
    print("\tAppend pixel value histogram to features")
    features.append(pixel_value)
    print("\tPixel value histogram size '{}', sample '{}'".format(len(pixel_value), pixel_value[:10]))

In [None]:
# Build a support vector classifier with its default settings (RBF kernel)
# and train it on the extracted histograms; fit() hands back the fitted
# estimator, which is what `clf` ends up bound to.
clf = SVC().fit(features, labels)

In [None]:
# Estimate model accuracy on the first 100 training samples.
# The histograms computed during feature extraction are reused as model input,
# so no further feature extraction is needed here.
# NOTE: these samples were part of the data the model was fitted on, so this
# figure is an optimistic estimate, not a measure of generalization.
results = []
total = 0
for index, item in enumerate(train_images[:100]):
    print("---> Calculating accuracy on image '{}'".format(item))
    # predict() expects a 2-D batch, hence the single-element list.
    x = clf.predict([features[index]])
    print("\tThe prediction looks like this '{}'".format(x))
    results.append(x[0])
print("Working on #{} results".format(len(results)))
# A prediction is correct when it EQUALS its label, for either class.
# (np.logical_and would only count samples where label and prediction are
# both 1, i.e. it would score every correctly classified cat as a miss.)
expected = np.asarray(labels[:len(results)])
predicted = np.asarray(results)
total = int(np.sum(expected == predicted))
# Guard against an empty sample so the percentage never divides by zero.
accuracy_pct = (total / len(results)) * 100 if results else 0.0
print("Accuracy: ({}/{})*100={:.3f}%".format(total, len(results), accuracy_pct))