# README

- I am using the flower_subset.npz file for the training and test images. I assume that the notebook and that file are in the same directory.

# Image Classification using RF and SVC

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

In [None]:
# read flowers images and labels from the given file
# (the context manager closes the .npz archive once the arrays have been read)
with np.load('flower_subset.npz') as flower_subset:
    train_images = flower_subset['train_images']
    train_labels = flower_subset['train_labels']
    test_images = flower_subset['test_images']
    test_labels = flower_subset['test_labels']

# HoG configuration
image_size = (64, 64)       # h x w in pixels
cell_size = (8, 8)          # h x w in pixels
block_size_cells = (4, 4)   # h x w in cells
# h x w in pixels; derived from the two values above so they cannot drift
# apart (evaluates to (32, 32))
block_size = (block_size_cells[0] * cell_size[0],
              block_size_cells[1] * cell_size[1])
bins = 4                    # number of orientation bins

# resize images and compute HoG features
def hog_features(images):
    """Downscale every image by a factor of two and compute its HoG descriptor.

    The descriptor is configured from the module-level ``image_size``,
    ``block_size``, ``cell_size`` and ``bins`` settings; ``cell_size`` is
    also used as the block stride.

    Parameters
    ----------
    images : iterable of numpy arrays
        Input images. Each one is multiplied by 255 before the 8-bit
        conversion, so pixel values are presumably floats in [0, 1]
        (TODO confirm against the contents of the data file).

    Returns
    -------
    list of numpy arrays
        One flattened descriptor of shape (1, n_features) per input image.
    """
    hog = cv2.HOGDescriptor(image_size, block_size, cell_size, cell_size, bins)
    features = []
    for image in images:
        # cv2 expects the target size as (width, height)
        half_size = (int(image.shape[1] * 0.5), int(image.shape[0] * 0.5))
        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv2.resize is `dst`, so a positional
        # cv2.INTER_AREA would not select the interpolation mode.
        image = cv2.resize(image, half_size, interpolation=cv2.INTER_AREA)
        image = image * 255
        # convertScaleAbs produces the 8-bit image handed to hog.compute
        feature = hog.compute(cv2.convertScaleAbs(image))
        features.append(feature.reshape(1, -1))
    return features

# extract the HoG descriptors of both splits and stack them into
# 2-D feature matrices (one row per image)
train_images_features = np.vstack(hog_features(train_images))
test_images_features = np.vstack(hog_features(test_images))

# train a non-linear (RBF kernel) SVM classifier using gamma = 'auto' and C = 1
flower_clf = svm.SVC(C=1., gamma='auto')
flower_clf.fit(train_images_features, train_labels)

# classify the test images
predictions = flower_clf.predict(test_images_features)

# calculate accuracy method
def calc_accuracy(list1, list2):
    """Return the percentage of positions at which two label sequences agree.

    Parameters
    ----------
    list1, list2 : sequences of labels (lists or 1-D arrays)
        Typically the predicted labels and the ground-truth labels. Both
        must have the same, non-zero length.

    Returns
    -------
    str
        The accuracy in percent followed by '%', e.g. '50.0%'.

    Raises
    ------
    ValueError
        If the sequences differ in length or are empty.
    """
    total = len(list2)
    if len(list1) != total:
        raise ValueError(
            'label sequences must have the same length, got '
            + str(len(list1)) + ' and ' + str(total)
        )
    if total == 0:
        raise ValueError('cannot compute the accuracy of empty label sequences')

    # count as a plain int so the formatted result does not depend on the
    # element type of the inputs
    match_count = sum(1 for a, b in zip(list1, list2) if a == b)
    return str(match_count / total * 100) + '%'

# report the test accuracy of the baseline SVC
baseline_svc_accuracy = calc_accuracy(predictions, test_labels)
print('Accuracy:', baseline_svc_accuracy)

Accuracy: 11.11111111111111%


In [None]:
# tuned SVC: 'gamma' and 'C' are chosen to achieve accuracy > 50%
accurate_flower_clf = svm.SVC(C=10, gamma=0.1)
accurate_flower_clf.fit(train_images_features, train_labels)

# classify the test images with the tuned model and report its accuracy
accurate_predictions = accurate_flower_clf.predict(test_images_features)
tuned_svc_accuracy = calc_accuracy(accurate_predictions, test_labels)
print('Accuracy:', tuned_svc_accuracy)

Accuracy: 58.88888888888889%


In [None]:
from sklearn.ensemble import RandomForestClassifier

# baseline random forest (RF): 10 trees, maximum depth 5, entropy split criterion
rf_clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    max_depth=5,
)
rf_clf.fit(train_images_features, train_labels)

# classify the test images and report the accuracy
rf_predictions = rf_clf.predict(test_images_features)
baseline_rf_accuracy = calc_accuracy(rf_predictions, test_labels)
print('Accuracy:', baseline_rf_accuracy)

Accuracy: 36.666666666666664%


In [None]:
# tuned RF: n_estimators and max_depth are raised to achieve test accuracy > 50%
accurate_rf_clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    max_depth=30,
)
accurate_rf_clf.fit(train_images_features, train_labels)

# classify the test images and report the accuracy
accurate_rf_predictions = accurate_rf_clf.predict(test_images_features)
tuned_rf_accuracy = calc_accuracy(accurate_rf_predictions, test_labels)
print('Accuracy:', tuned_rf_accuracy)

Accuracy: 52.22222222222223%


In [None]:
# SVC with different random state values
svc_random_states = (1, 5, 10, 50, 100)

# one tuned, probability-enabled SVC per random state
svc_by_seed = [
    svm.SVC(gamma=0.1, C=10, probability=True, random_state=seed)
    for seed in svc_random_states
]
(random_svc_clf1, random_svc_clf2, random_svc_clf3,
 random_svc_clf4, random_svc_clf5) = svc_by_seed

# fit every classifier on the same training features
for svc_model in svc_by_seed:
    svc_model.fit(train_images_features, train_labels)

# predict the test labels with each classifier
svc_predictions_by_seed = [
    svc_model.predict(test_images_features) for svc_model in svc_by_seed
]
(random_svc_1, random_svc_2, random_svc_3,
 random_svc_4, random_svc_5) = svc_predictions_by_seed

# print one accuracy per random state, in the order listed above
for svc_predictions in svc_predictions_by_seed:
    print(calc_accuracy(svc_predictions, test_labels))

58.88888888888889%
58.88888888888889%
58.88888888888889%
58.88888888888889%
58.88888888888889%


In [None]:
# RF with different random state values
rf_random_states = (1, 5, 10, 50, 100)

# one tuned random forest per random state
rf_by_seed = [
    RandomForestClassifier(n_estimators=100, max_depth=30,
                           random_state=seed, criterion='entropy')
    for seed in rf_random_states
]
rf_clf1, rf_clf5, rf_clf10, rf_clf50, rf_clf100 = rf_by_seed

# fit every forest on the same training features
for forest in rf_by_seed:
    forest.fit(train_images_features, train_labels)

# predict the test labels with each forest
rf_predictions_by_seed = [
    forest.predict(test_images_features) for forest in rf_by_seed
]
rf_1, rf_5, rf_10, rf_50, rf_100 = rf_predictions_by_seed

# print one accuracy per random state, in the order listed above
for forest_predictions in rf_predictions_by_seed:
    print(calc_accuracy(forest_predictions, test_labels))

47.77777777777778%
52.22222222222223%
52.22222222222223%
52.22222222222223%
46.666666666666664%


## Comparing Results of SVC and RF

Looking at the test accuracies of SVC and RF, we can say that SVC is the more accurate classifier once the hyperparameters are tuned (58.89% vs. 52.22%). Before tuning, with the initial hyperparameters, RF is better at classification (36.67% vs. 11.11%).

Also, during my executions, I noticed that SVC usually gets the same accuracy for the same hyperparameters on each run, while the accuracy of RF can change between runs with the same hyperparameters. This observation was made before using the random_state hyperparameter.

Comparing the performance of the classifiers with respect to changes in random_state, SVC is more robust and stable: it has the same accuracy (58.89%) for all of the random state values. On the other hand, RF is not that robust when it comes to changes in the random state; its accuracy ranges from 46.67% to 52.22%. This is expected: in scikit-learn, SVC uses random_state only to shuffle the data for its probability estimates, not for the decision function that predict relies on, whereas RF uses it for the bootstrap sampling and the feature selection that determine how the trees are built.
