In [None]:
# Adds SIFT and SURF functionalities.
# NOTE(review): this appends a hard-coded, machine-specific absolute path to
# the module search path; presumably it points at the OpenCV build that
# provides SIFT/SURF -- confirm, and adjust for other environments.
import sys
sys.path.append("/usr/local/lib/python3.8/site-packages/")

In [None]:
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as lin

In [None]:
import utils        # Utility to measure code execution
                    # and plot confusion matrices.
import vocabulary   # Read images and produce visual vocabulary.
import classifiers  # Implements different classifiers.

In [None]:
# Reload modules without reloading the kernel.
# Only `vocabulary` is reloaded here; edits made to `utils` or `classifiers`
# are not picked up by this cell.
import importlib
importlib.reload(vocabulary)

---

# Main

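The expected execution times quoted below were measured with `n_clusters = 96`; the configuration cell below currently sets `n_clusters = 48`, so actual times may differ.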

## 1. and 2.: reading images, computing visual words and histograms

In [None]:
# Dataset locations, given as paths relative to the working directory.
train_path = "../dataset/train"
test_path = "../dataset/test"

# NOTE(review): the markdown above says the quoted execution times refer to
# n_clusters = 96, but this cell sets 48 -- confirm which value is intended.
n_clusters = 48 # Size of dictionary.
# Passed as the third argument of vocabulary.k_means_words; presumably the
# number of descriptors used for the clustering -- TODO confirm.
n_descriptors = 100000

Expect ~ 16 minutes for the following cell.

In [None]:
with utils.codeTimer("Build train set"):

    # Read images and compute descriptors, saving them in a dataframe
    # (at this point the frame holds one row per descriptor).
    train_df = vocabulary.compute_descriptors(train_path)

    # Fail early with a clear message instead of an opaque lookup error
    # when nothing was extracted (e.g. `train_path` is wrong or empty).
    if len(train_df) == 0:
        raise ValueError("No descriptors computed from '{}'"
                         .format(train_path))

    # Positional access to the first descriptor: unlike `[0]`, `.iloc[0]`
    # does not depend on the frame having an index label equal to 0.
    print("Total number of {}-dimensional descriptors: {}"
          .format(len(train_df['descriptor'].iloc[0]), len(train_df)))

    # Compute kmeans clustering using descriptors to obtain visual dictionary.

    # Note: cluster centers are stored in this object,
    # and can be obtained using `kmeans.cluster_centers_`.
    kmeans = vocabulary.k_means_words(train_df, n_clusters, n_descriptors)

    # Aggregate descriptor info, making dataframe more compact:
    # one row per (image_id, label) pair.
    # Now the third column contains the list of descriptors.
    train_df = train_df.groupby(['image_id', 'label'],
                                as_index = False).agg({'descriptor':
                                                       (lambda x: list(x))})

    # Compute histograms and add them to dataframe.
    train_histograms = vocabulary.compute_histogram(train_df, kmeans)
    train_df['histogram'] = train_histograms

Expect ~ 23 minutes for the following cell.

In [None]:
# Build the test-set histograms the same way as for the train set,
# reusing the visual words extracted from the training images.

with utils.codeTimer("Build test set"):

    # Extract the descriptors, then collapse them so that each
    # (image_id, label) pair keeps the list of its descriptors.
    test_df = (
        vocabulary.compute_descriptors(test_path)
        .groupby(['image_id', 'label'], as_index = False)
        .agg({'descriptor': lambda rows: list(rows)})
    )

    # `kmeans` is the object obtained on the train set; it is not recomputed.
    test_histograms = vocabulary.compute_histogram(test_df, kmeans)
    test_df['histogram'] = test_histograms

----

## 3.: Nearest Neighbor classifier

Expect ~ 15 minutes for the following cell.

In [None]:
# Run the nearest-neighbor classifier from `classifiers` on the train and
# test dataframes, inside the utils.codeTimer context labelled "NN classifier".
# `true` is presumably the list of ground-truth test labels -- confirm.
with utils.codeTimer("NN classifier"):
    true, predicted = classifiers.nn_classifier(train_df, test_df)
    
# Adding predicted labels to dataframe.
# Later classifier cells overwrite `true`, `predicted` and this column.
test_df["predicted"] = predicted

In [None]:
# Confusion matrix for the NN classifier results computed in the previous cell.
# `filename` is presumably where the figure is saved -- confirm in
# utils.plot_confusion_matrix.
utils.plot_confusion_matrix(true, predicted,
                            title = "Confusion matrix: NN classifier",
                            filename = "nn.pdf")

----

## 4. and 5.: linear SVM

In [None]:
# Run the linear SVM classifier from `classifiers` on the train and test
# dataframes, inside the utils.codeTimer context.
# This rebinds `true` and `predicted` from the previous classifier.
with utils.codeTimer("Linear SVM classifier"):
    true, predicted = classifiers.linear_SVM_classifier(train_df, test_df)
    
# Adding predicted labels to dataframe (replaces the previous predictions).
test_df["predicted"] = predicted

In [None]:
# Confusion matrix for the linear SVM results computed in the previous cell.
# `filename` is presumably where the figure is saved -- confirm in
# utils.plot_confusion_matrix.
utils.plot_confusion_matrix(true, predicted,
                            title = "Confusion matrix: linear SVM",
                            filename = "linsvm.pdf")

----

## (optional) 6. and 7.: Gaussian SVM

Expect ~ 15 minutes for the following cell.

In [None]:
# Run the Gaussian SVM classifier from `classifiers` with dist = 'chi'
# (per the timer label, the chi squared distance), inside the
# utils.codeTimer context.
# This rebinds `true` and `predicted` from the previous classifier.
with utils.codeTimer("Gaussian SVM classifier, chi squared distance"):
    true, predicted = classifiers.gaussian_SVM_classifier(train_df, test_df,
                                                          dist = 'chi')
    
# Adding predicted labels to dataframe (replaces the previous predictions).
test_df["predicted"] = predicted

In [None]:
# Confusion matrix for the Gaussian SVM results computed in the previous cell.
# The title is a raw string so that the backslashes of the math markup
# are passed through unchanged.
utils.plot_confusion_matrix(true, predicted,
                            title = r"Confusion matrix: $\mathbf{\chi^2}$ gaussian kernel SVM",
                            filename = "gaussvm.pdf")

----

## (optional) 8.: Error Correcting Output Code

In [None]:
# Run the ECOC classifier from `classifiers` with 100 classifiers,
# inside the utils.codeTimer context.
# This rebinds `true` and `predicted` from the previous classifier.
with utils.codeTimer("ECOC SVM classifier"):
    true, predicted = classifiers.ecoc_classifier(train_df, test_df,
                                                  n_classifiers = 100)
    
# Adding predicted labels to dataframe (replaces the previous predictions).
test_df["predicted"] = predicted

In [None]:
# Confusion matrix for the ECOC results computed in the previous cell
# (n_classifiers = 100).
# `filename` is presumably where the figure is saved -- confirm in
# utils.plot_confusion_matrix.
utils.plot_confusion_matrix(true, predicted,
                            title = "Confusion matrix: ECOC",
                            filename = "ecoc.pdf")

##### Accuracy against number of binary classifiers.

In [None]:
# Test accuracy using a different number of classifiers.

accuracy = []
num_classifiers = np.arange(25, 225, 25)  # 25, 50, ..., 200

for n in num_classifiers:
    true, predicted = classifiers.ecoc_classifier(train_df, test_df,
                                                  n_classifiers = n)
    # Adding predicted labels to dataframe.
    test_df["predicted"] = predicted

    # Use one title and one output file per run: with a fixed "ecoc.pdf"
    # every iteration would overwrite the previous figure, as well as the
    # one produced by the n_classifiers = 100 cell above.
    # NOTE(review): this relies on plot_confusion_matrix returning the
    # accuracy -- confirm in utils.
    a = utils.plot_confusion_matrix(true, predicted,
                                    title = "Confusion matrix: ECOC, {} classifiers"
                                            .format(n),
                                    filename = "ecoc_{}.pdf".format(n))
    accuracy.append(a)

In [None]:
# Accuracy as a function of the number of ECOC classifiers.

fig, ax = plt.subplots()

# Title and axis captions, all in bold.
ax.set_title("ECOC: accuracy vs number of classifiers", fontweight = 'bold')
ax.set_xlabel("Number of classifiers", fontweight = 'bold')
ax.set_ylabel("Accuracy", fontweight = 'bold')

# The curve itself.
ax.plot(num_classifiers, accuracy, color = "gray")

# Horizontal range, with one tick per tested number of classifiers.
ax.set_xlim(10, 200)
ax.xaxis.set_ticks(num_classifiers)

# Write the figure to disk before showing it.
fig.savefig("ecocb.pdf", bbox_inches = 'tight')

plt.show()

----

## (optional) 9.: Soft assignment

Expect ~ 50 minutes for the following cell.

In [None]:
# NOTE: this cell will overwrite the 'hard'-assigned histograms
# with the newly computed soft assignment ones.
# Both the `train_histograms` / `test_histograms` variables and the
# 'histogram' columns are replaced, so re-running an earlier classifier cell
# after this one would presumably use the soft-assignment histograms.

with utils.codeTimer("Soft assignment histograms"):
    # Same `kmeans` object as for the hard assignment; it is not recomputed.
    train_histograms = vocabulary.compute_kernel_codebook(train_df, kmeans)
    train_df['histogram'] = train_histograms
    
    test_histograms = vocabulary.compute_kernel_codebook(test_df, kmeans)
    test_df['histogram'] = test_histograms


Expected execution time for the following cell: not recorded.

In [None]:
# Run the Gaussian SVM classifier (dist = 'chi') again, this time after the
# 'histogram' columns have been replaced by the soft-assignment histograms.
# The timer label is the same string as in the earlier Gaussian SVM cell.
with utils.codeTimer("Gaussian SVM classifier, chi squared distance"):
    true, predicted = classifiers.gaussian_SVM_classifier(train_df, test_df,
                                                          dist = 'chi')
    
# Adding predicted labels to dataframe (replaces the previous predictions).
test_df["predicted"] = predicted

In [None]:
# Confusion matrix for the soft-assignment ("SA") Gaussian SVM results
# computed in the previous cell.
# The title is a raw string so that the backslashes of the math markup
# are passed through unchanged.
utils.plot_confusion_matrix(true, predicted,
                            title = r"Confusion matrix: $\mathbf{\chi^2}$ gaussian kernel SVM - SA",
                            filename = "soft.pdf")