In [1]:
from pathlib import Path
import os
import pandas as pd
import cv2
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as lin

In [2]:
# Helpers that read every image together with its category (the name of its parent directory).

def read_image(dir_path):
    """
    Lazily walk `dir_path` and yield every readable image with its category.

    Parameters
    ----------
    dir_path : str or path-like
        Root directory; it is traversed recursively with `os.walk`.

    Yields
    ------
    tuple
        `(image, label)` where `image` is the array returned by `cv2.imread`
        and `label` is the base name of the directory containing the file.

    Notes
    -----
    Directories and files are visited in sorted order so that the sequence of
    yielded images is the same from one run to the next.
    Files that OpenCV cannot decode are skipped instead of being yielded as
    `None`.
    """

    for subdir, dirs, files in os.walk(dir_path):
        # Sorting `dirs` in place makes os.walk descend in a stable order.
        dirs.sort()
        label = os.path.basename(subdir)
        for file in sorted(files):
            # Ignore useless files.
            if file == ".DS_Store":
                continue
            image = cv2.imread(os.path.join(subdir, file))
            # cv2.imread signals an unreadable / non-image file with None.
            if image is None:
                continue
            yield (image, label)
                

def compute_descriptors(dir_path):
    """
    Extract ORB descriptors for every image found under `dir_path`.

    ORB is the feature extractor used here (the original author left a
    pending note about choosing it over SIFT/SURF).

    Parameters
    ----------
    dir_path : str or path-like
        Root directory handed to `read_image`.

    Returns
    -------
    pandas.DataFrame
        One row per descriptor with the columns
        `image_id` (consecutive integer, assigned only to images that
        produced at least one descriptor), `descriptor` (one row of the
        array returned by ORB) and `label` (the image category).
    """

    # Lists to contain data.
    image_ids = []
    descriptors = []
    labels = []

    # Build ORB object.
    orb = cv2.ORB_create()

    image_id = 0

    for image, label in read_image(dir_path):
        # An unreadable file may come through as None; ORB cannot process it.
        if image is None:
            continue
        # Detect keypoints and compute their descriptors.
        _, des = orb.detectAndCompute(image, None)
        # `des` is None when nothing was detected. Testing the descriptors
        # (rather than comparing the keypoints to `[]`) works whether OpenCV
        # returns the keypoints as a list or as a tuple.
        if des is None or len(des) == 0:
            continue

        descriptors.extend(des)
        image_ids.extend([image_id] * len(des))
        labels.extend([label] * len(des))

        image_id += 1

    des_df = pd.DataFrame({'image_id': image_ids,
                           'descriptor': descriptors,
                           'label': labels},
                          columns=['image_id', 'descriptor', 'label'])

    return des_df

In [3]:
def k_means_words(descriptors_df, n_clusters, num_descriptors, random_state=None):
    """
    Build the visual vocabulary by clustering a random sample of descriptors.

    Parameters
    ----------
    descriptors_df : pandas.DataFrame
        Frame with a `descriptor` column holding one descriptor vector per row.
    n_clusters : int
        Number of clusters, i.e. the size of the vocabulary.
    num_descriptors : int
        Maximum number of descriptors to sample for the clustering. When fewer
        descriptors are available, all of them are used.
    random_state : int or None, optional
        Seed forwarded to both the sampling step and `KMeans`. The default
        `None` keeps the previous unseeded behaviour.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.

    Raises
    ------
    ValueError
        If `descriptors_df` contains no descriptors.
    """

    available = len(descriptors_df)
    if available == 0:
        raise ValueError("descriptors_df contains no descriptors to cluster")

    # Sample at most `num_descriptors` descriptors; asking pandas for more
    # rows than exist (without replacement) would raise.
    sample_size = min(num_descriptors, available)
    sampled = descriptors_df['descriptor'].sample(n=sample_size,
                                                  random_state=random_state)
    data = np.stack(sampled.tolist())

    # Compute the kmeans algorithm on the sampled descriptors.
    # `n_jobs` is deliberately not passed: the argument was removed from
    # KMeans in recent scikit-learn releases and raises a TypeError there.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data)
    return kmeans

In [4]:
def find_nearest_term(word, dictionary):
    """
    Return the index of the dictionary term closest to `word`.

    Closeness is the Euclidean norm of the difference between `word` and
    each term.

    Parameters
    ----------
    word : array-like
        A single descriptor vector.
    dictionary : array-like or iterable of array-like
        The candidate terms, one per entry, each with as many values as `word`.

    Returns
    -------
    numpy integer
        Position in `dictionary` of the nearest term (the first one on ties).

    Raises
    ------
    ValueError
        If `dictionary` is empty.
    """
    if not isinstance(dictionary, np.ndarray):
        dictionary = list(dictionary)

    # Work in floating point: subtracting unsigned integer arrays (such as
    # uint8 descriptors) would silently wrap around and distort distances.
    terms = np.asarray(dictionary, dtype=np.float64)
    if terms.size == 0:
        raise ValueError("dictionary must contain at least one term")
    terms = terms.reshape(len(terms), -1)
    query = np.asarray(word, dtype=np.float64).reshape(1, -1)

    # One norm per term, computed in a single vectorised call.
    distances = lin.norm(terms - query, axis=1)

    return np.argmin(distances)

def compute_histogram(descriptors_df, kmeans):
    """
    Build one normalised visual-word histogram per row of `descriptors_df`.

    Each descriptor is assigned to its nearest centroid (Euclidean distance)
    and the per-centroid counts are divided by the row's total count.

    Parameters
    ----------
    descriptors_df : pandas.DataFrame
        Frame whose `descriptor` column holds, for every row, a sequence of
        descriptor vectors.
    kmeans : object
        Fitted clustering object exposing `cluster_centers_`.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per row, in row order, with one entry per centroid.
        Rows that have descriptors sum to 1; rows without any descriptor are
        all zeros.
    """
    # Extract visual words (centroids) from kmeans computation.
    words = np.asarray(kmeans.cluster_centers_, dtype=np.float64)

    num_words = len(words)
    num_images = len(descriptors_df)

    histograms = np.zeros((num_images, num_words))

    # Rows are addressed by position, not by index label, so the frame may
    # carry any index (e.g. after filtering) without corrupting the result.
    for position, image_descriptors in enumerate(descriptors_df['descriptor']):
        if len(image_descriptors) == 0:
            continue
        descs = np.asarray(image_descriptors, dtype=np.float64)
        descs = descs.reshape(len(image_descriptors), -1)
        # Pairwise distances between every descriptor and every word:
        # (m, 1, d) - (1, k, d) -> (m, k, d) -> norm over d -> (m, k).
        distances = lin.norm(descs[:, np.newaxis, :] - words[np.newaxis, :, :],
                             axis=2)
        nearest = np.argmin(distances, axis=1)
        histograms[position] = np.bincount(nearest, minlength=num_words)

    # Normalise each row by its total count, leaving empty rows at zero
    # instead of producing NaN from a 0/0 division.
    totals = histograms.sum(axis=1, keepdims=True)
    np.divide(histograms, totals, out=histograms, where=totals > 0)

    # More convenient format as list of arrays.
    return list(histograms)

In [10]:
from scipy.stats import wasserstein_distance

def nn_classifier(train_df, test_df):
    """
    Label every test image with the label of its nearest training image.

    The distance between two images is `wasserstein_distance` applied to
    their `histogram` entries.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame with `histogram` and `label` columns.
    test_df : pandas.DataFrame
        Test frame with `histogram` and `label` columns.

    Returns
    -------
    tuple
        `(true_labels, predicted_labels)`: the `label` column of `test_df`
        and a list with one predicted label per test row, in row order.

    Raises
    ------
    ValueError
        If there are test images to classify but `train_df` is empty.
    """
    true_labels = test_df['label']
    predicted_labels = []

    # Pull the training data out once instead of re-iterating the frame for
    # every test image.
    train_histograms = train_df['histogram'].tolist()
    train_labels = train_df['label'].tolist()

    if len(test_df) > 0 and not train_histograms:
        raise ValueError("train_df must contain at least one image")

    # For each test image.
    for test_histogram in test_df['histogram']:
        # Compute distance with each train image.
        # NOTE(review): the histograms are passed as the sample values of
        # wasserstein_distance, as in the original code - confirm this is the
        # intended metric for bag-of-words histograms.
        distances = [wasserstein_distance(train_histogram, test_histogram)
                     for train_histogram in train_histograms]
        # The predicted label corresponds to the minimum distance.
        predicted_labels.append(train_labels[int(np.argmin(distances))])

    return true_labels, predicted_labels

# MAIN

In [None]:
# Where the two dataset splits live on disk.
train_path = "./dataset/train"
test_path = "./dataset/test"

# Vocabulary size (number of visual words) and the number of descriptors
# sampled to learn that vocabulary.
n_clusters = 50
n_descriptors = 100000

# Step 1: one row per ORB descriptor of every training image.
train_df = compute_descriptors(train_path)

# Step 2: learn the visual vocabulary from the descriptors.
# The centroids are available afterwards as `kmeans.cluster_centers_`.
kmeans = k_means_words(train_df, n_clusters, n_descriptors)

# Step 3: collapse the frame to one row per image; the `descriptor` column
# then holds the list of all descriptors of that image.
train_df = (
    train_df
    .groupby(['image_id', 'label'], as_index=False)
    .agg({'descriptor': (lambda rows: list(rows))})
)

# Step 4: describe each image by its histogram of visual words.
histograms = compute_histogram(train_df, kmeans)
train_df['histogram'] = histograms

In [None]:
# Preview the first rows of the training frame; when the cells are run in
# order it holds one row per image, including the new `histogram` column.
train_df.head()

In [None]:
# Test split: same processing as the training split, but the vocabulary is
# NOT re-learned - the `kmeans` fitted on the training descriptors is reused.

# One row per ORB descriptor of every test image.
test_df = compute_descriptors(test_path)

# Collapse to one row per image, gathering its descriptors in a list.
test_df = (
    test_df
    .groupby(['image_id', 'label'], as_index=False)
    .agg({'descriptor': (lambda rows: list(rows))})
)

# Histograms of the test images over the training vocabulary.
histograms = compute_histogram(test_df, kmeans)
test_df['histogram'] = histograms

In [None]:
# Preview the first rows of the test frame; when the cells are run in order
# it holds one row per image, including the `histogram` column.
test_df.head()

In [None]:
# Run the nearest-neighbour classifier on the test split.
# `true` receives the first returned value (the test labels) and `predicted`
# the second (the labels predicted from the training images).
true, predicted = nn_classifier(train_df, test_df)

In [None]:
# Visual sanity check: draw the ORB keypoints detected on one training image.
img = cv2.imread("./dataset/train/Coast/image_0231.jpg")
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError(
        "Could not read ./dataset/train/Coast/image_0231.jpg")

orb = cv2.ORB_create()
kp, des = orb.detectAndCompute(img, None)

# Passing None as the output image makes OpenCV allocate a new one, so `img`
# keeps the untouched picture.
img2 = cv2.drawKeypoints(img, kp, None)

# OpenCV stores colour images as BGR while matplotlib expects RGB; convert
# before plotting so the colours are not swapped.
plt.imshow(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB))
plt.show()