In [None]:
from pathlib import Path
import os
import pandas as pd
import cv2
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Helpers that read each image together with its category (the name of its parent directory) and compute its descriptors.

def read_image(dir_path):
    """
    Lazily walk `dir_path` and yield each readable image with its category.

    The category label is the base name of the directory that directly
    contains the file (e.g. ``train/Coast/image_0231.jpg`` -> ``"Coast"``).

    Parameters
    ----------
    dir_path : str or path-like
        Root directory; it is searched recursively with `os.walk`.

    Yields
    ------
    tuple
        ``(image, label)`` where `image` is the value returned by
        `cv2.imread` for the file and `label` is the containing
        directory's base name.

    Notes
    -----
    Files named ``.DS_Store`` are ignored. Files that `cv2.imread` cannot
    decode are skipped as well, so consumers never receive ``None`` as an
    image. Directories and files are visited in sorted order so that the
    sequence of yielded images is the same on every run and platform.
    """

    for subdir, dirs, files in os.walk(dir_path):
        # Sorting `dirs` in place makes os.walk descend in a deterministic
        # order (by default it follows the filesystem's listing order).
        dirs.sort()
        for file in sorted(files):
            # Ignore useless files.
            if file == ".DS_Store":
                continue
            image = cv2.imread(os.path.join(subdir, file))
            # cv2.imread reports an unreadable / non-image file by
            # returning None rather than raising; skip those files.
            if image is None:
                continue
            yield image, os.path.basename(subdir)
                

def compute_descriptors(dir_path):
    """
    Compute ORB descriptors for every image found under `dir_path`.

    Images are obtained from `read_image(dir_path)`. Each descriptor
    produced by ORB becomes one row of the returned dataframe.

    TODO: document why ORB is used instead of SIFT/SURF.

    Parameters
    ----------
    dir_path : str or path-like
        Root directory handed to `read_image`.

    Returns
    -------
    pandas.DataFrame
        One row per descriptor, with columns:

        - ``image_id``: running integer id of the source image. Only
          images that produced at least one descriptor consume an id, so
          ids are contiguous starting at 0.
        - ``descriptor``: a single row of the array returned by
          `orb.detectAndCompute`.
        - ``label``: the label yielded by `read_image` for that image.

        The dataframe is empty (but has the three columns) when no image
        yields a descriptor.
    """

    # Lists to contain data.
    image_ids = []
    descriptors = []
    labels = []

    # Build ORB object.
    orb = cv2.ORB_create()

    image_id = 0

    for image, label in read_image(dir_path):
        # cv2.imread returns None for files it cannot decode; skip those
        # instead of passing None to the detector.
        if image is None:
            continue
        # Detect and compute keypoints.
        _, des = orb.detectAndCompute(image, None)
        # No keypoints means no descriptors (`des` is None). Test `des`
        # rather than comparing the keypoints to `[]`: depending on the
        # OpenCV version the keypoints come back as a list or a tuple,
        # and an empty tuple does not compare equal to `[]`.
        if des is None or len(des) == 0:
            continue
        # One row per descriptor, all tagged with the same image id/label.
        descriptors.extend(des)
        image_ids.extend([image_id] * len(des))
        labels.extend([label] * len(des))

        image_id += 1

    return pd.DataFrame(
        {'image_id': image_ids, 'descriptor': descriptors, 'label': labels},
        columns=['image_id', 'descriptor', 'label'],
    )

In [None]:
def k_means_words(descriptors_df, n_clusters, num_descriptors, random_state=None):
    """
    Fit a k-means model on a random sample of descriptors.

    Parameters
    ----------
    descriptors_df : pandas.DataFrame
        Dataframe with a ``'descriptor'`` column holding one descriptor
        vector per row.
    n_clusters : int
        Number of clusters passed to `KMeans`.
    num_descriptors : int
        Maximum number of descriptors to sample for fitting. If the
        dataframe has fewer rows, all of them are used.
    random_state : int or None, optional
        Seed forwarded to both the sampling step and `KMeans` so a run
        can be reproduced. The default (``None``) keeps both random.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.
    """

    # Sampling without replacement raises ValueError when asked for more
    # rows than exist, so cap the request at the number available.
    sample_size = min(num_descriptors, len(descriptors_df))
    # Sample `sample_size` descriptors.
    data = (descriptors_df['descriptor']
            .sample(n = sample_size, random_state = random_state)
            .tolist())
    # Compute the kmeans algorithm on the sampled descriptors.
    # `n_jobs` is intentionally not passed: scikit-learn deprecated it for
    # KMeans in 0.23 and removed it in 1.0, where it raises TypeError.
    kmeans = KMeans(n_clusters = n_clusters, random_state = random_state).fit(data)
    return kmeans

In [None]:
# Locations of the training and test image folders.
train_path = "./dataset/train"
test_path = "./dataset/test"

# Number of k-means clusters, and how many descriptors are sampled to fit them.
n_clusters = 50
n_descriptors = 100000

# Extract the descriptors of the training images into a dataframe.
des_df = compute_descriptors(dir_path=train_path)
# Cluster a sample of those descriptors.
# Note: after fitting, the cluster centers are in `kmeans.cluster_centers_`.
kmeans = k_means_words(
    descriptors_df=des_df,
    n_clusters=n_clusters,
    num_descriptors=n_descriptors,
)

In [None]:
# Visual sanity check: draw the ORB keypoints detected on one training image.
sample_image_path = "./dataset/train/Coast/image_0231.jpg"
img = cv2.imread(sample_image_path)
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded; fail here with a clear message instead of deeper in OpenCV.
if img is None:
    raise FileNotFoundError("Could not read image: " + sample_image_path)

orb = cv2.ORB_create()
kp = orb.detect(img, None)
kp, des = orb.compute(img, kp)

# Pass None as the output image so OpenCV allocates a new one rather than
# reusing `img` as both input and output.
img2 = cv2.drawKeypoints(img, kp, None)

# OpenCV stores colour images as BGR while matplotlib expects RGB; convert
# for display so red and blue are not swapped.
plt.imshow(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB))
plt.show()