In [1]:
import os

import pandas as pd
import numpy as np
import h5py

import cv2

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

from image_features import feature_detect_extract

import matplotlib.pyplot as plt
%matplotlib inline

# Image search

In [19]:
def get_SIFT_keypoints(filepath, width=320, eps=1e-7):
    """Detect SIFT keypoints in an image and compute their descriptors.

    Parameters
    ----------
    filepath : str
        Path to the image file.
    width, eps :
        Forwarded unchanged to ``feature_detect_extract``
        (presumably a resize width and a normalisation epsilon --
        TODO confirm against the ``image_features`` module).

    Returns
    -------
    (coords, descriptors)
        ``coords`` is a NumPy array built from the ``.pt`` attribute of
        every detected keypoint; ``descriptors`` is whatever
        ``feature_detect_extract`` returns as its second value.
    """

    # SIFT detector / descriptor pair (OpenCV 2.4 factory functions)
    sift_detector = cv2.FeatureDetector_create('SIFT')
    sift_extractor = cv2.DescriptorExtractor_create("SIFT")

    raw_keypoints, descriptors = feature_detect_extract(
        filepath, sift_detector, sift_extractor, width=width, eps=eps)

    # Keep only the coordinates of each keypoint object
    coords = np.array([keypoint.pt for keypoint in raw_keypoints])

    return coords, descriptors

In [3]:
# Extract SIFT keypoints and descriptors for a single query image

in_path = '../priv/images/snooth_dot_com_6799.png'
kp, features = get_SIFT_keypoints(in_path)

# Display the shapes of the descriptor matrix and the keypoint coordinates
features.shape, kp.shape

((358, 128), (358, 2))

# Map query image keypoints to K-Means clusters

In [20]:
def map_feature_to_clusters(features, nclusters=1500):
    """Build a visual-word histogram for a set of descriptors.

    Every row of ``features`` is assigned to its nearest (Euclidean)
    row of the cluster-centre matrix stored under the key
    ``str(nclusters)`` in ``../priv/data/kmeans.h5``.

    Parameters
    ----------
    features : array-like, one descriptor per row
    nclusters : int
        Selects the stored clustering and sets the histogram length.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``nclusters``; entry ``i`` is the
        number of descriptors whose closest centre is row ``i``.
    """

    kmeans_path = '../priv/data/kmeans.h5'

    # The context manager closes the HDF5 file even when the key lookup
    # or the distance computation raises.
    with h5py.File(kmeans_path, 'r') as km:
        km_matrix = km[str(nclusters)]

        # Pairwise euclidean distances (rows: descriptors, cols: centres)
        ec = euclidean_distances(features, km_matrix)

    # Closest cluster id and count
    closest_clust_id = np.argmin(ec, axis=1)
    cluster_id, word_count = np.unique(closest_clust_id, return_counts=True)

    # Dense vector of word counts.  The builtin ``int`` replaces the
    # ``np.int`` alias, which was removed from NumPy.
    hist = np.zeros(nclusters, dtype=int)
    hist[cluster_id] = word_count

    return hist

In [5]:
# Map the query image's descriptors to K-Means clusters (visual words)

query_hist = map_feature_to_clusters(features)

# Display the resulting word-count histogram
query_hist

array([0, 0, 0, ..., 0, 2, 0])

# Get candidate histograms

In [21]:
def get_candidate_histograms(query_hist, max_images=200, nclusters=1500):
    """Fetch the histograms of the images that best overlap a query.

    The inverted index stored under ``str(nclusters)`` in
    ``../priv/data/inverted_index.h5`` is restricted to the bins that
    are non-zero in ``query_hist``; its ``count`` column is summed per
    ``file`` and the files are ranked by that total.  The top
    ``max_images`` files are kept, plus any files tied with the last
    one kept.  If fewer files are available, all of them are returned.

    Parameters
    ----------
    query_hist : array-like
        Histogram of the query image; only its non-zero positions are used.
    max_images : int
        Number of top-ranked files to keep (ties at the cut-off are
        included, so more rows may be returned).  Values below 1 yield
        an empty result.
    nclusters : int
        Key of the stored index / histogram tables.

    Returns
    -------
    pandas.DataFrame
        Rows of the table stored under ``str(nclusters)`` in
        ``../priv/data/hist.h5`` (indexed by ``image_path``) for the
        selected files, in ranking order.
    """

    # Load the inverted index data; the context manager closes the
    # store even if the key lookup fails.
    index_file = '../priv/data/inverted_index.h5'
    with pd.HDFStore(index_file, 'r') as ix:
        inverted_index = ix[str(nclusters)]

    # Find all files that have keypoints mapped to
    # non-zero parts of the query histogram and
    # sort files by number of appearances
    index_bins = np.nonzero(query_hist)[0]

    sorted_counts = (inverted_index
                     .loc[index_bins]
                     .groupby('file')
                     .sum()
                     .sort_values('count', ascending=False)
                     .reset_index())

    if max_images < 1 or len(sorted_counts) == 0:
        # Nothing requested / nothing matched: select no files
        candidate_images = sorted_counts['file'].values[:0]
    else:
        # Position of the last file to keep.  It is clamped to the
        # table length so short candidate lists no longer raise
        # IndexError, and it is ``max_images - 1`` (not ``max_images``)
        # because positions are zero-based.
        cutoff_pos = min(max_images, len(sorted_counts)) - 1
        min_occurrences = sorted_counts['count'].iloc[cutoff_pos]

        # Keep everything at or above the cut-off so ties are included
        keep = sorted_counts['count'] >= min_occurrences
        candidate_images = sorted_counts.loc[keep, 'file'].values

    # Extract the histograms for these images
    hist_file = '../priv/data/hist.h5'
    with pd.HDFStore(hist_file, 'r') as hs:
        data_hist = hs[str(nclusters)].set_index('image_path')

    candidate_hist = data_hist.loc[candidate_images]

    return candidate_hist

In [7]:
# Retrieve the histograms of the candidate images for the query histogram

candidate_hist = get_candidate_histograms(query_hist)

# Preview the first few candidate rows
candidate_hist.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
image_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
snooth_dot_com_43807.jpeg,0,0,23,0,0,0,0,38,0,0,...,0,0,0,0,0,0,0,0,8,0
snooth_dot_com_14543.jpeg,0,0,2,0,0,0,0,7,0,0,...,0,0,0,0,0,0,5,0,5,0
snooth_dot_com_14563.jpeg,0,0,2,0,0,0,0,7,0,0,...,0,0,0,0,0,0,5,0,5,0
snooth_dot_com_11833.jpeg,0,0,3,0,0,0,0,74,0,0,...,0,0,0,0,0,0,1,0,6,0
snooth_dot_com_30551.jpeg,0,0,8,0,0,0,0,13,0,0,...,0,0,0,0,0,0,4,0,8,0


# Rank images

## Without IDF, and using chi2.

In [22]:
def calc_candidate_chi2(hist, candidate_hist):
    """Rank candidate histograms by chi-squared distance to a query.

    For every row ``c`` of ``candidate_hist`` the score is
    ``0.5 * sum((c - hist)**2 / (c + hist + 1e-10))``.

    Parameters
    ----------
    hist : array-like, one value per histogram bin
    candidate_hist : pandas.DataFrame, one candidate per row

    Returns
    -------
    pandas.Series
        Scores indexed like ``candidate_hist``, sorted ascending
        (most similar first).
    """

    squared_diff = (candidate_hist - hist) ** 2

    # The small constant avoids division by zero for bins that are
    # empty in both histograms.
    bin_totals = candidate_hist + hist + 1.0e-10

    distances = (squared_diff / bin_totals).sum(axis=1) * 0.5

    return distances.sort_values()

In [9]:
# Score every candidate against the query with the chi-squared distance

chi_sq_val = calc_candidate_chi2(query_hist, candidate_hist)

# Show the five lowest scores
chi_sq_val.head()

image_path
snooth_dot_com_6799.png        0.000000
snooth_dot_com_48533.jpeg    194.621318
snooth_dot_com_23253.jpeg    201.852366
snooth_dot_com_10093.jpeg    206.491596
snooth_dot_com_11818.jpeg    206.737771
dtype: float64

## With IDF and cosine distances.

In [23]:
def calc_candidate_idf_cos(hist, candidate_hist):
    """Rank candidate histograms by weighted cosine distance to a query.

    Each bin is weighted by ``log(N / (1 + total))`` where ``N`` is the
    number of candidate rows and ``total`` is the sum of that bin over
    all candidates.  The same weights are applied to the query and to
    the candidates before the cosine distance is taken.

    Parameters
    ----------
    hist : numpy.ndarray, one value per histogram bin
    candidate_hist : pandas.DataFrame, one candidate per row

    Returns
    -------
    pandas.Series
        Distances indexed like ``candidate_hist``, sorted ascending.
    """

    # Per-bin weights derived from the candidate set itself
    nimages = candidate_hist.shape[0]
    bin_totals = candidate_hist.sum(axis=0).values
    idf = np.log(nimages / (1.0 + bin_totals))

    # Apply the weights to both sides of the comparison
    weighted_candidates = (candidate_hist * idf).values
    weighted_query = hist.reshape(1, -1) * idf

    # One distance per candidate (the query is a single row)
    pairwise = cosine_distances(weighted_candidates, weighted_query)
    scores = pd.Series(np.squeeze(pairwise), index=candidate_hist.index)

    return scores.sort_values()

In [11]:
# Score every candidate against the query with the IDF-weighted cosine distance
idf_cos_val = calc_candidate_idf_cos(query_hist, candidate_hist)

# Show the five lowest distances
idf_cos_val.head()

image_path
snooth_dot_com_6799.png      3.330669e-16
snooth_dot_com_11008.jpeg    4.268686e-01
snooth_dot_com_1644.jpeg     4.466822e-01
snooth_dot_com_26668.jpeg    4.781797e-01
snooth_dot_com_2899.jpeg     4.782433e-01
dtype: float64

What is the overlap in the top 50 images with the two techniques?

In [12]:
# Number of images that appear in the top 50 of both rankings
len(set(chi_sq_val.iloc[:50].index).intersection(idf_cos_val.iloc[:50].index))

32

## With RANSAC

In [24]:
def get_ransac_matches(kp, features, candidate_image_list, 
                       ratio=0.7, min_matches=20, reproj_thresh=4.0):
    
    """Score candidate images against a query image with RANSAC.

    For each candidate, its stored keypoints/descriptors are read from
    ``../priv/data/features.h5``, paired with the query descriptors by
    brute-force 2-nearest-neighbour matching, filtered with a ratio
    test, and passed to ``cv2.findHomography`` with ``cv2.RANSAC``.
    The score is the mean of the status mask returned by
    ``findHomography``.

    Parameters
    ----------
    kp : numpy.ndarray
        Keypoint coordinates of the query image, one row per keypoint.
    features : numpy.ndarray
        Descriptors of the query image, row-aligned with ``kp``.
    candidate_image_list : iterable of str
        Candidate file names or paths; only the basename without
        extension is used for the lookup.
    ratio : float
        A match is kept when its best distance is below ``ratio``
        times the second-best distance.
    min_matches : int
        Minimum number of kept matches required to score a candidate.
        At least 4 are always required, because a homography cannot be
        estimated from fewer correspondences.
    reproj_thresh : float
        Reprojection threshold handed to ``cv2.findHomography``.

    Returns
    -------
    list of (score, basename) tuples, best score first.  Candidates
    that are unknown to the feature store, have too few matches, or
    for which no status mask is returned are omitted.
    """

    # Euclidean matcher for comparing data
    desc_matcher = cv2.DescriptorMatcher_create('BruteForce')

    # Never attempt a homography with fewer than 4 point pairs
    required_matches = max(min_matches, 4)

    # Store the RANSAC scores here
    score_list = list()

    # The feature data
    st = pd.HDFStore('../priv/data/features.h5', 'r')

    try:
        # These lookup tables do not change between candidates, so
        # read them from disk once instead of once per iteration.
        basenames = st['basename']
        index_table = st['index']

        for candidate_image in candidate_image_list:

            # Get the basename for the file
            candidate_basename = os.path.splitext(os.path.basename(candidate_image))[0]

            # Find the row index location of this file to get keypoints.
            # ``idxmax`` on an all-False mask would return the first
            # row, i.e. a different image, so unknown names are skipped.
            is_candidate = (basenames == candidate_basename)
            if not is_candidate.any():
                continue
            candidate_loc = is_candidate.idxmax()

            # The beginning and ending index of keypoints/features for the candidate image
            idx0, idx1 = index_table.loc[candidate_loc]

            # The keypoints and features for the candidate image
            kp_ = st.select('keypoints', start=idx0, stop=idx1).values
            features_ = st.select('features', start=idx0, stop=idx1).values

            # Run brute force KNN matching with Euclidean distance to pair the features up.
            # The candidate is the "query" side and the query image the "train" side.
            matches = desc_matcher.knnMatch(features_, features, 2)

            # Ratio test.  ``and`` short-circuits, so ``x[1]`` is only
            # touched when two neighbours were actually returned.
            # Column 0: row in the query image, column 1: row in the candidate.
            match_list = [[x[0].trainIdx, x[0].queryIdx] 
                          for x in matches 
                          if len(x) >= 2 and x[0].distance < x[1].distance * ratio]

            if len(match_list) < required_matches:
                continue

            filtered_matches = np.array(match_list)

            # Matched points in the query image
            pts  = np.array([kp[filtered_matches[:, 0]]], dtype=np.float32)

            # Matched points in the candidate image
            pts_ = np.array([kp_[filtered_matches[:, 1]]], dtype=np.float32)

            # Run RANSAC - the status mask holds 0/1 per match
            _, status = cv2.findHomography(pts, 
                                           pts_,
                                           cv2.RANSAC,
                                           reproj_thresh)

            # No mask means there is nothing to score for this candidate
            if status is None:
                continue

            score_list.append((np.mean(status), candidate_basename))
    finally:
        # Always release the HDF5 store, even when a candidate fails
        st.close()

    return sorted(score_list, reverse=True)
    

In [15]:
# Run RANSAC verification on the 100 best candidates by chi-squared score
get_ransac_matches(kp, features, chi_sq_val.iloc[:100].index)

[(1.0, 'snooth_dot_com_6799')]

I'm a little surprised that RANSAC returns only one candidate as meeting the matching criteria. The parameters (`ratio`, `min_matches`) may need tuning if this turns out to be too strict.

## Putting it all together

In [27]:
def match_wine_label(image_path, ncluster=1500):
    """Find the stored image that best matches a query image.

    Runs the whole pipeline: SIFT extraction, visual-word histogram,
    candidate retrieval, chi-squared ranking and RANSAC verification
    of the 100 best-ranked candidates.

    Parameters
    ----------
    image_path : str
        Path to the query image.
    ncluster : int
        Passed as ``nclusters`` to the histogram and candidate steps.

    Returns
    -------
    (matched_file, total_time)
        ``matched_file`` is the basename of the best RANSAC match, or
        the empty list returned by ``get_ransac_matches`` when nothing
        matched.  ``total_time`` is the elapsed wall-clock time in
        seconds, excluding SIFT extraction.
    """
    # Imported locally so the function also works when it is called
    # from an external notebook; move to module level once this lives
    # in a proper module.
    import time

    # SIFT extraction happens before the timer starts
    kp, features = get_SIFT_keypoints(image_path)

    # Time the matching
    started = time.time()

    # Visual-word histogram of the query image
    query_hist = map_feature_to_clusters(features, nclusters=ncluster)

    # Histograms of similar images, ranked by chi-squared distance
    candidate_hist = get_candidate_histograms(query_hist, nclusters=ncluster)
    ranking = calc_candidate_chi2(query_hist, candidate_hist)

    # Geometric verification on the best-ranked candidates
    ransac_scores = get_ransac_matches(kp, features, ranking.index[:100])

    total_time = time.time() - started

    # Best-scoring basename when there is one, otherwise the empty list
    matched_file = ransac_scores[0][1] if len(ransac_scores) > 0 else ransac_scores

    # TODO: load image when used for actual search?
    return matched_file, total_time


In [None]:
# End-to-end check: the query image is itself part of the indexed data set
image_path = '../priv/images/snooth_dot_com_6799.png'

match_wine_label(image_path, 1500)