In [46]:
from glob import glob
import re
import pandas as pd
import numpy as np

import dill
import h5py

import cv2

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

%matplotlib inline
import matplotlib.pyplot as plt

# Image search

Detect keypoints in the query image and compute RootSIFT descriptors for them.

In [2]:
def resize_image(image, height=None, width=None):
    """Resize ``image`` to a target height or width, keeping its aspect ratio.

    Parameters
    ----------
    image : numpy.ndarray
        Image array whose first two axes are (rows, columns).
    height : int, optional
        Target height in pixels. Takes precedence over ``width`` when both
        are supplied.
    width : int, optional
        Target width in pixels; only used when ``height`` is None.

    Returns
    -------
    numpy.ndarray
        The resized image, or ``image`` itself (not a copy) when neither
        ``height`` nor ``width`` is given.
    """
    # Without a target size there is no scale to compute, so hand the image
    # back untouched instead of failing on an undefined ``dim``.
    if height is None and width is None:
        return image

    (orig_height, orig_width) = image.shape[:2]
    orig_height = float(orig_height)
    orig_width = float(orig_width)

    # cv2.resize takes the target size as integer (width, height)
    if height is not None:
        height = int(height)
        ratio = height / orig_height
        dim = (int(orig_width * ratio), height)
    else:
        width = int(width)
        ratio = width / orig_width
        dim = (width, int(orig_height * ratio))

    return cv2.resize(image, dim, interpolation=cv2.INTER_AREA)

In [3]:
class RootSIFT(object):
    """SIFT descriptor extractor whose output is converted to RootSIFT.

    Adapted from
    http://www.pyimagesearch.com/2015/04/13/implementing-rootsift-in-python-and-opencv/
    """

    def __init__(self):
        """Create the underlying OpenCV SIFT descriptor extractor."""
        self.extractor = cv2.DescriptorExtractor_create("SIFT")

    def compute(self, image, kps, eps=1e-7):
        """Describe ``kps`` in ``image`` and return ``(keypoints, descriptors)``.

        Each descriptor row is divided by its sum plus ``eps`` (L1
        normalisation) and then square-rooted element-wise (Hellinger
        kernel). No final L2 normalisation is applied.

        Returns ``([], None)`` when the extractor yields no keypoints.
        """
        found_kps, raw_descs = self.extractor.compute(image, kps)

        if len(found_kps) > 0:
            # eps keeps an all-zero descriptor row from dividing by zero
            row_sums = raw_descs.sum(axis=1, keepdims=True)
            raw_descs /= (row_sums + eps)
            return (found_kps, np.sqrt(raw_descs))

        # nothing was described
        return ([], None)

In [4]:
in_path = '../images/snooth_dot_com_6799.png'
image = cv2.imread(in_path)
# cv2.imread reports an unreadable file by returning None instead of raising
if image is None:
    raise IOError('Could not read image: {}'.format(in_path))
image = resize_image(image, width=320)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Keypoints have to be detected before descriptors can be computed for them
feat_detector = cv2.FeatureDetector_create('SURF')
kps = feat_detector.detect(image)

sift = RootSIFT()
(kps, features) = sift.compute(image, kps)
# (x, y) coordinates of every described keypoint
kp = np.array([x.pt for x in kps])

In [5]:
# Sanity check: dimensions of the descriptor array returned by sift.compute
features.shape

(358, 128)

# Map keypoints to K-Means clusters

# TODO ADJUST CALCULATION

In [8]:
nclusters = 1500

# Load the k-means clusters. The context manager closes the file even if the
# read fails, and [:] copies the dataset into memory so km_matrix stays usable
# after the file is closed.
kmeans_path = '../data/kmeans.h5'
with h5py.File(kmeans_path, 'r') as km:
    km_matrix = km[str(nclusters)][:]

# Pairwise euclidean distances between every descriptor and every cluster
print('{} {}'.format(features.shape, km_matrix.shape))
ec = euclidean_distances(features, km_matrix)

# Closest cluster id and count
closest_clust_id = np.argmin(ec, axis=1)
cluster_id, word_count = np.unique(closest_clust_id, return_counts=True)

# Dense vector of word counts (builtin int: the np.int alias no longer exists
# in current NumPy releases)
hist = np.zeros(nclusters, dtype=int)
hist[cluster_id] = word_count

(358, 128) (1500, 128)


# Get candidate images

In [11]:
# Row position in the count-sorted ranking whose count is used as the
# candidate cut-off
max_images = 200

In [12]:
index_file = '../data/inverted_index.h5'
# The context manager closes the store even if the lookup raises
with pd.HDFStore(index_file, 'r') as ix:
    inverted_index = ix[str(nclusters)]

In [13]:
# Histogram bins (visual words) that occur at least once in the query image
index_bins = np.flatnonzero(hist)

# Keep only the inverted-index rows for those bins, total them per file and
# order the files by descending summed 'count'
matching_rows = inverted_index.loc[index_bins]
per_file_totals = matching_rows.groupby('file').sum()
sorted_counts = (per_file_totals
                 .sort_values('count', ascending=False)
                 .reset_index())

In [14]:
# Literal substring match: with the default regex=True the trailing '.' would
# match any character, e.g. also 'snooth_dot_com_67990.png'
mask = sorted_counts.file.str.contains('snooth_dot_com_6799.', regex=False)
sorted_counts.loc[mask]

Unnamed: 0,file,count
138,snooth_dot_com_6799.png,358


In [68]:
# The per-image word histograms are needed to build candidate_hist, so load
# them in this cell rather than relying on a copy left in the kernel by a
# cell that runs later.
hist_file = '../data/hist.h5'
with pd.HDFStore(hist_file, 'r') as hs:
    data_hist = hs[str(nclusters)].set_index('image_path')

# The count found at rank max_images is the cut-off. The rank is clamped so a
# result list shorter than max_images + 1 does not raise IndexError. Every
# image tied at the cut-off is kept, so more than max_images can be selected.
cutoff_rank = min(max_images, len(sorted_counts) - 1)
max_occurrences = sorted_counts.iloc[cutoff_rank]['count']
candidate_images = sorted_counts.loc[sorted_counts['count'] >= max_occurrences, 'file'].values
candidate_hist = data_hist.loc[candidate_images]

In [69]:
# Can exceed max_images: every image tied at the cut-off count is kept
len(candidate_images)

204

# Rank images

In [70]:
hist_file = '../data/hist.h5'
# Look the table up by nclusters (instead of a hard-coded '1500') so it always
# matches the vocabulary used for the query histogram and the inverted index.
# The context manager closes the store even if the lookup raises.
with pd.HDFStore(hist_file, 'r') as hs:
    data_hist = hs[str(nclusters)].set_index('image_path')

## Without TF-IDF, using the chi-squared distance.

In [71]:
# Chi-squared distance between each candidate histogram and the query
# histogram: 0.5 * sum((a - b)^2 / (a + b)). The small constant keeps bins
# that are empty in both histograms from dividing by zero.
sq_diff = (candidate_hist - hist).pow(2)
bin_total = candidate_hist + hist + 1.0e-10
chi_sq_val = (sq_diff / bin_total).sum(axis=1)*0.5

In [72]:
# The twenty candidates with the smallest chi-squared distance, closest first
chi_sq_val.sort_values().head(20)

image_path
snooth_dot_com_6799.png        0.000000
snooth_dot_com_48533.jpeg    194.621318
snooth_dot_com_23253.jpeg    201.852366
snooth_dot_com_10093.jpeg    206.491596
snooth_dot_com_11818.jpeg    206.737771
snooth_dot_com_11008.jpeg    207.005353
snooth_dot_com_9974.png      221.666879
snooth_dot_com_6773.png      225.352200
snooth_dot_com_9951.jpeg     228.119250
snooth_dot_com_42938.jpeg    238.891034
snooth_dot_com_39104.jpeg    242.254825
snooth_dot_com_25983.png     245.581937
snooth_dot_com_48476.jpeg    253.876899
snooth_dot_com_24889.png     254.129743
snooth_dot_com_38515.jpeg    256.899488
snooth_dot_com_38535.jpeg    256.899488
snooth_dot_com_33700.png     259.406166
snooth_dot_com_33680.png     259.406166
snooth_dot_com_24304.png     260.604081
snooth_dot_com_24802.jpeg    265.088038
dtype: float64

## With TF-IDF and cosine distances.

In [73]:
# Inverse document frequency per visual word, computed over the candidate set
# only: log(number of candidates / (1 + total count of the word))
nimages = len(candidate_hist.index)
word_totals = candidate_hist.sum(axis=0).values
idf = np.log(nimages / (1.0 + word_totals))
idf.shape

(1500,)

In [74]:
# cosine_distances needs 2-D inputs, so the IDF-weighted query histogram is
# passed as a single-row matrix. The (n_candidates, 1) result is flattened
# back to one distance per candidate; ravel (unlike squeeze) still yields a
# 1-D array when there is only one candidate.
weighted_candidates = (candidate_hist * idf).values
weighted_query = (hist * idf).reshape(1, -1)
idf_cos_val = pd.Series(cosine_distances(weighted_candidates, weighted_query).ravel(),
                        index=candidate_hist.index)
idf_cos_val.sort_values()[:20]



image_path
snooth_dot_com_6799.png      7.771561e-16
snooth_dot_com_11008.jpeg    4.268686e-01
snooth_dot_com_1644.jpeg     4.466822e-01
snooth_dot_com_26668.jpeg    4.781797e-01
snooth_dot_com_2899.jpeg     4.782433e-01
snooth_dot_com_7013.png      4.890880e-01
snooth_dot_com_7100.png      4.918696e-01
snooth_dot_com_44120.jpeg    4.933682e-01
snooth_dot_com_6773.png      5.039745e-01
snooth_dot_com_10093.jpeg    5.070767e-01
snooth_dot_com_48533.jpeg    5.101588e-01
snooth_dot_com_11818.jpeg    5.107447e-01
snooth_dot_com_24159.jpeg    5.135485e-01
snooth_dot_com_23924.png     5.142873e-01
snooth_dot_com_23253.jpeg    5.160893e-01
snooth_dot_com_4182.jpeg     5.210643e-01
snooth_dot_com_37125.jpeg    5.300589e-01
snooth_dot_com_24889.png     5.308205e-01
snooth_dot_com_6638.png      5.321208e-01
snooth_dot_com_33662.jpeg    5.326064e-01
dtype: float64

## With RANSAC

In [76]:
# Brute-force descriptor matcher created through OpenCV's matcher factory
matcher = cv2.DescriptorMatcher_create("BruteForce")

In [79]:
matcher.knnMatch?