In [2]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm.auto import tqdm as tn
from ipywidgets import interact
import ipywidgets as widgets
from PIL import Image
import cv2

from pandarallel import pandarallel

tn.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=os.cpu_count())

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
DATASET_PATH = Path("../dataset")
TRAIN_DATASET_PATH = DATASET_PATH / "train_df_with_features_v2.csv"
VAL_DATASET_PATH = DATASET_PATH / "val_df_with_features_v2.csv"
TEST_DATASET_PATH = DATASET_PATH / "test_df_with_features_v2.csv"
IMAGES_TRAIN = DATASET_PATH / "images_train_unpadded"
IMAGES_TEST = DATASET_PATH / "images_test_unpadded"

In [4]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
val_df = pd.read_csv(VAL_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)


Columns (21,22) have mixed types. Specify dtype option on import or set low_memory=False.



In [5]:
train_df["width_diff"] = np.abs(train_df["left_width"] - train_df["right_width"])
train_df["height_diff"] = np.abs(train_df["left_height"] - train_df["right_height"])
val_df["width_diff"] = np.abs(val_df["left_width"] - val_df["right_width"])
val_df["height_diff"] = np.abs(val_df["left_height"] - val_df["right_height"])
test_df["width_diff"] = np.abs(test_df["left_width"] - test_df["right_width"])
test_df["height_diff"] = np.abs(test_df["left_height"] - test_df["right_height"])

In [6]:
train_df["width_ratio"] = train_df.apply(lambda row: row["left_width"] / row["right_width"] if row["right_width"] != 0 else 0, axis=1)
train_df["height_ratio"] = train_df.apply(lambda row: row["left_height"] / row["right_height"] if row["right_width"] != 0 else 0, axis=1)
val_df["width_ratio"] = val_df.apply(lambda row: row["left_width"] / row["right_width"] if row["right_width"] != 0 else 0, axis=1)
val_df["height_ratio"] = val_df.apply(lambda row: row["left_height"] / row["right_height"] if row["right_width"] != 0 else 0, axis=1)
test_df["width_ratio"] = test_df.apply(lambda row: row["left_width"] / row["right_width"] if row["right_width"] != 0 else 0, axis=1)
test_df["height_ratio"] = test_df.apply(lambda row: row["left_height"] / row["right_height"] if row["right_width"] != 0 else 0, axis=1)

In [6]:
# class MySIFT:
#     from functools import cached_property

#     def __init__(self, folder = IMAGES_TRAIN, flann_index_kdtree = 1):
#         self.folder = folder
#         self.flann_index_kdtree = flann_index_kdtree
    
#     @cached_property
#     def sift(self):
#         import cv2
#         return cv2.SIFT_create()
    
#     @cached_property
#     def flann(self):
#         import cv2
#         index_params = dict(algorithm=self.flann_index_kdtree, trees=5)
#         search_params = dict(checks=50)
#         flann = cv2.FlannBasedMatcher(index_params, search_params)
#         return flann

#     def sift_similarity(self, row):
#         import cv2
#         try:
#             img1 = cv2.imread(str(self.folder / row["image_url1"]), cv2.IMREAD_GRAYSCALE)
#             img2 = cv2.imread(str(self.folder / row["image_url2"]), cv2.IMREAD_GRAYSCALE)
#             kp1, des1 = self.sift.detectAndCompute(img1,None)
#             kp2, des2 = self.sift.detectAndCompute(img2,None)

#             matches = self.flann.knnMatch(des1, des2, k=2)

#             good_matches_count = 0
#             for m, n in matches:
#                 if m.distance < 0.7*n.distance:
#                     good_matches_count += 1

#             similarity = good_matches_count/len(kp2)
#             return similarity
#         except Exception:
#             return 0

In [8]:
def sift_similarity(row, folder = IMAGES_TRAIN, sigma=2.0, contrast_threshold=0.01, edge_threshold=20):
    import cv2
    sift = cv2.SIFT_create(sigma=sigma, contrastThreshold=contrast_threshold, edgeThreshold=edge_threshold)
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)

    try:
        img1 = cv2.imread(str(folder / row["image_url1"]), cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(str(folder / row["image_url2"]), cv2.IMREAD_GRAYSCALE)
        kp1, des1 = sift.detectAndCompute(img1,None)
        kp2, des2 = sift.detectAndCompute(img2,None)

        matches = flann.knnMatch(des1, des2, k=2)

        good_matches_count = 0
        for m, n in matches:
            if m.distance < 0.7*n.distance:
                good_matches_count += 1

        similarity = good_matches_count/len(kp2)
        return similarity
    except Exception:
        return 0

In [9]:
train_df.loc[train_df.index, "sift_similarity"] = train_df.parallel_apply(sift_similarity, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1133), Label(value='0 / 1133'))), …

In [10]:
train_df

Unnamed: 0,image_url1,image_url2,is_same,image_path1,image_path2,ahash_16,phash_16_8,phash,dhash_verical_16,dhash_16,...,whash_16_db4,left_height,right_height,left_width,right_width,width_diff,height_diff,width_ratio,height_ratio,sift_similarity
0,892325437.jpg,944751814.jpg,0,dataset/images-b1/892325437.jpg,dataset/images-b1/944751814.jpg,119.0,108.0,108.0,136.0,137.0,...,310.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000,0.044818
1,965225293.jpg,965564035.jpg,1,dataset/images-b3/965225293.jpg,dataset/images-b1/965564035.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,800.0,800.0,600.0,600.0,0.0,0.0,1.0000,1.000000,1.000000
2,892403612.jpg,927225968.jpg,0,dataset/images-b3/892403612.jpg,dataset/images-b1/927225968.jpg,115.0,126.0,126.0,120.0,96.0,...,222.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000,0.036188
3,917878082.jpg,921610429.jpg,1,dataset/images-b3/917878082.jpg,dataset/images-b2/921610429.jpg,10.0,24.0,24.0,19.0,25.0,...,28.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000,0.537602
4,907769150.jpg,921819974.jpg,0,dataset/images-b3/907769150.jpg,dataset/images-b1/921819974.jpg,73.0,100.0,100.0,107.0,114.0,...,116.0,800.0,600.0,600.0,800.0,200.0,200.0,0.7500,1.333333,0.020619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72482,896077337.jpg,932363571.jpg,0,dataset/images-b2/896077337.jpg,dataset/images-b3/932363571.jpg,140.0,104.0,104.0,118.0,104.0,...,250.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000,
72483,919255125.jpg,922397616.jpg,0,dataset/images-b3/919255125.jpg,dataset/images-b1/922397616.jpg,96.0,108.0,108.0,117.0,103.0,...,222.0,600.0,563.0,800.0,1000.0,200.0,37.0,0.8000,1.065719,
72484,924310310.jpg,925806417.jpg,1,dataset/images-b3/924310310.jpg,dataset/images-b1/925806417.jpg,4.0,0.0,0.0,3.0,0.0,...,2.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000,
72485,927655303.jpg,931435054.jpg,1,dataset/images-b1/927655303.jpg,dataset/images-b2/931435054.jpg,2.0,8.0,8.0,4.0,6.0,...,98.0,800.0,800.0,600.0,600.0,0.0,0.0,1.0000,1.000000,


In [11]:
val_df.loc[val_df.index, "sift_similarity"] = val_df.parallel_apply(sift_similarity, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=284), Label(value='0 / 284'))), HB…

In [12]:
test_df.loc[test_df.index, "sift_similarity"] = test_df.parallel_apply(sift_similarity, folder=IMAGES_TEST, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=355), Label(value='0 / 355'))), HB…

## Have to save result in your own fashion

In [10]:
train_df.to_csv(DATASET_PATH / "train_df_with_features_v2_sift.csv", index=False)

In [13]:
val_df.to_csv(DATASET_PATH / "val_df_with_features_v2_sift.csv", index=False)
test_df.to_csv(DATASET_PATH / "test_df_with_features_v2_sift.csv", index=False)

## Grid search

In [8]:
SPLIT_COLUMN = 'phash_16_8'

In [9]:
df_susp = train_df[(train_df[SPLIT_COLUMN] <= 90) & (train_df[SPLIT_COLUMN] >= 55)]

In [10]:
len(df_susp)

1502

In [12]:
df_susp.loc[df_susp.index[:100], "sift_similarity"] = df_susp.iloc[:100].parallel_apply(sift_similarity, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9), Label(value='0 / 9'))), HBox(c…

In [13]:
SIGMAS = [0.5, 1.0, 1.4, 2.]
EDGES_THRESHS = [1, 3, 10, 20]
CONSTRAST_THRESHS = [0.01, 0.02, 0.04, 0.1]

In [14]:
from sklearn.metrics import f1_score

In [21]:
results = {}
for sigma in SIGMAS:
    for edge_thresh in EDGES_THRESHS:
        for contrast_thresh in CONSTRAST_THRESHS:
            df_susp.loc[df_susp.index, "sift_similarity"] = df_susp.parallel_apply(sift_similarity, axis=1, sigma=sigma, 
                                                                                  edge_threshold=edge_thresh, contrast_threshold=contrast_thresh)

            f1s = {}
            max_sim = df_susp["sift_similarity"].max()
            min_sim = df_susp["sift_similarity"].min()
            if max_sim == min_sim:
                print('SKIPPING')
                continue
            for i in np.arange(min_sim, max_sim, step=min(0.001, (max_sim - min_sim)/100)):
                f1s[i] = (f1_score(df_susp["is_same"], df_susp["sift_similarity"] > i) + f1_score(1 - df_susp["is_same"], df_susp["sift_similarity"] <= i)) / 2
            
            threshold = max(f1s, key=f1s.get)

            results[str({'sigma': sigma, 'edge': edge_thresh, 'contrast': contrast_thresh})] = f1s[threshold]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

SKIPPING


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

In [22]:
best_params = max(results, key=results.get)

In [23]:
best_params, results[best_params]

("{'sigma': 2.0, 'edge': 20, 'contrast': 0.01}", 0.9726419314755507)

In [None]:
plt.hist([df_susp[df_susp[TARGET_COLUMN] == 1]["sift_similarity"], df_susp[df_susp[TARGET_COLUMN] == 0]["sift_similarity"]], 
                                  bins=20, stacked=True, label=["Equal", "Different"])
plt.show()