In [1]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm.auto import tqdm as tn
from ipywidgets import interact
import ipywidgets as widgets
from PIL import Image

from pandarallel import pandarallel

# Hook tqdm into pandas (adds the `progress_apply` family of methods).
tn.pandas()

# Start pandarallel with one worker per CPU reported by the OS and a
# progress bar for every `parallel_apply` call.
worker_count = os.cpu_count()
pandarallel.initialize(nb_workers=worker_count, progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Root directory; every other path in this notebook is derived from it.
DATASET_PATH = Path("../dataset")

# Input CSV files, one per split.
TRAIN_DATASET_PATH = DATASET_PATH.joinpath("train_df_with_features_v1.csv")
VAL_DATASET_PATH = DATASET_PATH.joinpath("val_df_with_features_v1.csv")
TEST_DATASET_PATH = DATASET_PATH.joinpath("test_df_with_features_v1.csv")

# Image directories that the image file names are resolved against.
IMAGES_TRAIN = DATASET_PATH.joinpath("images_train")
IMAGES_TEST = DATASET_PATH.joinpath("images_test")

In [3]:
# Load the three splits in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATASET_PATH, VAL_DATASET_PATH, TEST_DATASET_PATH)
)

In [5]:
# Absolute difference between the left and right image dimensions,
# added as new columns to each split.
for frame in (train_df, val_df, test_df):
    frame["width_diff"] = (frame["left_width"] - frame["right_width"]).abs()
    frame["height_diff"] = (frame["left_height"] - frame["right_height"]).abs()

In [8]:
def _safe_ratio(numerator, denominator):
    """Divide two aligned pandas Series element-wise, yielding 0 where the denominator is 0.

    Parameters
    ----------
    numerator, denominator : pandas.Series
        Columns of the same frame (same index).

    Returns
    -------
    pandas.Series
        ``numerator / denominator`` where ``denominator != 0``, else ``0.0``.
        A NaN denominator is not equal to 0, so it yields NaN (same as a plain
        division would).
    """
    nonzero = denominator != 0
    # Mask zero denominators before dividing so no division by zero is ever
    # evaluated, then fill exactly those positions with 0.
    return (numerator / denominator.where(nonzero)).where(nonzero, 0.0)


# Left/right dimension ratios for each split. Each ratio is guarded by its OWN
# denominator: the height ratio checks right_height (it previously checked
# right_width, which let a zero right_height through to the division and
# zeroed valid height ratios whenever right_width was 0).
for frame in (train_df, val_df, test_df):
    frame["width_ratio"] = _safe_ratio(frame["left_width"], frame["right_width"])
    frame["height_ratio"] = _safe_ratio(frame["left_height"], frame["right_height"])

In [4]:
def create_sift_similarity(folder = IMAGES_TRAIN, ratio_threshold = 0.7):
    """Build a row-wise SIFT similarity scorer for images stored under ``folder``.

    The returned function is intended for ``DataFrame.apply`` /
    ``DataFrame.parallel_apply`` with ``axis=1``.

    Parameters
    ----------
    folder : pathlib.Path
        Directory that the ``image_url1`` / ``image_url2`` values of a row are
        joined onto.
    ratio_threshold : float, default 0.7
        A nearest-neighbour match counts as "good" when its distance is below
        ``ratio_threshold`` times the distance of the second-nearest match.

    Returns
    -------
    callable
        ``sift_similarity(row) -> float``.
    """
    def sift_similarity(row):
        """Return the share of good SIFT matches relative to the second image's keypoints.

        Returns ``0.0`` when the score cannot be computed: an image cannot be
        read, either image yields no descriptors, the second image has fewer
        than two descriptors, or OpenCV raises an error.

        Raises
        ------
        KeyError
            If ``row`` has no ``image_url1`` / ``image_url2`` entry. This is a
            schema problem rather than a bad image, so it is not silenced.
        """
        # Imported inside the function; presumably so the scorer stays
        # self-contained when pandarallel ships it to worker processes
        # (NOTE(review): confirm before hoisting to the imports cell).
        import cv2

        # str() keeps non-string file names (e.g. NaN) from raising in the
        # path join; such rows simply fail to load and score 0.0 below.
        path1 = str(folder / str(row["image_url1"]))
        path2 = str(folder / str(row["image_url2"]))

        # cv2.imread reports an unreadable/missing file by returning None
        # instead of raising.
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)
        if img1 is None or img2 is None:
            return 0.0

        sift = cv2.SIFT_create()
        FLANN_INDEX_KDTREE = 1
        index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
        search_params = dict(checks=50)
        flann = cv2.FlannBasedMatcher(index_params, search_params)

        try:
            kp1, des1 = sift.detectAndCompute(img1, None)
            kp2, des2 = sift.detectAndCompute(img2, None)

            # No descriptors at all, or too few in the second image for a
            # 2-nearest-neighbour search: there is nothing to compare.
            if des1 is None or des2 is None or len(des2) < 2:
                return 0.0

            matches = flann.knnMatch(des1, des2, k=2)
        except cv2.error:
            # Best-effort feature: an OpenCV failure on one pair scores 0.0
            # rather than aborting the whole apply.
            return 0.0

        # Ratio test; entries with fewer than two neighbours cannot be tested
        # and are skipped.
        good_matches_count = sum(
            1
            for pair in matches
            if len(pair) == 2 and pair[0].distance < ratio_threshold * pair[1].distance
        )

        # len(kp2) >= 2 is guaranteed by the descriptor guard above.
        return good_matches_count / len(kp2)
    return sift_similarity

In [10]:
# Score every training pair in parallel; the factory's default folder is used.
train_scorer = create_sift_similarity()
train_df["sift_similarity"] = train_df.parallel_apply(train_scorer, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6041), Label(value='0 / 6041'))), …

In [9]:
# Display the training frame; the rendered output elides the middle rows and columns.
train_df

Unnamed: 0,image_url1,image_url2,is_same,image_path1,image_path2,ahash_16,phash_16_8,phash,dhash_verical_16,dhash_16,...,whash_16_haar,whash_16_db4,left_height,right_height,left_width,right_width,width_diff,height_diff,width_ratio,height_ratio
0,892325437.jpg,944751814.jpg,0,dataset/images-b1/892325437.jpg,dataset/images-b1/944751814.jpg,119.0,108.0,108.0,136.0,137.0,...,122.0,310.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000
1,965225293.jpg,965564035.jpg,1,dataset/images-b3/965225293.jpg,dataset/images-b1/965564035.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,800.0,800.0,600.0,600.0,0.0,0.0,1.0000,1.000000
2,892403612.jpg,927225968.jpg,0,dataset/images-b3/892403612.jpg,dataset/images-b1/927225968.jpg,115.0,126.0,126.0,120.0,96.0,...,114.0,222.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000
3,917878082.jpg,921610429.jpg,1,dataset/images-b3/917878082.jpg,dataset/images-b2/921610429.jpg,10.0,24.0,24.0,19.0,25.0,...,8.0,28.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000
4,907769150.jpg,921819974.jpg,0,dataset/images-b3/907769150.jpg,dataset/images-b1/921819974.jpg,73.0,100.0,100.0,107.0,114.0,...,76.0,116.0,800.0,600.0,600.0,800.0,200.0,200.0,0.7500,1.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72482,896077337.jpg,932363571.jpg,0,dataset/images-b2/896077337.jpg,dataset/images-b3/932363571.jpg,140.0,104.0,104.0,118.0,104.0,...,120.0,250.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000
72483,919255125.jpg,922397616.jpg,0,dataset/images-b3/919255125.jpg,dataset/images-b1/922397616.jpg,96.0,108.0,108.0,117.0,103.0,...,106.0,222.0,600.0,563.0,800.0,1000.0,200.0,37.0,0.8000,1.065719
72484,924310310.jpg,925806417.jpg,1,dataset/images-b3/924310310.jpg,dataset/images-b1/925806417.jpg,4.0,0.0,0.0,3.0,0.0,...,0.0,2.0,600.0,600.0,800.0,800.0,0.0,0.0,1.0000,1.000000
72485,927655303.jpg,931435054.jpg,1,dataset/images-b1/927655303.jpg,dataset/images-b2/931435054.jpg,2.0,8.0,8.0,4.0,6.0,...,0.0,98.0,800.0,800.0,600.0,600.0,0.0,0.0,1.0000,1.000000


In [None]:
# Score every validation pair in parallel; the factory's default folder is used.
val_scorer = create_sift_similarity()
val_df["sift_similarity"] = val_df.parallel_apply(val_scorer, axis=1)

In [None]:
# Score every test pair in parallel, resolving file names against IMAGES_TEST.
test_scorer = create_sift_similarity(IMAGES_TEST)
test_df["sift_similarity"] = test_df.parallel_apply(test_scorer, axis=1)

## Save the results (adapt the output paths and format to your own setup)

In [None]:
# Write the training frame, including the new columns, without the index.
train_output_path = DATASET_PATH.joinpath("train_df_with_features_v1_sift.csv")
train_df.to_csv(train_output_path, index=False)

In [None]:
# Write the validation and test frames next to the inputs, without the index.
for frame, output_name in (
    (val_df, "val_df_with_features_v1_sift.csv"),
    (test_df, "test_df_with_features_v1_sift.csv"),
):
    frame.to_csv(DATASET_PATH / output_name, index=False)