In [5]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imagehash
Successfully installed imagehash-4.3.1


In [6]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import imagehash
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm as tn
from sklearn.metrics import f1_score

from pandarallel import pandarallel


# Enable tqdm's pandas integration.
tn.pandas()
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single worker instead of passing None to pandarallel.
pandarallel.initialize(progress_bar=True, nb_workers=os.cpu_count() or 1)

INFO: Pandarallel will run on 62 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
# Kaggle input locations, kept for reference (disabled in favour of the local layout below):
#   DATA_PATH      = Path("/kaggle/input/csc-hack-lun-dataset")
#   COMP_DATA_PATH = Path("/kaggle/input/csc-hackathon-2023-lunua-task")
#   IMAGE_PATH     = Path("/kaggle/input/csc-hack-lun-dataset/dataset")

# Local layout: split CSVs and the competition CSV live in ./data, images in ./dataset.
DATA_PATH = Path("data")
COMP_DATA_PATH = Path("data")
IMAGE_PATH = Path("dataset")

# CSV files read by the notebook.
TEST_SET = COMP_DATA_PATH.joinpath("test-data.csv")
TRAIN_SPLIT = DATA_PATH.joinpath("train_split_20perval_grouped_stratified.csv")
VAL_SPLIT = DATA_PATH.joinpath("val_split_20perval_grouped_stratified.csv")

# Glob pattern, relative to IMAGE_PATH, used to discover the image files.
IMG_GLOB = "images*/*.jpg"

In [11]:
def bind_fs(df, path: Path, glob: str):
    """Replace every cell of ``df`` with the file under ``path`` that has that name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are file names (e.g. ``"abc.jpg"``).
    path : Path
        Directory that is searched.
    glob : str
        Glob pattern, relative to ``path``, selecting the candidate files.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``; each cell is the matching ``Path`` or
        a missing value when no file with that name was found.

    Notes
    -----
    Files are keyed by their bare name, so if the glob matches several files
    with the same name in different directories, only one of them is kept.
    """
    mapping = {file.name: file for file in path.glob(glob)}
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated; look the method up on the class so a column that
    # happens to be called "map" cannot shadow it on older pandas versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, mapping.get)

In [12]:
# For every split, resolve the values of the `image_url*` columns to files on disk
# (matched by file name under IMAGE_PATH) and store them as `image_path1` / `image_path2`.
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

test_df = pd.read_csv(TEST_SET)
# Keep only the part after the last "/" of each test URL so it can be matched by file name.
# The vectorised .str accessor replaces the deprecated DataFrame.applymap and passes
# missing URLs through as NaN instead of raising AttributeError.
test_df[["image_url1", "image_url2"]] = test_df[["image_url1", "image_url2"]].apply(
    lambda col: col.str.rsplit("/", n=1).str[-1]
)
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [62]:
class Predictor:
    """Compute pairwise image-hash difference features for a row holding two image paths.

    Parameters
    ----------
    transforms : dict
        Maps a feature name to a callable that takes an opened image and
        returns a value supporting subtraction (e.g. an image hash).
    col1, col2 : str
        Keys under which the row stores the two image paths.
    """

    def __init__(self, transforms, col1="image_path1", col2="image_path2"):
        self.transforms = transforms
        self._col1 = col1
        self._col2 = col2

    def transform_image(self, img):
        """Apply every configured transform to ``img`` and return ``{name: result}``."""
        return {name: fn(img) for name, fn in self.transforms.items()}

    def predict(self, row):
        """Build the feature dict for one row.

        Returns
        -------
        None
            When either image path is missing (``None`` or NaN).
        dict
            Empty when an image cannot be opened or read. Otherwise one entry
            per transform holding ``hash1 - hash2`` (NaN when the subtraction
            fails), followed by ``left_height``, ``right_height``,
            ``left_width`` and ``right_width``.
        """
        path1 = row[self._col1]
        path2 = row[self._col2]
        # pd.isna also catches NaN, which is what a missing path turns into after
        # a CSV round trip; a plain `is None` check would let it reach Image.open.
        if pd.isna(path1) or pd.isna(path2):
            return None
        try:
            # Context managers guarantee both files are closed, even when the
            # second image fails to open or a transform raises.
            with Image.open(path1) as img1, Image.open(path2) as img2:
                hashes1 = self.transform_image(img1)
                hashes2 = self.transform_image(img2)
                sizes = {
                    "left_height": img1.height,
                    "right_height": img2.height,
                    "left_width": img1.width,
                    "right_width": img2.width,
                }
        except (UnidentifiedImageError, OSError):
            return {}
        hashres = {}
        for name in hashes1:
            try:
                hashres[name] = hashes1[name] - hashes2[name]
            except Exception:
                # Best effort: a single failing comparison should not discard the row,
                # but do not swallow KeyboardInterrupt/SystemExit as a bare except would.
                hashres[name] = np.nan
        hashres.update(sizes)
        return hashres

In [64]:
# Feature name -> callable producing the hash of an opened image. Predictor subtracts
# the two images' hashes per entry, and the keys become the output column names.
transforms = {
    "ahash_16": lambda img: imagehash.average_hash(img, hash_size=16),
    "phash_16_8": lambda img: imagehash.phash(img, hash_size=16, highfreq_factor=8),
    # NOTE(review): same call and parameters as "phash_16_8", so both columns hold
    # identical values — confirm whether different parameters were intended here.
    "phash": lambda img: imagehash.phash(img, hash_size=16, highfreq_factor=8),
    # NOTE(review): key looks like a typo of "vertical"; kept as-is because it is
    # written out as a column name.
    "dhash_verical_16": lambda img: imagehash.dhash_vertical(img, hash_size=16),
    "dhash_16": lambda img: imagehash.dhash(img, hash_size=16),
    "colorhash_21": lambda img: imagehash.colorhash(img, binbits=21),
    "colorhash_33": lambda img: imagehash.colorhash(img, binbits=33),
    "colorhash_63": lambda img: imagehash.colorhash(img, binbits=63),
    "colorhash_123": lambda img: imagehash.colorhash(img, binbits=123),
    "whash_16_haar": lambda img: imagehash.whash(img, hash_size=16, mode="haar"),
    "whash_16_db4": lambda img: imagehash.whash(img, hash_size=16, mode="db4"),
}

predictor = Predictor(transforms=transforms)
# One feature dict (or None) per training row, computed across the pandarallel workers.
features = train_df.parallel_apply(predictor.predict, axis=1)

In [58]:
# `predict` returns None when an image path is missing; replace those results with an
# empty dict so every row expands to a (possibly all-NaN) feature record.
f = pd.Series(
    [rec if isinstance(rec, dict) else {} for rec in features],
    index=train_df.index,
    dtype=object,
)

# Build the feature frame on train_df's own index so the column-wise concat aligns
# row by row even if the index is not a default RangeIndex.
train_df = pd.concat([train_df, pd.DataFrame(f.to_list(), index=train_df.index)], axis=1)
train_df.to_csv("train_df_with_features_v1.csv", index=False)

In [66]:
f = val_df.parallel_apply(predictor.predict, axis=1)
# `predict` returns None when an image path is missing; pd.DataFrame cannot expand a
# list that mixes dicts and None, so substitute an empty dict (-> all-NaN feature row).
f = pd.Series([rec if isinstance(rec, dict) else {} for rec in f], index=val_df.index, dtype=object)

# Build the feature frame on val_df's own index so the column-wise concat aligns row by row.
val_df = pd.concat([val_df, pd.DataFrame(f.to_list(), index=val_df.index)], axis=1)
val_df.to_csv("val_df_with_features_v1.csv", index=False)

In [69]:
f = test_df.parallel_apply(predictor.predict, axis=1)
# `predict` returns None when an image path is missing; pd.DataFrame cannot expand a
# list that mixes dicts and None, so substitute an empty dict (-> all-NaN feature row).
f = pd.Series([rec if isinstance(rec, dict) else {} for rec in f], index=test_df.index, dtype=object)

# Build the feature frame on test_df's own index so the column-wise concat aligns row by row.
test_df = pd.concat([test_df, pd.DataFrame(f.to_list(), index=test_df.index)], axis=1)
test_df.to_csv("test_df_with_features_v1.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=366), Label(value='0 / 366'))), HB…