In [None]:
!pip install pandarallel

In [20]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import imagehash
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm as tn
from sklearn.metrics import f1_score

from pandarallel import pandarallel


# Enable the tqdm integration for pandas objects.
tn.pandas()
# Start pandarallel with as many workers as the OS reports CPUs, with progress bars on.
pandarallel.initialize(nb_workers=os.cpu_count(), progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Kaggle locations: uncomment these three lines (and comment out the local ones below) when running on Kaggle.
# DATA_PATH = Path("/kaggle/input/csc-hack-lun-dataset")
# COMP_DATA_PATH = Path("/kaggle/input/csc-hackathon-2023-lunua-task")
# IMAGE_PATH = Path("/kaggle/input/csc-hack-lun-dataset/dataset")
DATA_PATH = Path("../data")  # directory holding the train/validation split CSVs
COMP_DATA_PATH = Path("../data")  # directory holding the competition test CSV
IMAGE_PATH = Path("../dataset")  # root directory that IMG_GLOB is evaluated against

# CSV inputs: the competition test set and the train/validation splits.
TEST_SET = COMP_DATA_PATH / "test-data.csv"
TRAIN_SPLIT = DATA_PATH / "train_split_20perval_grouped_stratified.csv"
VAL_SPLIT = DATA_PATH / "val_split_20perval_grouped_stratified.csv"

# Glob pattern, relative to IMAGE_PATH, selecting the image files to index.
IMG_GLOB = "images*/*.jpg"

In [6]:
def bind_fs(df, path: Path, glob: str):
    """Replace every file name in ``df`` with the matching path found on disk.

    Args:
        df: DataFrame whose cells are bare file names (e.g. ``"abc.jpg"``).
        path: Directory the glob pattern is evaluated against.
        glob: Glob pattern, relative to ``path``, selecting candidate files.

    Returns:
        A DataFrame with the same shape and labels as ``df`` in which each cell
        is the ``Path`` of the file carrying that name, or a missing value
        (``None`` from ``dict.get``) when no such file was found.
    """
    # Index files by bare name. If several globbed files share a name, the one
    # yielded last by the glob overwrites the earlier ones.
    mapping = {x.name: x for x in path.glob(glob)}
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated. Check the class (not the instance) so a column that
    # happens to be called "map" cannot be mistaken for the method.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(mapping.get)

In [42]:
# Load each split and resolve its image_url* columns to local file paths
# (image_path1 / image_path2); names with no matching file resolve to a missing value.
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

test_df = pd.read_csv(TEST_SET)
# Reduce the test URLs to bare file names by keeping only the text after the last "/".
# The vectorised .str accessor replaces DataFrame.applymap, which is deprecated since
# pandas 2.1, and it leaves missing URLs as missing instead of raising.
for url_col in ("image_url1", "image_url2"):
    test_df[url_col] = test_df[url_col].str.rsplit("/", n=1).str[-1]
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [14]:
class Predictor:
    """Score a pair of images by subtracting their transformed representations.

    Args:
        transform: Callable applied to an opened ``PIL.Image``. The results for
            the two images of a row are subtracted, so they must support ``-``.
            The image is closed as soon as ``transform`` returns, so the result
            must not depend on the image object staying open.
        col1: Row key holding the path of the first image.
        col2: Row key holding the path of the second image.
    """

    def __init__(self, transform, col1="image_path1", col2="image_path2"):
        self.transform = transform
        self._col1 = col1
        self._col2 = col2

    def process_image(self, path):
        """Open the image at ``path`` and return ``self.transform`` applied to it."""
        # The context manager releases the underlying file handle deterministically
        # instead of leaving it to garbage collection, which matters when many
        # thousands of images are processed in one run.
        with Image.open(path) as img:
            return self.transform(img)

    def predict(self, row):
        """Return the difference between the transformed images of ``row``.

        Args:
            row: Mapping-like object (e.g. a DataFrame row) indexable by the two
                configured column names.

        Returns:
            ``transform(image1) - transform(image2)``, or ``None`` when either
            path is missing or either image cannot be opened or decoded.
        """
        # pd.isna covers both None and NaN: a missing path may surface as either,
        # and NaN would otherwise crash inside Image.open with an uncaught error.
        if pd.isna(row[self._col1]) or pd.isna(row[self._col2]):
            return None
        try:
            hash1 = self.process_image(row[self._col1])
            hash2 = self.process_image(row[self._col2])
        except (UnidentifiedImageError, OSError):
            # Unreadable or corrupt file: deliberately best-effort, report "no score".
            return None
        return hash1 - hash2

In [24]:
# Number of rows in the training split.
len(train_df)

72487

In [15]:
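# Optional (disabled): work on a fixed random subsample of the training split for
# faster experimentation, keeping a reference to the full frame in train_df_full.
# train_df_full = train_df
# train_df = train_df_full.sample(10000, random_state=90)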

In [25]:
def phash_transform(img):
    """Return ``imagehash.phash`` of ``img`` with hash_size=8 and highfreq_factor=4."""
    return imagehash.phash(img, hash_size=8, highfreq_factor=4)


predictor = Predictor(phash_transform)
# Score every training pair in parallel; rows that cannot be scored yield a missing value.
train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6041), Label(value='0 / 6041'))), …

In [21]:
# Sweep candidate thresholds over the observed range of phash values and record the
# F1 score of the rule "phash < threshold => same pair" on the training split.
# Rows with a missing phash compare as False, so they are always predicted "not same".
train_phash = train_df["phash"]
train_labels = train_df["is_same"]
# np.arange excludes its stop value and the comparison is strict, so stop at max + 2
# to also evaluate `phash < max` and `phash < max + 1` (every scored pair predicted
# "same"); stopping at max would silently skip those two candidates.
res = {
    i: f1_score(train_labels, train_phash < i)
    for i in np.arange(train_phash.min(), train_phash.max() + 2)
}

In [26]:
# Keep the candidate with the highest F1; on ties the earliest entry of `res` wins.
threshold = max(res.items(), key=lambda item: item[1])[0]

res, threshold

({0.0: 0.0,
  1.0: 0.7441950464396284,
  2.0: 0.7441950464396284,
  3.0: 0.8791018998272885,
  4.0: 0.8791018998272885,
  5.0: 0.9322145442579797,
  6.0: 0.9322145442579797,
  7.0: 0.956088145407753,
  8.0: 0.956088145407753,
  9.0: 0.9707324790381269,
  10.0: 0.9707324790381269,
  11.0: 0.9732268670737436,
  12.0: 0.9732268670737436,
  13.0: 0.9660261337432745,
  14.0: 0.9660261337432745,
  15.0: 0.9354695149087672,
  16.0: 0.9354695149087672,
  17.0: 0.8712434518886132,
  18.0: 0.8712434518886132,
  19.0: 0.7823078823820114,
  20.0: 0.7823078823820114,
  21.0: 0.6893330436671735,
  22.0: 0.6893330436671735,
  23.0: 0.6041072447233314,
  24.0: 0.6041072447233314,
  25.0: 0.5476005858533644,
  26.0: 0.5476005858533644,
  27.0: 0.5148178137651821,
  28.0: 0.5148178137651821,
  29.0: 0.4989017885158456,
  30.0: 0.4989017885158456,
  31.0: 0.49055439895134556,
  32.0: 0.49055439895134556,
  33.0: 0.4880368098159509,
  34.0: 0.4880368098159509,
  35.0: 0.48710689417706016,
  36.0: 0.487106

In [31]:
# Score the validation pairs, apply the selected threshold, and report the F1 score.
val_df["phash"] = val_df.parallel_apply(predictor.predict, axis=1)
val_df["predict"] = val_df["phash"].lt(threshold).astype(int)
f1_score(y_true=val_df["is_same"], y_pred=val_df["predict"])

0.9771657660798326

In [44]:
# Score the test pairs and apply the selected threshold.
test_df["phash"] = test_df.parallel_apply(predictor.predict, axis=1)
test_df["predict"] = test_df["phash"].lt(threshold).astype(int)

# Write the submission file with columns ID and is_same.
submission = test_df.rename(columns={"predict": "is_same"})[["ID", "is_same"]]
submission.to_csv("phash_baseline.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1889), Label(value='0 / 1889'))), …