In [2]:
import itertools
import os
from pathlib import Path

import numpy as np
import pandas as pd
import imagehash
import skimage
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm as tn

from pandarallel import pandarallel


# Register tqdm's pandas integration (progress-bar variants of apply).
tn.pandas()
# Start pandarallel with progress bars enabled and one worker per CPU
# reported by os.cpu_count().
pandarallel.initialize(progress_bar=True, nb_workers=os.cpu_count())

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [15]:
# Switch the kernel's working directory; the relative paths configured in the
# following cell ("data", "dataset") are resolved against it.
# NOTE(review): this is a machine-specific location - adjust when running elsewhere.
%cd ~/work/dev/Others/yerko/other/csc

/home/jovyan/work/dev/Others/yerko/other/csc


In [35]:
# Kaggle locations, kept for reference; the relative local paths below are
# the ones actually in effect.
# DATA_PATH = Path("/kaggle/input/csc-hack-lun-dataset")
# COMP_DATA_PATH = Path("/kaggle/input/csc-hackathon-2023-lunua-task")
# IMAGE_PATH = Path("/kaggle/input/csc-hack-lun-dataset/dataset")
DATA_PATH = Path("data")
COMP_DATA_PATH = Path("data")
IMAGE_PATH = Path("dataset")

# CSV inputs: the test set plus the train/validation split files.
TEST_SET = COMP_DATA_PATH / "test-data.csv"
TRAIN_SPLIT = DATA_PATH / "train_split_20perval_grouped_stratified.csv"
VAL_SPLIT = DATA_PATH / "val_split_20perval_grouped_stratified.csv"

# Glob pattern (applied under IMAGE_PATH by bind_fs) that selects the image files.
IMG_GLOB = "images_*_unpadded/*.jpg"

In [36]:
def bind_fs(df, path: Path, glob: str):
    """Replace every file name in ``df`` with the matching path on disk.

    Args:
        df: DataFrame whose cells are bare file names (e.g. ``"abc.jpg"``).
        path: Directory under which ``glob`` is evaluated.
        glob: Glob pattern, relative to ``path``, selecting candidate files.

    Returns:
        A DataFrame of the same shape as ``df`` in which each cell holds the
        ``Path`` of the file with that name, or ``None`` when no file matched.
    """
    # Sort the matches so that, if the same file name occurs in several
    # directories, the entry that wins is deterministic rather than dependent
    # on filesystem enumeration order.
    mapping = {found.name: found for found in sorted(path.glob(glob))}
    # DataFrame.applymap is deprecated in recent pandas in favour of
    # DataFrame.map; use whichever this pandas version provides. The check is
    # made on the class so a column that happens to be called "map" cannot
    # shadow the method.
    elementwise = df.map if hasattr(type(df), "map") else df.applymap
    return elementwise(lambda name: mapping.get(name))

In [37]:
# Validation split: read the CSV, then resolve the file names held in the
# image_url* columns to paths on disk (None where no file was found).
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

# Training split: same treatment.
train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

# Test set: first reduce each image_url* value to the part after its last "/"
# (the file name), then resolve it like the other splits.
test_df = pd.read_csv(TEST_SET)
test_df[["image_url1", "image_url2"]] = test_df[["image_url1", "image_url2"]].applymap(lambda x: x.rsplit("/", 1)[-1])
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [6]:
class Predictor:
    """Compute pairwise image features for one row of an image-pair table.

    Args:
        transforms: Mapping of feature name -> callable. Each callable takes
            an opened PIL image and returns an object supporting ``-`` against
            another result of the same callable (e.g. an image hash).
        col1: Name of the row field holding the first image's path.
        col2: Name of the row field holding the second image's path.
    """

    def __init__(self, transforms, col1="image_path1", col2="image_path2"):
        self.transforms = transforms
        self._col1 = col1
        self._col2 = col2

    def transform_image(self, img):
        """Apply every configured transform to ``img``; returns ``{name: result}``."""
        return {name: transform(img) for name, transform in self.transforms.items()}

    def predict(self, row):
        """Build the feature dict for one row.

        Args:
            row: Mapping-like object (e.g. a DataFrame row) indexed by the
                configured path columns.

        Returns:
            ``None`` when either path is missing (``None`` or NaN); ``{}``
            when an image cannot be opened or decoded; otherwise a dict with
            one entry per transform (left result minus right result, or NaN
            if the subtraction fails) plus grayscale flags and the
            height/width of both images.
        """
        path1, path2 = row[self._col1], row[self._col2]
        # Treat NaN like None so that frames re-read from CSV (where missing
        # paths come back as NaN) are handled the same way.
        if pd.isna(path1) or pd.isna(path2):
            return None
        try:
            # Context managers guarantee both files are closed, including the
            # case where the second open fails after the first succeeded.
            with Image.open(path1) as img1, Image.open(path2) as img2:
                hashes1 = self.transform_image(img1)
                hashes2 = self.transform_image(img2)
                left_mode, right_mode = img1.mode, img2.mode
                left_height, right_height = img1.height, img2.height
                left_width, right_width = img1.width, img2.width
        except (UnidentifiedImageError, OSError):
            return {}
        hashres = {}
        for k in hashes1:
            try:
                hashres[k] = hashes1[k] - hashes2[k]
            except Exception:
                # Best effort: a failed comparison becomes a missing value.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so a long run can still be cancelled.
                hashres[k] = np.nan
        hashres["left_grayscale"] = left_mode == "L"
        hashres["right_grayscale"] = right_mode == "L"
        hashres["left_height"] = left_height
        hashres["right_height"] = right_height
        hashres["left_width"] = left_width
        hashres["right_width"] = right_width
        return hashres

## Version 1

In [7]:
# Version 1 feature set: one hashing callable per output column. Each callable
# receives an opened PIL image and returns an imagehash result.
transforms = {
    "ahash_16": lambda image: imagehash.average_hash(image, hash_size=16),
    "phash_16_8": lambda image: imagehash.phash(image, hash_size=16, highfreq_factor=8),
    # NOTE(review): "phash" is configured exactly like "phash_16_8", so the two
    # columns carry the same values - confirm whether different parameters
    # were intended here.
    "phash": lambda image: imagehash.phash(image, hash_size=16, highfreq_factor=8),
    "dhash_verical_16": lambda image: imagehash.dhash_vertical(image, hash_size=16),
    "dhash_16": lambda image: imagehash.dhash(image, hash_size=16),
    "colorhash_21": lambda image: imagehash.colorhash(image, binbits=21),
    "colorhash_33": lambda image: imagehash.colorhash(image, binbits=33),
    "colorhash_63": lambda image: imagehash.colorhash(image, binbits=63),
    "colorhash_123": lambda image: imagehash.colorhash(image, binbits=123),
    "whash_16_haar": lambda image: imagehash.whash(image, hash_size=16, mode="haar"),
    "whash_16_db4": lambda image: imagehash.whash(image, hash_size=16, mode="db4"),
}

# Run the predictor over every training row in parallel (one result per row).
predictor = Predictor(transforms=transforms)
features = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1133), Label(value='0 / 1133'))), …

In [8]:
# Re-wrap the parallel results as a Series indexed like train_df.
f = pd.Series(features, index=train_df.index)
# Rows for which predict() returned None (a missing image path) are replaced
# with empty dicts so that they expand to all-NaN feature rows below.
f[f.isna()] = [{} for x in f[f.isna()]]

# Expand the per-row dicts into columns and append them to the split.
# NOTE: this rebinds train_df, so re-running the cell appends the feature
# columns a second time.
train_df = pd.concat([train_df, pd.DataFrame(f.to_list())], axis=1)
train_df.to_csv("train_df_with_features_v1.1.csv", index=False)

In [9]:
# Compute the pairwise features for every validation row in parallel.
f = val_df.parallel_apply(predictor.predict, axis=1)

# predict() returns None for rows with a missing image path. Replace those
# with empty dicts (as the train cell does) so DataFrame construction does not
# fail on a None record; such rows simply become all-NaN feature rows.
records = [result if isinstance(result, dict) else {} for result in f]
# Build the feature frame on f's index so the column-wise concat aligns by row.
val_df = pd.concat([val_df, pd.DataFrame(records, index=f.index)], axis=1)
val_df.to_csv("val_df_with_features_v1.1.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=284), Label(value='0 / 284'))), HB…

In [10]:
# Compute the pairwise features for every test row in parallel.
f = test_df.parallel_apply(predictor.predict, axis=1)

# predict() returns None for rows with a missing image path. Replace those
# with empty dicts (as the train cell does) so DataFrame construction does not
# fail on a None record; such rows simply become all-NaN feature rows.
records = [result if isinstance(result, dict) else {} for result in f]
# Build the feature frame on f's index so the column-wise concat aligns by row.
test_df = pd.concat([test_df, pd.DataFrame(records, index=f.index)], axis=1)
test_df.to_csv("test_df_with_features_v1.1.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=355), Label(value='0 / 355'))), HB…

## Version 2

In [17]:
def butterworth_pil_phash(img, cutoff_frequency_ratio, high_pass, order, hash_size=16, highfreq_factor=8):
    """Perceptual hash of a Butterworth-filtered copy of ``img``.

    Args:
        img: PIL image; it is converted to RGB before filtering.
        cutoff_frequency_ratio: Forwarded to ``skimage.filters.butterworth``.
        high_pass: Forwarded to ``skimage.filters.butterworth``.
        order: Forwarded to ``skimage.filters.butterworth``.
        hash_size: Forwarded to ``imagehash.phash``.
        highfreq_factor: Forwarded to ``imagehash.phash``.

    Returns:
        The ``imagehash.phash`` result for the filtered image.
    """
    rgb = np.array(img.convert('RGB'))
    # Padding (npad) is left at the library default.
    filtered = skimage.filters.butterworth(
        rgb,
        cutoff_frequency_ratio,
        high_pass,
        order,
        channel_axis=-1,
    )
    # The filter returns floating-point data (and, for a high-pass filter,
    # values on both sides of zero). Handing that buffer to Image.fromarray
    # with mode='RGB' would reinterpret the raw float bytes as 8-bit pixels,
    # so first rescale to the full 0..255 range and cast to uint8.
    lo = float(filtered.min())
    hi = float(filtered.max())
    if hi > lo:
        scaled = (filtered - lo) * (255.0 / (hi - lo))
    else:
        # Constant output (e.g. a flat image): nothing to rescale.
        scaled = np.zeros_like(filtered, dtype=float)
    pixels = np.rint(scaled).astype(np.uint8)
    # An (H, W, 3) uint8 array is read as an RGB image without an explicit mode.
    pil_res = Image.fromarray(pixels)
    hash_res = imagehash.phash(pil_res, hash_size=hash_size, highfreq_factor=highfreq_factor)
    return hash_res
    

In [18]:
# Parameter grid for the Butterworth-filtered phash features; every
# (cutoff, high_pass, order) combination becomes one feature.
cutoffs = [0.02, 0.08, 0.16]
high_passes = [True, False]
orders = [1, 3, 5, 8]

In [19]:
# One Butterworth+phash transform per (cutoff, high_pass, order) combination.
# The loop variables are bound as lambda default arguments on purpose: a plain
# closure would look them up when the lambda is *called*, by which time the
# comprehension has finished, so every transform would silently use the last
# combination and all columns would be identical.
bw_transforms = {
    f"phash_16_8_bw_c{cutoff}_hp{high_pass}_order{order}":
        lambda x, cutoff=cutoff, high_pass=high_pass, order=order: butterworth_pil_phash(
            x,
            cutoff,
            high_pass,
            order
        )
    for (cutoff, high_pass, order) in itertools.product(cutoffs, high_passes, orders)
}
bw_transforms

{'phash_16_8_bw_c0.02_hpTrue_order1': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpTrue_order3': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpTrue_order5': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpTrue_order8': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpFalse_order1': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpFalse_order3': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpFalse_order5': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.02_hpFalse_order8': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.08_hpTrue_order1': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.08_hpTrue_order3': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.08_hpTrue_order5': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8_bw_c0.08_hpTrue_order8': <function __main__.<dictcomp>.<lambda>(x)>,
 'phash_16_8

In [20]:
# Version 2 feature set: the hash transforms listed here followed by every
# Butterworth-filtered phash variant from bw_transforms (same key order as
# merging the two dicts).
transforms = {
    "ahash_16": lambda image: imagehash.average_hash(image, hash_size=16),
    "phash_16_8": lambda image: imagehash.phash(image, hash_size=16, highfreq_factor=8),
    # NOTE(review): "phash" is configured exactly like "phash_16_8", so the two
    # columns carry the same values - confirm whether different parameters
    # were intended here.
    "phash": lambda image: imagehash.phash(image, hash_size=16, highfreq_factor=8),
    "dhash_verical_16": lambda image: imagehash.dhash_vertical(image, hash_size=16),
    "dhash_16": lambda image: imagehash.dhash(image, hash_size=16),
    "colorhash_21": lambda image: imagehash.colorhash(image, binbits=21),
    "colorhash_33": lambda image: imagehash.colorhash(image, binbits=33),
    "colorhash_63": lambda image: imagehash.colorhash(image, binbits=63),
    "colorhash_123": lambda image: imagehash.colorhash(image, binbits=123),
    "whash_16_haar": lambda image: imagehash.whash(image, hash_size=16, mode="haar"),
    "whash_16_db4": lambda image: imagehash.whash(image, hash_size=16, mode="db4"),
    **bw_transforms,
}

# Run the predictor over every training row in parallel (one result per row).
predictor = Predictor(transforms=transforms)
features = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1133), Label(value='0 / 1133'))), …

Process ForkPoolWorker-130:
Process ForkPoolWorker-78:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
Process ForkPoolWorker-98:
  File "/opt/conda/lib/python3.10/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/o

KeyboardInterrupt: 

  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/opt/conda/lib/python3.10/site-packages/pandas/core/frame.py", line 9558, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-105:
Traceback (most recent call last):
Process ForkPoolWorker-129:
Process ForkPoolWorker-89:
Traceback (most recent call last):
Process ForkPoolWorker-95:
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
Traceback (most recent call last):
Traceback (most recent call last):
  File "/tmp/ipykernel_374652/999389445.py", line 108, in transform_im

  File "/opt/conda/lib/python3.10/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(
  File "/opt/conda/lib/python3.10/site-packages/pandarallel/data_types/dataframe.py", line 32, in work
    return data.apply(
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pandas/core/frame.py", line 9558, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/opt/conda/lib/python3.10/site-packages/pandas/core/apply.py", line 741, in apply
    return self.apply_standard()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/tmp/ipykernel_374652/3815286982.py", line -1, in <lambda>
  File "/opt/conda/lib/python3.10/site-packages/pandas/core/apply.py", line 741, in apply
    return self.apply_standard()
Process ForkPoolWorker-90:
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/multiprocessing/pro

In [None]:
def _features_to_frame(results, index):
    """Expand per-row predict() results into a feature DataFrame.

    ``results`` is an iterable with one entry per row: a dict of features, or
    None/NaN for rows predict() skipped. Skipped rows become empty dicts, i.e.
    all-NaN feature rows, so DataFrame construction never sees a None record.
    The frame is built on ``index`` so a column-wise concat aligns by row.
    """
    records = [result if isinstance(result, dict) else {} for result in results]
    return pd.DataFrame(records, index=index)


# Train: `features` comes from the parallel run in the previous cell.
f = pd.Series(features, index=train_df.index)
train_df = pd.concat([train_df, _features_to_frame(f, train_df.index)], axis=1)
train_df.to_csv("train_df_with_features_v2.csv", index=False)

# Validation: compute the features, then normalise skipped rows the same way
# as for train (previously None results were passed straight to DataFrame).
f = val_df.parallel_apply(predictor.predict, axis=1)
val_df = pd.concat([val_df, _features_to_frame(f, f.index)], axis=1)
val_df.to_csv("val_df_with_features_v2.csv", index=False)

# Test: same treatment.
f = test_df.parallel_apply(predictor.predict, axis=1)
test_df = pd.concat([test_df, _features_to_frame(f, f.index)], axis=1)
test_df.to_csv("test_df_with_features_v2.csv", index=False)