In [17]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import imagehash
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm as tn
from sklearn.metrics import f1_score
import wandb

from pandarallel import pandarallel


tn.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=os.cpu_count())

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
DATA_PATH = Path("../data")
COMP_DATA_PATH = Path("../data")
IMAGE_PATH = Path("../dataset")

TEST_SET = COMP_DATA_PATH / "test-data.csv"
TRAIN_SPLIT = DATA_PATH / "train_split_20perval_grouped_stratified.csv"
VAL_SPLIT = DATA_PATH / "val_split_20perval_grouped_stratified.csv"

IMG_GLOB = "images*/*.jpg"

In [9]:
def bind_fs(df, path: Path, glob: str):
    mapping = {x.name: x for x in path.glob(glob)}
    return df.applymap(lambda x: mapping.get(x))

In [10]:
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

test_df = pd.read_csv(TEST_SET)
test_df[["image_url1", "image_url2"]] = test_df[["image_url1", "image_url2"]].applymap(lambda x: x.rsplit("/", 1)[-1])
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [11]:
class Predictor:
    def __init__(self, transform, col1="image_path1", col2="image_path2"):
        self.transform = transform
        self._col1 = col1
        self._col2 = col2

    def process_image(self, path):
        img = Image.open(path)
        return self.transform(img)

    def predict(self, row):
        if row[self._col1] is None or row[self._col2] is None:
            # print(row)
            return None
        try:
            hash1 = self.process_image(row[self._col1])
            hash2 = self.process_image(row[self._col2])
        except (UnidentifiedImageError, OSError):
            return None
        return hash1 - hash2

In [12]:
val_df

Unnamed: 0,image_url1,image_url2,is_same,image_path1,image_path2
0,941374542.jpg,941588763.jpg,1,../dataset/images-b2/941374542.jpg,../dataset/images-b1/941588763.jpg
1,899704859.jpg,940179676.jpg,0,../dataset/images-b2/899704859.jpg,../dataset/images-b2/940179676.jpg
2,892607076.jpg,928668915.jpg,0,../dataset/images-b3/892607076.jpg,../dataset/images-b2/928668915.jpg
3,917418509.jpg,917920631.jpg,1,../dataset/images-b3/917418509.jpg,../dataset/images-b3/917920631.jpg
4,899358242.jpg,910255023.jpg,0,../dataset/images-b3/899358242.jpg,../dataset/images-b2/910255023.jpg
...,...,...,...,...,...
18146,930719593.jpg,938692480.jpg,0,../dataset/images-b2/930719593.jpg,../dataset/images-b2/938692480.jpg
18147,930447711.jpg,931167498.jpg,1,../dataset/images-b2/930447711.jpg,../dataset/images-b2/931167498.jpg
18148,903267789.jpg,926730753.jpg,0,../dataset/images-b3/903267789.jpg,../dataset/images-b3/926730753.jpg
18149,924358116.jpg,925851577.jpg,0,../dataset/images-b2/924358116.jpg,../dataset/images-b1/925851577.jpg


In [13]:
predictor = Predictor(lambda x: imagehash.phash(x, hash_size=8, highfreq_factor=4))
train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9061), Label(value='0 / 9061'))), …

In [14]:
sweep_config = {
       'method': 'grid',
       'parameters': {
           'hash_size': {
               'values': [4, 8, 16, 32, 64]
           },
           'highfreq_factor': {
               'values': [2, 4, 8, 16, 32, 64]
           }
       }
    }

In [15]:
hashfuncs = {
	'ahash': imagehash.average_hash,
	'phash': imagehash.phash,
	'dhash': imagehash.dhash,
	'whash-haar': imagehash.whash,
	'whash-db4': lambda img: imagehash.whash(img, mode='db4'),
	'colorhash': imagehash.colorhash,
}

In [19]:
WANDB_NOTEBOOK_NAME = "11_phash_gridsearch"
sweep_id = wandb.sweep(sweep_config)

Create sweep with ID: db1ejh23
Sweep URL: https://wandb.ai/nikita-fordui/uncategorized/sweeps/db1ejh23


In [None]:
def phash_gridsearch():
    wandb.init(project="csc_hackathon_lun")
    predictor = Predictor(lambda x: imagehash.phash(x, 
                                                    hash_size=wandb.config.hash_size, 
                                                    highfreq_factor=wandb.config.highfreq_factor))
    train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)
    res = {}
    for i in np.arange(train_df["phash"].min(), train_df["phash"].max()):
        res[i] = f1_score(train_df["is_same"], train_df["phash"] < i)
    threshold = max(res, key=res.get)
    wandb.log({"train_f1": res.get(threshold), 
               "threshold": threshold})