In [26]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import imagehash
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm as tn
from sklearn.metrics import f1_score
import wandb

from pandarallel import pandarallel


# Register tqdm's progress-bar helpers on pandas objects.
tn.pandas()
# Start pandarallel with a progress bar and a fixed pool of 80 workers.
# NOTE(review): 80 presumably matches the machine this was written on —
# confirm it suits the core count of the machine you run this on.
pandarallel.initialize(progress_bar=True, nb_workers=80)

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [27]:
# Root directories, relative to the notebook's working directory.
DATA_PATH = Path("../data")        # holds the train / validation split CSVs
COMP_DATA_PATH = Path("../data")   # holds the test CSV
IMAGE_PATH = Path("../dataset")    # searched for image files with IMG_GLOB

# CSV inputs read by the loading cell.
TEST_SET = COMP_DATA_PATH.joinpath("test-data.csv")
TRAIN_SPLIT = DATA_PATH.joinpath("train_split_20perval_grouped_stratified.csv")
VAL_SPLIT = DATA_PATH.joinpath("val_split_20perval_grouped_stratified.csv")

# Glob pattern, relative to IMAGE_PATH, that selects the image files.
IMG_GLOB = "images*/*.jpg"

In [28]:
def bind_fs(df, path: Path, glob: str):
    """Replace every file name in ``df`` with the matching path found on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are bare file names (e.g. ``"123.jpg"``).
    path : Path
        Directory that ``glob`` is evaluated against.
    glob : str
        Glob pattern, relative to ``path``, selecting the candidate files.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``df``; each cell is the ``Path`` whose
        ``.name`` equals the original cell, or ``None`` when no file matched.
    """
    # Files are keyed by bare name. Sorting makes the result reproducible when
    # the same name exists in several directories: the path that sorts last wins
    # instead of whichever one the filesystem happened to list last.
    mapping = {found.name: found for found in sorted(path.glob(glob))}
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and later
    # dropped, so use the new name when it exists and fall back otherwise.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(mapping.get)

In [29]:
# Load each split and add image_path1 / image_path2 columns that point at the
# files on disk (None where bind_fs finds no file with that name).
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

test_df = pd.read_csv(TEST_SET)
# Keep only the part after the last "/" of each test URL so the values can be
# matched by file name. The vectorised .str accessor replaces
# DataFrame.applymap (deprecated since pandas 2.1) and leaves a missing URL as
# NaN instead of raising AttributeError on it.
test_url_cols = ["image_url1", "image_url2"]
test_df[test_url_cols] = test_df[test_url_cols].apply(lambda col: col.str.rsplit("/", n=1).str[-1])
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [30]:
class Predictor:
    """Compute the distance between the hashes of the two images named in a row.

    Parameters
    ----------
    transform : callable
        Called with an opened ``PIL.Image.Image``; its return values for the
        two images must support subtraction (``hash1 - hash2``). It has to
        finish reading the image before it returns, because the file is closed
        right afterwards.
    col1, col2 : str
        Keys under which a row stores the two image paths.
    """

    def __init__(self, transform, col1="image_path1", col2="image_path2"):
        self.transform = transform
        self._col1 = col1
        self._col2 = col2

    @staticmethod
    def _is_missing(path):
        """Return True for ``None`` and for a float NaN (pandas' usual missing marker)."""
        # `path != path` is only true for NaN; the isinstance guard keeps the
        # comparison away from arbitrary path-like objects.
        return path is None or (isinstance(path, float) and path != path)

    def process_image(self, path):
        """Open ``path``, apply ``transform`` to the image and return the result."""
        # The context manager closes the underlying file as soon as the hash is
        # computed; otherwise every processed image keeps a file handle open
        # until it is garbage collected.
        with Image.open(path) as img:
            return self.transform(img)

    def predict(self, row):
        """Return ``hash1 - hash2`` for ``row``, or ``None`` when it cannot be computed.

        ``None`` is returned when either path is missing, or when opening or
        hashing an image raises ``UnidentifiedImageError`` / ``OSError``.
        """
        first, second = row[self._col1], row[self._col2]
        if self._is_missing(first) or self._is_missing(second):
            return None
        try:
            hash1 = self.process_image(first)
            hash2 = self.process_image(second)
        except (UnidentifiedImageError, OSError):
            return None
        return hash1 - hash2

In [31]:
# Display the validation frame to check that the image_path columns were filled in.
val_df

Unnamed: 0,image_url1,image_url2,is_same,image_path1,image_path2
0,941374542.jpg,941588763.jpg,1,../dataset/images-b2/941374542.jpg,../dataset/images-b1/941588763.jpg
1,899704859.jpg,940179676.jpg,0,../dataset/images-b2/899704859.jpg,../dataset/images-b2/940179676.jpg
2,892607076.jpg,928668915.jpg,0,../dataset/images-test/892607076.jpg,../dataset/images-b2/928668915.jpg
3,917418509.jpg,917920631.jpg,1,../dataset/images-b3/917418509.jpg,../dataset/images-b3/917920631.jpg
4,899358242.jpg,910255023.jpg,0,../dataset/images-b3/899358242.jpg,../dataset/images-b2/910255023.jpg
...,...,...,...,...,...
18146,930719593.jpg,938692480.jpg,0,../dataset/images-b2/930719593.jpg,../dataset/images-b2/938692480.jpg
18147,930447711.jpg,931167498.jpg,1,../dataset/images-b2/930447711.jpg,../dataset/images-b2/931167498.jpg
18148,903267789.jpg,926730753.jpg,0,../dataset/images-b3/903267789.jpg,../dataset/images-b3/926730753.jpg
18149,924358116.jpg,925851577.jpg,0,../dataset/images-b2/924358116.jpg,../dataset/images-b1/925851577.jpg


In [32]:
# Baseline run: pHash with hash_size=8 and highfreq_factor=4, applied to every
# training pair in parallel. Predictor.predict returns None for rows whose
# image path is None or whose image cannot be opened.
predictor = Predictor(lambda x: imagehash.phash(x, hash_size=8, highfreq_factor=4))
train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=907), Label(value='0 / 907'))), HB…

In [None]:
def _grid_sweep(name, **parameters):
    """Build a grid-method sweep definition for the ``csc_hackathon_lun`` project.

    Each keyword argument becomes one swept parameter whose candidate values
    are the given list.
    """
    return {
        'name': name,
        'method': 'grid',
        "project": "csc_hackathon_lun",
        'parameters': {key: {'values': list(values)} for key, values in parameters.items()},
    }


# Hash sizes tried by every hash type that has a `hash_size` argument.
_HASH_SIZES = (4, 8, 16, 32)

phash_sweep_config = _grid_sweep(
    'phash_sweep',
    hash_size=_HASH_SIZES,
    highfreq_factor=[2, 4, 8, 16, 32, 64],
)
ahash_sweep_config = _grid_sweep('ahash_sweep', hash_size=_HASH_SIZES)
dhash_sweep_config = _grid_sweep('dhash_sweep', hash_size=_HASH_SIZES)
whash_sweep_config = _grid_sweep(
    'whash_sweep',
    hash_size=_HASH_SIZES,
    mode=['haar', 'db4'],
)
colorhash_sweep_config = _grid_sweep(
    'colorhash_sweep',
    binbits=[1, 3, 5, 7, 9, 12, 15],
)

In [None]:
def _best_f1_threshold(labels, distances):
    """Find the cut-off that maximises F1 for the rule ``distance < cut-off``.

    Parameters
    ----------
    labels : pandas.Series
        Ground-truth labels, passed to ``f1_score`` as ``y_true``.
    distances : pandas.Series
        Hash distances, aligned with ``labels``. Missing entries never satisfy
        ``distance < cut-off`` and therefore always count as negatives.

    Returns
    -------
    tuple
        ``(threshold, f1)`` for the best cut-off; on ties the smallest wins.

    Raises
    ------
    ValueError
        If ``distances`` holds no value at all, so there is nothing to scan.
    """
    if not distances.notna().any():
        raise ValueError("no hash distance could be computed; cannot search for a threshold")
    lowest, highest = distances.min(), distances.max()
    # Candidates run from `lowest` (no pair predicted positive) up to and
    # including `highest + 1` (every pair with a distance predicted positive).
    # np.arange excludes its stop value, hence `+ 2`; stopping at `highest`
    # would skip the two largest cut-offs and yield nothing when all distances
    # are equal.
    scores = {
        cutoff: f1_score(labels, distances < cutoff)
        for cutoff in np.arange(lowest, highest + 2)
    }
    best = max(scores, key=scores.get)
    return best, scores[best]


def _run_hash_sweep(hash_func, *param_names):
    """Run one sweep trial: hash the training pairs, tune the cut-off, log to W&B.

    ``param_names`` are the ``wandb.config`` attributes forwarded to
    ``hash_func`` as keyword arguments of the same name. The distances are
    written to ``train_df["phash"]`` whatever the hash type, as before.
    """
    wandb.init(project="csc_hackathon_lun")
    # Read the sweep values once, here, so the function handed to
    # parallel_apply only closes over plain Python values and not wandb state.
    hash_kwargs = {name: getattr(wandb.config, name) for name in param_names}
    predictor = Predictor(lambda image: hash_func(image, **hash_kwargs))
    train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)
    threshold, train_f1 = _best_f1_threshold(train_df["is_same"], train_df["phash"])
    wandb.log({"train_f1": train_f1,
               "threshold": threshold})


def phash_gridsearch():
    """Sweep trial for ``imagehash.phash`` (config: ``hash_size``, ``highfreq_factor``)."""
    _run_hash_sweep(imagehash.phash, "hash_size", "highfreq_factor")


def ahash_gridsearch():
    """Sweep trial for ``imagehash.ahash`` (config: ``hash_size``)."""
    _run_hash_sweep(imagehash.ahash, "hash_size")


def dhash_gridsearch():
    """Sweep trial for ``imagehash.dhash`` (config: ``hash_size``)."""
    _run_hash_sweep(imagehash.dhash, "hash_size")


def whash_gridsearch():
    """Sweep trial for ``imagehash.whash`` (config: ``hash_size``, ``mode``)."""
    _run_hash_sweep(imagehash.whash, "hash_size", "mode")


def colorhash_gridsearch():
    """Sweep trial for ``imagehash.colorhash`` (config: ``binbits``)."""
    _run_hash_sweep(imagehash.colorhash, "binbits")

In [None]:
# Sweep definitions and, in the same order, the functions that run one trial
# of each. The order is significant for any code that slices `configs`.
_sweep_definitions = (
    phash_sweep_config,
    ahash_sweep_config,
    whash_sweep_config,
    dhash_sweep_config,
    colorhash_sweep_config,
)
_sweep_functions = (
    phash_gridsearch,
    ahash_gridsearch,
    whash_gridsearch,
    dhash_gridsearch,
    colorhash_gridsearch,
)
configs = list(zip(_sweep_definitions, _sweep_functions))

In [None]:
# Tell W&B which notebook the runs come from. This is set through os.environ
# rather than `%env NAME = "..."` because the magic stores everything after
# "=" verbatim, so the surrounding double quotes would end up in the value.
os.environ["WANDB_NOTEBOOK_NAME"] = "/gpfs/space/home/fordui/csc_hackathon_lun/code/11_phash_gridsearch.ipynb"

In [None]:
# Create a W&B sweep for each selected definition and run its trials with the
# paired function.
# NOTE(review): `configs[-2:]` launches only the last two entries of `configs`
# (the dhash and colorhash sweeps as the list is defined in this notebook);
# presumably the other sweeps were run in an earlier session — confirm before
# relying on a full re-run.
for config, gs_func in configs[-2:]:
    sweep_id = wandb.sweep(config)
    wandb.agent(sweep_id, function=gs_func)

## Best pHash

In [21]:
# Re-hash the training pairs with pHash at hash_size=16, highfreq_factor=8.
# NOTE(review): presumably the best configuration found by the phash sweep —
# confirm against the W&B results.
predictor = Predictor(lambda x: imagehash.phash(x, hash_size=16, highfreq_factor=8))
train_df["phash"] = train_df.parallel_apply(predictor.predict, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=907), Label(value='0 / 907'))), HB…

In [22]:
# F1 on the training pairs for every cut-off of the rule `phash < cut-off`.
# Candidates run from the smallest distance (nothing predicted "same") up to
# and including the largest distance + 1 (every hashed pair predicted "same").
# np.arange excludes its stop value, hence `+ 2`; stopping at the maximum
# itself would skip the two largest cut-offs and leave `res` empty when all
# distances are equal.
res = {}
phash_min, phash_max = train_df["phash"].min(), train_df["phash"].max()
for i in np.arange(phash_min, phash_max + 2):
    res[i] = f1_score(train_df["is_same"], train_df["phash"] < i)

In [23]:
# Take the cut-off with the highest F1 (the first one in `res` on ties), then
# show the full score table next to it.
threshold = max(res.items(), key=lambda item: item[1])[0]

res, threshold

({0.0: 0.0,
  1.0: 0.5651229202880556,
  2.0: 0.5651229202880556,
  3.0: 0.6891075631681933,
  4.0: 0.6891075631681933,
  5.0: 0.7441999456669384,
  6.0: 0.7441999456669384,
  7.0: 0.7874957383755999,
  8.0: 0.7874957383755999,
  9.0: 0.8197876250765774,
  10.0: 0.8197876250765774,
  11.0: 0.8472277622656561,
  12.0: 0.8472277622656561,
  13.0: 0.8688207812538218,
  14.0: 0.8688207812538218,
  15.0: 0.8874990979722417,
  16.0: 0.8874990979722417,
  17.0: 0.9022120953194721,
  18.0: 0.9022120953194721,
  19.0: 0.9147817925856405,
  20.0: 0.9147817925856405,
  21.0: 0.9238751105113769,
  22.0: 0.9238751105113769,
  23.0: 0.931679600886918,
  24.0: 0.931679600886918,
  25.0: 0.9376909120139639,
  26.0: 0.9376909120139639,
  27.0: 0.9426448736998514,
  28.0: 0.9426448736998514,
  29.0: 0.9465485435787652,
  30.0: 0.9465485435787652,
  31.0: 0.9494242954148027,
  32.0: 0.9494242954148027,
  33.0: 0.9521134417614281,
  34.0: 0.9521134417614281,
  35.0: 0.9545290182607123,
  36.0: 0.954529018

In [24]:
# Hash the validation pairs, label a pair 1 when its distance is below the
# chosen threshold (rows without a distance become 0), and report F1.
val_df["phash"] = val_df.parallel_apply(predictor.predict, axis=1)
val_df["predict"] = val_df["phash"].lt(threshold).astype(int)
f1_score(y_true=val_df["is_same"], y_pred=val_df["predict"])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=227), Label(value='0 / 227'))), HB…

0.9827812254173586

In [25]:
# Hash the test pairs and apply the same threshold rule as on validation.
test_df["phash"] = test_df.parallel_apply(predictor.predict, axis=1)
test_df["predict"] = test_df["phash"].lt(threshold).astype(int)

# Write the ID / is_same columns as the submission file.
submission = test_df.rename(columns={"predict": "is_same"})[["ID", "is_same"]]
submission.to_csv("phash_gridsearch_best.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=284), Label(value='0 / 284'))), HB…