In [None]:
import os
import glob

import cv2
import imagehash
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve
from tqdm import tqdm

In [None]:
def crop_horizontal_white_stripes(image_path, white_threshold=250):
    """Load an image and crop away its all-white outer margins.

    A pixel is treated as white when its grayscale value is strictly greater
    than ``white_threshold``. The result is the image restricted to the
    bounding box of all non-white pixels, so leading/trailing rows and
    columns that contain only white pixels are removed.

    Args:
        image_path: Path (or file object) handed to ``PIL.Image.open``.
        white_threshold: Grayscale cut-off; values above it count as white.
            Defaults to 250.

    Returns:
        The image as a numpy array (``np.array`` of the opened image) sliced
        to the non-white bounding box along its first two axes. If every
        pixel is white, the array is returned uncropped.
    """
    with Image.open(image_path) as image:
        gray = np.array(image.convert('L'))
        pixels = np.array(image)

    # Locate rows/columns holding at least one non-white pixel. Reducing with
    # any() per axis keeps fully white rows/columns from influencing the
    # margins of the other axis (a per-row argmin reports 0 for an all-white
    # row, which would cancel the crop whenever such a row exists).
    has_content = gray <= white_threshold
    content_rows = np.flatnonzero(has_content.any(axis=1))
    content_cols = np.flatnonzero(has_content.any(axis=0))

    # Entirely white (or empty) image: nothing sensible to crop to.
    if content_rows.size == 0:
        return pixels

    top, bottom = content_rows[0], content_rows[-1] + 1
    left, right = content_cols[0], content_cols[-1] + 1
    return pixels[top:bottom, left:right]


# Hash callables from `imagehash`, keyed by the name under which each one's
# similarity score is stored as a DataFrame column further down.
hash_functions = dict(
    dhash=imagehash.dhash,
    whash=imagehash.whash,
    phash=imagehash.phash,
    ahash=imagehash.average_hash,
)

In [None]:
# Build hash-similarity features for every image pair of the training set and
# save them to 'train_hashes.csv'.
train_df = pd.read_csv('/kaggle/input/csc-hackathon-2023-lunua-task/train.csv')
train_df['image_name1'] = train_df['image_url1'].apply(os.path.basename)
train_df['image_name2'] = train_df['image_url2'].apply(os.path.basename)

# -1 is the "not computed" sentinel. It is stored as a float because the
# scores written below are floats; filling an int column with floats relies
# on implicit upcasting, which newer pandas versions deprecate.
for name in hash_functions.keys():
    train_df[name] = -1.0

n_failed = 0
for ind in tqdm(train_df.index):
    try:
        # Convert each cropped array to a PIL image once and reuse it for
        # every hash function.
        image_1 = Image.fromarray(crop_horizontal_white_stripes(f"/kaggle/input/csc-hackathon-2023-lun/train/{train_df.loc[ind, 'image_name1']}"))
        image_2 = Image.fromarray(crop_horizontal_white_stripes(f"/kaggle/input/csc-hackathon-2023-lun/train/{train_df.loc[ind, 'image_name2']}"))

        # Compute every score before touching the frame so that a failure in
        # any hash leaves the whole row at the sentinel value.
        similarities = {}
        for name, func in hash_functions.items():
            hash_1 = func(image_1).hash.reshape(-1)
            hash_2 = func(image_2).hash.reshape(-1)
            # Fraction of hash bits on which the two images agree.
            similarities[name] = (hash_1 == hash_2).sum() / len(hash_1)

    except Exception:
        # Best effort: skip pairs whose images cannot be read or hashed.
        # `Exception` (rather than a bare except) keeps Ctrl-C / kernel
        # interrupts working during this long loop.
        n_failed += 1
        continue

    for name, value in similarities.items():
        train_df.loc[ind, name] = value

print(f"train: skipped {n_failed} of {len(train_df)} image pairs")

# Drop the pairs that could not be processed (rows are filled all-or-nothing,
# so checking a single hash column is sufficient).
train_df = train_df[train_df['dhash'] != -1]
train_df.to_csv('train_hashes.csv')

In [None]:
# Build hash-similarity features for every image pair of the test set and
# save them to 'test_hashes.csv'. Pairs that cannot be processed are kept,
# with all hash columns left at the -1 sentinel.
test_df = pd.read_csv('/kaggle/input/csc-hackathon-2023-lunua-task/test-data.csv')
test_df['image_name1'] = test_df['image_url1'].apply(os.path.basename)
test_df['image_name2'] = test_df['image_url2'].apply(os.path.basename)

# -1 is the "not computed" sentinel. It is stored as a float because the
# scores written below are floats; filling an int column with floats relies
# on implicit upcasting, which newer pandas versions deprecate.
for name in hash_functions.keys():
    test_df[name] = -1.0

n_failed = 0
for ind in tqdm(test_df.index):
    try:
        # Convert each cropped array to a PIL image once and reuse it for
        # every hash function.
        image_1 = Image.fromarray(crop_horizontal_white_stripes(f"/kaggle/input/csc-hackathon-2023-lun/test/{test_df.loc[ind, 'image_name1']}"))
        image_2 = Image.fromarray(crop_horizontal_white_stripes(f"/kaggle/input/csc-hackathon-2023-lun/test/{test_df.loc[ind, 'image_name2']}"))

        # Compute every score before touching the frame so that a failure in
        # any hash leaves the whole row at the sentinel value.
        similarities = {}
        for name, func in hash_functions.items():
            hash_1 = func(image_1).hash.reshape(-1)
            hash_2 = func(image_2).hash.reshape(-1)
            # Fraction of hash bits on which the two images agree.
            similarities[name] = (hash_1 == hash_2).sum() / len(hash_1)

    except Exception:
        # Best effort: skip pairs whose images cannot be read or hashed.
        # `Exception` (rather than a bare except) keeps Ctrl-C / kernel
        # interrupts working during this long loop.
        n_failed += 1
        continue

    for name, value in similarities.items():
        test_df.loc[ind, name] = value

print(f"test: {n_failed} of {len(test_df)} image pairs left at the -1 sentinel")

test_df.to_csv('test_hashes.csv')