In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import KFold
from tqdm import tqdm

In [None]:
# Load the three per-pair training feature tables.
train_simclr = pd.read_csv('train_simclr_large.csv')
train_hashes = pd.read_csv('train_hashes.csv')
train_local = pd.read_csv('train_local_features.csv')

# Each table is reduced to a handful of columns picked by POSITION (first
# column plus offsets counted from the end) and then renamed. The mappings
# below are "new column name -> positional index in the CSV".
# NOTE(review): this presumes a fixed column layout in the CSV files — confirm
# against the files, since a reordered CSV would silently mislabel features.
simclr_layout = {'ID': 0, 'simclr': -2}
hashes_layout = {
    'ID': 0,
    'phash': -1,
    'whash': -2,
    'dhash': -3,
    'ahash': -4,
    'is_same': -7,
    'image_name1': -5,
    'image_name2': -6,
}
local_layout = {'ID': 0, 'sp': -2, 'sp_score': -1}

train_simclr = train_simclr.iloc[:, list(simclr_layout.values())]
train_simclr.columns = list(simclr_layout)

train_hashes = train_hashes.iloc[:, list(hashes_layout.values())]
train_hashes.columns = list(hashes_layout)

train_local = train_local.iloc[:, list(local_layout.values())]
train_local.columns = list(local_layout)

# Inner-join everything on the pair ID into a single training frame.
train = train_simclr.merge(train_hashes, on='ID').merge(train_local, on='ID')

In [None]:
# Rows excluded from training; matched against the index labels of `train`.
# NOTE(review): presumably hand-picked bad pairs — confirm these are index
# labels of the merged frame and not values of the 'ID' column.
bad_samples = [50719, 51028, 54253, 58461, 62639, 65452, 69671, 69696, 70908, 72838, 75008, 76712, 83840]
subset = train[~train.index.isin(bad_samples)]

features = ['simclr', 'phash', 'dhash', 'whash', 'sp']

# Fixed seed so the forest — and therefore the cutoff and everything derived
# from it — is reproducible across Restart & Run All.
RANDOM_STATE = 42

reg = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
reg.fit(subset[features], subset['is_same'])
reg_pred = reg.predict_proba(subset[features])[:, 1]

# Pick the threshold at which precision and recall are closest to each other.
# precision/recall have one more element than thresholds (a trailing point
# with no associated threshold), so that last element is dropped before
# comparing. argmin returns the first minimum, i.e. the lowest such threshold.
precision, recall, thresholds = precision_recall_curve(subset['is_same'], reg_pred)
cutoff = thresholds[np.argmin(np.abs(precision[:-1] - recall[:-1]))]
print(cutoff)

# precision[i]/recall[i] describe predictions with score >= thresholds[i], so
# the cutoff must be applied inclusively; a strict '>' would drop every row
# whose probability ties with the cutoff (common with 100 trees, where
# probabilities are coarse).
# NOTE(review): this F1 (and the cutoff itself) is computed on the same rows
# the forest was fit on, so it is an optimistic estimate — consider
# out-of-fold predictions for threshold selection.
print(f1_score(subset['is_same'], reg_pred >= cutoff))

In [None]:
# Load the three per-pair feature tables for the test split.
test_simclr = pd.read_csv('test_simclr_large.csv')
test_hashes = pd.read_csv('test_hashes.csv')
test_local = pd.read_csv('test_local_features.csv')

# Unlike the training tables, the test tables are reduced by column NAME.
hash_columns = ['ID', 'phash', 'whash', 'dhash']
simclr_columns = ['ID', 'simclr']
local_columns = ['ID', 'sp', 'sp_score', 'image_name1', 'image_name2']

test_hashes = test_hashes[hash_columns]
test_simclr = test_simclr[simclr_columns]
test_local = test_local[local_columns]

# Inner-join everything on the pair ID into a single test frame.
test = test_simclr.merge(test_hashes, on='ID').merge(test_local, on='ID')

In [None]:
# Probability of the positive class ("same") for every test pair.
test_pred_proba = reg.predict_proba(test[features])[:, 1]

# `cutoff` is one of the thresholds returned by precision_recall_curve, whose
# precision/recall values are defined for predictions with score >= threshold.
# Apply it inclusively so rows whose probability ties with the cutoff are
# labelled the same way the threshold selection assumed.
test_pred = test_pred_proba >= cutoff

# Keep the hard 0/1 label on the test frame as well.
test['is_same'] = test_pred.astype(int)

# Submission layout: index 'ID', columns 'same' and 'different' (complementary
# 0/1 indicators).
submission_df = (
    test[['ID', 'is_same']]
    .set_index('ID')
    .rename(columns={'is_same': 'same'})
)
submission_df['different'] = 1 - submission_df['same']
submission_df.to_csv('submission.csv')