## Import libraries

In [None]:
import numpy as np
import seaborn as sb
%matplotlib inline 
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_curve, accuracy_score, precision_score, recall_score, f1_score, auc
from sklearn.metrics import confusion_matrix
from skimage.metrics import structural_similarity as ssim
from sklearn.decomposition import PCA as PCA
from sklearn.svm import SVC as SVC
from time import time

import os
import cv2

## Dataset class for data processing

In [None]:
class Dataset():
    """Image pairs stored on disk, grouped by label, plus pair-distance helpers.

    The directory layout walked by ``get_all_data`` is::

        data_path/<label>/<pair_dir>/<image files>

    Each ``<pair_dir>`` must contain at least two files; the first two (in
    sorted order) form the pair. The label of a pair is the name of its
    ``<label>`` directory (a string).
    """

    def __init__(self, data_path, transform) -> None:
        """Store the configuration; nothing is read from disk here.

        Args:
            data_path: Root directory containing one sub-directory per label.
            transform: If truthy, ``get_item`` returns grayscale, resized and
                min-max normalised images instead of the raw decoded ones.
        """
        self.data_path = data_path
        self.transform = transform
        # (img1_path, img2_path) tuples, filled by get_all_data().
        self.image_pairs = []
        # Label directory name for each entry of image_pairs (same index).
        self.labels = []

    def get_all_data(self) -> list:
        """Index every image pair under ``data_path``.

        Rebuilds ``self.image_pairs`` and ``self.labels`` from scratch, so
        calling this more than once does not duplicate entries. Directory
        listings are sorted so that indices are reproducible across runs and
        machines (``os.listdir`` order is arbitrary). Entries that are not
        directories (stray files next to the label / pair folders) are skipped.

        Returns:
            A list with one dict per pair:
            ``{'img1': path, 'img2': path, 'label': label}``.

        Raises:
            IndexError: If a pair directory holds fewer than two files.
        """
        self.image_pairs = []
        self.labels = []
        all_pairs = []
        for label in sorted(os.listdir(self.data_path)):
            image_data_dir = os.path.join(self.data_path, label)
            if not os.path.isdir(image_data_dir):
                continue
            for pair_dir in sorted(os.listdir(image_data_dir)):
                identity_photos_dir = os.path.join(image_data_dir, pair_dir)
                if not os.path.isdir(identity_photos_dir):
                    continue
                pair = sorted(os.listdir(identity_photos_dir))
                img1_path = os.path.join(identity_photos_dir, pair[0])
                img2_path = os.path.join(identity_photos_dir, pair[1])
                self.image_pairs.append((img1_path, img2_path))
                self.labels.append(label)
                all_pairs.append({'img1': img1_path, 'img2': img2_path, 'label': label})

        return all_pairs

    @staticmethod
    def _read_image(path):
        """Decode one image with OpenCV, failing loudly if it cannot be read."""
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"could not read image: {path}")
        return img

    @staticmethod
    def _transform_img(img, size=(96, 96)):
        """Convert a BGR image to grayscale, resize it and stretch it to 0..255."""
        grey_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        resize_img = cv2.resize(grey_img, size)
        return cv2.normalize(resize_img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

    @staticmethod
    def _widen(img):
        """Return ``img`` in a dtype where subtraction cannot wrap around.

        Decoded images are typically unsigned 8-bit; ``a - b`` on such arrays
        wraps modulo 256 instead of going negative, which corrupts distances.
        """
        arr = np.asarray(img)
        if arr.dtype == np.bool_ or np.issubdtype(arr.dtype, np.integer):
            return arr.astype(np.int64)
        return arr

    def get_item(self, idx) -> tuple:
        """Load the pair at position ``idx``.

        Args:
            idx: Index into the pairs collected by ``get_all_data``.

        Returns:
            ``(img1, img2, label)``. The images are preprocessed when
            ``self.transform`` is truthy, otherwise returned as decoded.

        Raises:
            IndexError: If ``idx`` is out of range.
            FileNotFoundError: If either image cannot be decoded.
        """
        img1_path, img2_path = self.image_pairs[idx]
        img1 = self._read_image(img1_path)
        img2 = self._read_image(img2_path)

        if self.transform:
            return self._transform_img(img1), self._transform_img(img2), self.labels[idx]

        return img1, img2, self.labels[idx]

    def calc_dist_euclidean(self, img1, img2):
        """Return the Euclidean (L2) distance between two same-shaped arrays."""
        diff = self._widen(img1) - self._widen(img2)
        return np.sqrt(np.sum(diff ** 2))

    def calc_dist_simi(self, img1, img2):
        """Return the structural similarity (SSIM) of the two images.

        The data range handed to SSIM is the value spread of ``img1``.
        """
        return ssim(img1, img2, data_range=img1.max()-img1.min())

    def calc_dist_absolute(self, img1, img2):
        """Return the sum of absolute element-wise differences (L1 distance)."""
        return np.sum(np.absolute(self._widen(img1) - self._widen(img2)))

    def calc_dist_cosine(self, img1, img2):
        """Return the cosine similarity of the two arrays, flattened to vectors.

        The value is ``dot(a, b) / (|a| * |b|)``; it is 0.0 when either input
        has zero norm (the ratio is undefined there).
        """
        a = np.asarray(img1, dtype=np.float64).ravel()
        b = np.asarray(img2, dtype=np.float64).ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return np.dot(a, b) / denom

    def __len__(self):
        """Return the number of pairs indexed so far."""
        return len(self.image_pairs)

## Get data

In [None]:
# Root folder that holds one sub-folder per label.
data_path = '../data/'

# transform=True makes get_item() return preprocessed images.
dataset = Dataset(data_path=data_path, transform=True)

# Index every pair on disk; this also fills dataset.image_pairs / dataset.labels.
all_data = dataset.get_all_data()

In [None]:
def all_images(all_data, dataset, n=25000):
    """Build one absolute-difference feature per pair, sampled from both halves.

    Takes ``n`` pairs starting at index 0 and ``n`` pairs starting at the
    midpoint of the dataset. This only balances the classes if the index order
    puts one class in each half -- NOTE(review): confirm that holds for the
    data on disk.

    Args:
        all_data: Sequence of indexed pairs; only its length is used.
        dataset: Object providing ``get_item(i) -> (img1, img2, label)`` and
            ``calc_dist_absolute(img1, img2)``.
        n: Number of pairs to take from each half. Values larger than half the
            dataset are clamped, so the two ranges never overlap and never run
            past the end.

    Returns:
        ``(X, y)``: a list of 0-d arrays (one distance per pair) and the list
        of matching labels.
    """
    half = len(all_data) // 2
    per_half = max(0, min(n, half))

    X = []
    y = []
    for start in (0, half):
        for i in range(start, start + per_half):
            img1, img2, label = dataset.get_item(i)
            X.append(np.array(dataset.calc_dist_absolute(img1, img2)))
            y.append(label)

    return X, y

# Request 10000 pairs from each half of the dataset.
X, y = all_images(all_data, dataset, n=10000)

## Data split

In [None]:
# Turn the collected Python lists into arrays.
X = np.array(X)
y = np.array(y)
print(X.shape)

# Each sample is a single distance value, so reshape to one column before
# handing the features to scikit-learn; 15% of the samples are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X.reshape(-1, 1),
    y,
    test_size=0.15,
    random_state=42,
)

## Training

In [None]:
# Base estimator; class_weight='balanced' reweights classes by frequency.
clf = SVC(class_weight='balanced')

# Hyper-parameter combinations to evaluate (2 values of C x 2 kernels).
param_grid = {'C': [0.01, 0.05], 'kernel': ['linear', 'sigmoid']}

# 5-fold search tracking four metrics; the final model is refit on accuracy.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit='accuracy',
    verbose=3,
    return_train_score=True,
    n_jobs=6,
)

# Labels are cast to int before fitting.
grid_search.fit(X_train, y_train.astype(int))

In [None]:
# Pull the winning configuration and its refit model out of the search.
best_svm = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

In [None]:
import pandas as pd

# Dump the full cross-validation table (one row per parameter combination)
# to disk in CSV format.
file = 'svm_scores.txt'
scores = pd.DataFrame(grid_search.cv_results_)
scores.to_csv(path_or_buf=file)

In [None]:
# Predict on the held-out features exactly as they were split: the model was
# fitted on uncast features, and casting them to int here would truncate any
# fractional distance and silently change the inputs.
y_pred = best_svm.predict(X_test)
print(classification_report(y_test.astype(int), y_pred.astype(int)))

In [None]:
# Counts of (true label, predicted label) combinations on the test split.
conf_matrix = confusion_matrix(y_test.astype(int), y_pred.astype(int))

# The same tick labels are used on both axes.
class_names = ['Dissimilar', 'Similar']

plt.figure(figsize=(8, 6))
sb.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Cast once, then report each test-set metric with its caption.
y_true_int = y_test.astype(int)
y_pred_int = y_pred.astype(int)

for caption, metric in (
    ("acc: ", accuracy_score),
    ("f1_score: ", f1_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
):
    print(caption, metric(y_true_int, y_pred_int))

In [None]:
# A ROC curve sweeps a threshold over a continuous score. Feeding it hard 0/1
# predictions yields a single operating point (a three-point "curve"), so use
# the SVM's signed distance to the decision boundary instead.
# NOTE(review): assumes a binary problem whose positive class is label 1.
y_score = best_svm.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test.astype(int), y_score)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
import pickle

# Persist the fitted model selected by the grid search. `clf` is only the
# unfitted template handed to GridSearchCV (which fits clones of it), so
# pickling `clf` would save a model that cannot predict.
# NOTE: only unpickle this file from a trusted source; pickle can run code.
with open('best_svm.pkl', 'wb') as f:
    pickle.dump(best_svm, f)