In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import os
import cv2
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.cluster import KMeans

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [None]:
# Report the version of the `python` executable that the shell resolves.
!python --version

In [None]:
! pip install opencv-python==3.4.0.12

In [None]:
pip install opencv-contrib-python==3.2.0.7

In [None]:
def load_images(folder, label):
    """Load every readable image in ``folder`` as a grayscale array.

    Files are visited in sorted filename order so that the returned lists,
    and any positional slicing callers perform on them, are reproducible
    (``os.listdir`` gives no ordering guarantee).

    Args:
        folder: Directory containing the image files.
        label: Class label attached to every image loaded from ``folder``.

    Returns:
        A tuple ``(images, labels, writer_ids)`` of three index-aligned lists:
        the grayscale images, ``label`` repeated once per image, and the
        writer id of each image, taken as the last three characters of the
        filename without its extension.
    """
    images = []
    labels = []
    writer_ids = []
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for entries it cannot decode; skip them.
            continue
        images.append(img)
        labels.append(label)
        # Strip the real extension first so the id stays correct for
        # extensions that are not exactly three characters long (".jpeg").
        stem = os.path.splitext(filename)[0]
        writer_ids.append(stem[-3:])
    return images, labels, writer_ids

def preprocess_image(img, size=(800, 1500), blur_ksize=(5, 5)):
    """Denoise, binarise and resize a grayscale image.

    Args:
        img: Grayscale image to process (presumably single-channel 8-bit, as
            produced by ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)``).
        size: Target size handed to ``cv2.resize``; OpenCV interprets it as
            ``(width, height)``. Defaults to ``(800, 1500)``.
        blur_ksize: Kernel size for the Gaussian blur. Defaults to ``(5, 5)``.

    Returns:
        The blurred, Otsu-thresholded and resized image.
    """
    blurred = cv2.GaussianBlur(img, blur_ksize, 0)
    # With THRESH_OTSU the threshold is chosen automatically; the explicit
    # threshold argument (0) is only a placeholder.
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # NOTE(review): resizing happens after thresholding with cv2.resize's
    # default interpolation, so the output may no longer be strictly two-level.
    return cv2.resize(binary, size)

def extract_surf_features(images, feature_length=64, hessian_threshold=200):
    """Build one fixed-length SURF feature vector per image.

    For every image the first ``feature_length`` SURF descriptors (in the
    order the detector returns them) are concatenated into a single row.
    Images with fewer descriptors, or none at all, are zero-padded so that
    every row has the same length.

    Args:
        images: Iterable of images accepted by ``detectAndCompute``.
        feature_length: Number of descriptors kept per image.
        hessian_threshold: Hessian threshold passed to ``SURF_create``.

    Returns:
        A ``float32`` array of shape
        ``(n_images, feature_length * descriptor_size)``.
    """
    surf = cv2.xfeatures2d.SURF_create(hessian_threshold)
    # Ask the detector for its descriptor width instead of assuming 64, so the
    # zero rows used for descriptor-less images always match the other rows.
    descriptor_size = surf.descriptorSize()

    images = list(images)  # allow any iterable, including generators
    features = np.zeros(
        (len(images), feature_length * descriptor_size), dtype=np.float32
    )
    for row, img in zip(features, images):
        _, descriptors = surf.detectAndCompute(img, None)
        if descriptors is None:
            continue  # no keypoints: the row stays all-zero
        kept = descriptors[:feature_length]
        # Row-major flattening; any unused tail of the row stays zero-padded.
        row[:kept.size] = kept.ravel()
    return features

def train_and_evaluate(X_train, X_test, y_train, y_test, C=10, gamma=0.01, random_state=42):
    """Fit an RBF-kernel SVM and score it on the held-out split.

    Args:
        X_train, X_test: Feature matrices for training and evaluation.
        y_train, y_test: Binary labels matching the feature matrices.
        C: SVM regularisation parameter.
        gamma: RBF kernel coefficient.
        random_state: Seed for the internal shuffling scikit-learn performs
            when ``probability=True``; fixing it makes the probability
            estimates, and therefore the ROC curve and AUC, repeatable
            between runs.

    Returns:
        A tuple ``(conf_matrix, accuracy, precision, recall, f1, fpr, tpr,
        roc_auc)`` computed on the test split.
    """
    model = SVC(kernel='rbf', C=C, gamma=gamma, probability=True, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (label 1 for 0/1 labels); it serves as the ROC score.
    y_prob = model.predict_proba(X_test)[:, 1]

    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    return conf_matrix, accuracy, precision, recall, f1, fpr, tpr, roc_auc

In [None]:
# Read both classes from disk: genuine signatures are labelled 1 and
# forgeries 0. Each call yields index-aligned images, labels and writer ids.
genuine_images, genuine_labels, genuine_writer_ids = load_images(
    folder=r"/kaggle/input/icdar-2009/genuines",
    label=1,
)
forgery_images, forgery_labels, forgery_writer_ids = load_images(
    folder=r"/kaggle/input/icdar-2009/forgeries",
    label=0,
)

In [None]:
# Keep at most SAMPLES_PER_CLASS items per class (176 images in total when
# both folders hold at least 88). The writer ids are truncated together with
# the images and labels so the three lists of each class stay index-aligned;
# otherwise an index looked up in the id list could point past the end of the
# shortened image list.
SAMPLES_PER_CLASS = 88
genuine_images = genuine_images[:SAMPLES_PER_CLASS]
forgery_images = forgery_images[:SAMPLES_PER_CLASS]
genuine_labels = genuine_labels[:SAMPLES_PER_CLASS]
forgery_labels = forgery_labels[:SAMPLES_PER_CLASS]
genuine_writer_ids = genuine_writer_ids[:SAMPLES_PER_CLASS]
forgery_writer_ids = forgery_writer_ids[:SAMPLES_PER_CLASS]

# Combine data (genuine samples first, then forgeries)
all_images = genuine_images + forgery_images
all_labels = genuine_labels + forgery_labels

# Pre-process images
all_images = [preprocess_image(img) for img in all_images]

# Extract features: one fixed-length row per image, in the order of all_labels
features = extract_surf_features(all_images)

In [None]:
# Evaluate the classifier for several train/test split ratios and overlay the
# resulting ROC curves in one figure.
ratios = [(0.5, 0.5), (0.625, 0.375), (0.75, 0.25)]
plt.figure(figsize=(8, 6))
for train_ratio, test_ratio in ratios:
    # Only train_size is passed; train_test_split uses the remainder as the
    # test set, test_ratio is used for reporting.
    X_train, X_test, y_train, y_test = train_test_split(features, all_labels, train_size=train_ratio, random_state=42)
    conf_matrix, accuracy, precision, recall, f1, fpr, tpr, roc_auc = train_and_evaluate(X_train, X_test, y_train, y_test)
    split_name = f'{train_ratio * 100}-{test_ratio * 100}'
    print(f'Train-Test Split: {split_name}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'ROC AUC: {roc_auc:.2f}\n')
    # Name the split in the legend entry so the three curves can be told apart.
    plt.plot(fpr, tpr, label=f'{split_name} split (AUC = {roc_auc:.2f})')

# Chance-level diagonal for reference
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Show a genuine and a forged signature of the same, randomly chosen writer.

# Map each writer id to the index of its first image. Only positions that
# exist in BOTH the id list and the image list are considered, so a looked-up
# index can never point past the end of the image list.
first_genuine_idx = {}
for idx in range(min(len(genuine_writer_ids), len(genuine_images))):
    first_genuine_idx.setdefault(genuine_writer_ids[idx], idx)

first_forgery_idx = {}
for idx in range(min(len(forgery_writer_ids), len(forgery_images))):
    first_forgery_idx.setdefault(forgery_writer_ids[idx], idx)

# Only writers that have both a genuine and a forged sample can be paired.
paired_writer_ids = sorted(set(first_genuine_idx) & set(first_forgery_idx))
if not paired_writer_ids:
    raise ValueError('No writer has both a genuine and a forged signature loaded')

random_writer_id = random.choice(paired_writer_ids)
genuine_idx = first_genuine_idx[random_writer_id]
forgery_idx = first_forgery_idx[random_writer_id]

# Display the genuine and forged signature of the same writer side by side
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.imshow(genuine_images[genuine_idx], cmap='gray')
plt.title('Original Signature')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(forgery_images[forgery_idx], cmap='gray')
plt.title('Forged Signature')
plt.axis('off')

plt.show()