In [None]:
!pip install optuna opencv-python

In [None]:
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu12==25.4.*"

In [None]:
!sudo apt-get update 
!sudo apt-get install -y libgl1
!sudo apt-get install -y poppler-utils

In [None]:
%load_ext cudf.pandas

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import optuna
from optuna.trial import TrialState
from skimage.metrics import structural_similarity as ssim
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import cv2
import pandas as pd
from datetime import datetime
import json
import pickle
import hashlib
from pathlib import Path
import mlflow
import mlflow.pyfunc
from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature

# Select the MLflow experiment that the runs started later in this notebook are recorded under.
mlflow.set_experiment("forgery_detection_experiment")
# Use the GPU when PyTorch reports one is available, otherwise fall back to the CPU.
# `device` is read as a global by the training/evaluation helpers defined further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Input locations. Image file names read from the CSV are joined onto img_dir; the CSV is
# read positionally by the extraction code (column 0: first image, column 1: second
# image, column 2: integer label).
img_dir = '../../datafabric/forgery_detect_dataset/full'
csv_file = '../../datafabric/forgery_detect_dataset/data.csv'

In [None]:
# Optional cap on the number of CSV rows to process (sampled with random_state=42 by the
# extraction code); None means use every row.
subset_size = None

In [None]:
class OptimizedSignatureNet(nn.Module):
    """Fully connected binary classifier over signature-pair feature vectors.

    The network is a stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks
    followed by a 2-unit output layer; ``forward`` returns log-probabilities.

    It can be built in two ways:

    * **Optuna mode** (``trial`` given): ``num_layers``, ``first_layer_size``,
      ``layer_{i}_reduction`` and ``dropout_rate`` are requested from the trial
      via ``suggest_int`` / ``suggest_float``; the corresponding keyword
      arguments are ignored.
    * **Direct mode** (``trial`` is None): ``first_layer_size``,
      ``layer_reductions`` and ``dropout_rate`` must be supplied. The depth is
      ``1 + len(layer_reductions)``; ``num_layers`` is accepted for interface
      compatibility but not used.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``, or None.
        input_size: Number of input features (required in both modes).
        num_layers: Unused in direct mode; sampled from ``trial`` otherwise.
        first_layer_size: Width of the first hidden layer (direct mode).
        layer_reductions: Iterable of multiplicative factors; each one shrinks
            the running width to produce the next hidden layer (direct mode).
        dropout_rate: Dropout probability applied after every hidden block.

    Raises:
        ValueError: If ``input_size`` is missing, or if a direct-mode argument
            is missing when ``trial`` is None.
    """

    def __init__(self, trial=None, input_size=None, num_layers=None, first_layer_size=None,
                 layer_reductions=None, dropout_rate=None):
        super().__init__()

        if input_size is None:
            raise ValueError("input_size is required to build OptimizedSignatureNet")

        if trial is not None:
            # Keep the suggest_* call order stable: depth, first width,
            # per-layer reductions, then dropout.
            num_layers = trial.suggest_int("num_layers", 4, 5)
            first_layer_size = trial.suggest_int("first_layer_size", 512, 768, step=128)
            layer_reductions = [
                trial.suggest_float(f"layer_{i}_reduction", 0.7, 0.85)
                for i in range(1, num_layers)
            ]
            dropout_rate = trial.suggest_float("dropout_rate", 0.3, 0.5)
        else:
            missing = [
                name
                for name, value in (
                    ("first_layer_size", first_layer_size),
                    ("layer_reductions", layer_reductions),
                    ("dropout_rate", dropout_rate),
                )
                if value is None
            ]
            if missing:
                raise ValueError(
                    "When trial is None the following arguments are required: "
                    + ", ".join(missing)
                )

        # Derive hidden widths. The running width keeps shrinking unclamped;
        # only the width actually used for a layer is floored at 64.
        layer_sizes = [first_layer_size]
        current_size = first_layer_size
        for reduction in layer_reductions:
            current_size = int(current_size * reduction)
            layer_sizes.append(max(current_size, 64))

        self.dropout_rate = dropout_rate

        # Hidden layers: input_size -> sizes[0] -> sizes[1] -> ...
        in_features = [input_size] + layer_sizes[:-1]
        self.layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(in_features, layer_sizes)
        )

        # Output layer
        self.output = nn.Linear(layer_sizes[-1], 2)  # Binary classification

        # Batch normalization (one per hidden layer)
        self.batch_norms = nn.ModuleList([nn.BatchNorm1d(size) for size in layer_sizes])
        # Dropout
        self.dropout = nn.Dropout(self.dropout_rate)

        # Initialize weights
        for layer in self.layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)
        nn.init.xavier_uniform_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

    def forward(self, x):
        """Return per-class log-probabilities (``log_softmax`` over dim 1) for ``x``."""
        for layer, batch_norm in zip(self.layers, self.batch_norms):
            x = layer(x)
            x = batch_norm(x)
            x = F.relu(x)
            x = self.dropout(x)
        x = self.output(x)
        return F.log_softmax(x, dim=1)

In [None]:
class FeatureCache:
    """On-disk cache of feature vectors, keyed by a pair of image files.

    Each entry is a pickle file named after the cache key, plus a record in a
    JSON index (``cache_index.json``) stored in the same directory. The key is
    derived from each image's path, size and modification time, so editing or
    replacing an image produces a different key.

    Args:
        cache_dir: Directory holding the pickles and the index. It is created
            (including missing parents) if it does not exist.
    """

    def __init__(self, cache_dir="feature_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_index_file = self.cache_dir / "cache_index.json"
        self.load_cache_index()

    def load_cache_index(self):
        """Load the cache index, starting empty if it is missing or unreadable."""
        self.cache_index = {}
        if not self.cache_index_file.exists():
            return
        try:
            with open(self.cache_index_file, 'r') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError. A damaged index only costs
            # re-extraction, so fall back to an empty cache instead of failing.
            print(f"Ignoring unreadable cache index {self.cache_index_file}: {e}")
            return
        if isinstance(loaded, dict):
            self.cache_index = loaded

    def save_cache_index(self):
        """Save the cache index"""
        with open(self.cache_index_file, 'w') as f:
            json.dump(self.cache_index, f)

    def get_file_hash(self, filepath):
        """Return an MD5 hex digest of the file's path, size and mtime.

        Raises:
            OSError: If ``filepath`` cannot be stat'ed (e.g. it does not exist).
        """
        stat = os.stat(filepath)
        file_info = f"{filepath}_{stat.st_size}_{stat.st_mtime}"
        return hashlib.md5(file_info.encode()).hexdigest()

    def get_cache_key(self, img1_path, img2_path):
        """Generate the (order-sensitive) cache key for a pair of images."""
        hash1 = self.get_file_hash(img1_path)
        hash2 = self.get_file_hash(img2_path)
        return f"{hash1}_{hash2}"

    def get_cached_features(self, img1_path, img2_path):
        """Return the cached features for the pair, or None when unavailable.

        An index entry whose pickle is missing or cannot be loaded is removed
        from the index so that it is not retried on every lookup.
        """
        cache_key = self.get_cache_key(img1_path, img2_path)
        if cache_key not in self.cache_index:
            return None

        cache_file = self.cache_dir / f"{cache_key}.pkl"
        if cache_file.exists():
            try:
                with open(cache_file, 'rb') as f:
                    # NOTE: pickle.load can execute arbitrary code; only point
                    # this cache at a directory this notebook itself wrote.
                    return pickle.load(f)
            except Exception as e:
                print(f"Discarding corrupted cache entry {cache_file}: {e}")

        # Stale or corrupted entry: drop it from the index.
        del self.cache_index[cache_key]
        self.save_cache_index()
        return None

    def cache_features(self, img1_path, img2_path, features):
        """Store ``features`` for the pair; return True on success, False on error."""
        cache_key = self.get_cache_key(img1_path, img2_path)
        cache_file = self.cache_dir / f"{cache_key}.pkl"

        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(features, f)

            # str() keeps the index JSON-serialisable when Path objects are passed.
            self.cache_index[cache_key] = {
                'img1': str(img1_path),
                'img2': str(img2_path),
                'cache_file': str(cache_file)
            }
            self.save_cache_index()
            return True
        except Exception as e:
            print(f"Error caching features: {e}")
            return False

In [None]:
def load_saved_features(file_path="features_dataset.npz"):
    """Load pre-extracted features from an ``.npz`` archive.

    The archive must contain the arrays ``features`` and ``labels``;
    ``image_pairs`` is optional.

    Args:
        file_path: Path of the archive to read.

    Returns:
        ``(X, y, image_pairs)`` on success, where ``image_pairs`` is None when
        the archive does not contain (or cannot decode) it. ``(None, None, None)``
        when the file is missing, a required array is absent, the dataset is
        empty, or any error occurs while reading.
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Features file not found at {file_path}")
            return None, None, None

        # The context manager closes the underlying zip file; the arrays are
        # materialised when indexed, so they stay valid afterwards.
        with np.load(file_path) as data:
            available = data.files

            # Check if required keys exist
            for key in ('features', 'labels'):
                if key not in available:
                    print(f"Required key '{key}' not found in {file_path}")
                    return None, None, None

            X = data['features']
            y = data['labels']

            # image_pairs is optional; a failure to decode it must not discard
            # the features themselves.
            image_pairs = None
            if 'image_pairs' in available:
                try:
                    image_pairs = data['image_pairs']
                except ValueError as e:
                    print(f"Could not load image_pairs from {file_path}: {e}")

        n_total = len(y)
        if n_total == 0:
            print(f"No feature vectors found in {file_path}")
            return None, None, None

        n_genuine = y.sum()
        n_forged = n_total - n_genuine

        print(f"Loaded {len(X)} feature vectors from {file_path}")
        print(f"Feature vector size: {X.shape[1]}")
        print(f"Genuine signatures: {n_genuine} ({n_genuine/n_total*100:.1f}%)")
        print(f"Forged signatures: {n_forged} ({n_forged/n_total*100:.1f}%)")

        return X, y, image_pairs
    except Exception as e:
        print(f"Error loading features: {e}")
        return None, None, None

In [None]:
# Debug: Check what we have
# X and y are first assigned by later cells, so they are looked up through globals()
# here: a bare reference would raise NameError on a fresh kernel (Restart & Run All).
print("Current state:")
print(f"X is None: {globals().get('X') is None}")
print(f"y is None: {globals().get('y') is None}")

if globals().get('X') is not None and globals().get('y') is not None:
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"X type: {type(X)}")
    print(f"y type: {type(y)}")

# Check if file exists
print(f"features_dataset.npz exists: {os.path.exists('features_dataset.npz')}")

# If file exists, try to manually inspect it
if os.path.exists('features_dataset.npz'):
    try:
        # The context manager closes the archive once the keys have been listed.
        with np.load('features_dataset.npz') as data:
            print(f"Keys in file: {list(data.keys())}")
            for key in data.keys():
                print(f"{key} shape: {data[key].shape}")
    except Exception as e:
        print(f"Error reading file: {e}")

In [None]:
# Load features from features_dataset.npz when possible; otherwise extract them from
# the CSV/image directory and save them. Afterwards X and y hold the feature matrix and
# labels, or an exception has been raised.
# Initialize variables
X = None
y = None

# First check if features_dataset.npz exists
if os.path.exists("features_dataset.npz"):
    print("Found existing features file, loading...")
    X, y, _ = load_saved_features("features_dataset.npz")
    
# load_saved_features returns None for X/y on any failure, so this also covers a
# features file that exists but could not be read.
if X is None or y is None:
    print("No valid features loaded. You need to extract features first.")
    print("This may take a while...")
    
    # Check if the required input files exist
    if not os.path.exists(csv_file):
        print(f"ERROR: CSV file not found at {csv_file}")
        raise FileNotFoundError(f"CSV file not found at {csv_file}")
    
    if not os.path.exists(img_dir):
        print(f"ERROR: Image directory not found at {img_dir}")
        raise FileNotFoundError(f"Image directory not found at {img_dir}")
    
    # Extract features
    # NOTE(review): extract_and_save_all_features is defined in a later cell of this
    # notebook, so on a fresh top-to-bottom run this call raises NameError unless that
    # cell has been executed first.
    X, y, _ = extract_and_save_all_features(csv_file, img_dir, "features_dataset.npz", subset_size)
    
    if X is None or y is None:
        print("ERROR: Feature extraction failed.")
        raise ValueError("Feature extraction failed")
else:
    print("Successfully loaded features from file!")

    

In [None]:
# Define the working feature extraction function
def extract_hybrid_signature_features(img1_path, img2_path, size=(128, 64)):
    """
    Extract a hybrid set of features - proven features + selected advanced ones

    Both images are read as grayscale and bilateral-filtered, then compared. The
    returned vector has 31 entries, in this order:

    * 15 multi-scale similarity values: for each of three scales, one SSIM score
      followed by histogram correlation and Bhattacharyya distance at 32 and 64 bins
    * 3 slant values: slant of image 1, slant of image 2, absolute difference
    * 5 bounding-box values: aspect ratio of each image, absolute aspect-ratio
      difference, absolute width difference, absolute height difference
    * 3 stroke-thickness ("pressure") values: std-dev per image and absolute difference
    * 5 ORB matching values: match count, mean match distance, keypoint count of each
      image, match ratio

    Args:
        img1_path: Path of the first image.
        img2_path: Path of the second image.
        size: Not referenced in the body; the scales are fixed by the local ``scales`` list.

    Returns:
        A float64 NumPy array with NaN/inf values replaced (NaN -> 0, +inf -> 100,
        -inf -> -100), or None if either image cannot be read or any step raises.
    """
    try:
        # Load and preprocess images
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)
        
        # cv2.imread signals an unreadable file by returning None rather than raising
        if img1 is None or img2 is None:
            return None
        
        # Enhanced preprocessing
        img1 = cv2.bilateralFilter(img1, 9, 75, 75)  # Noise reduction
        img2 = cv2.bilateralFilter(img2, 9, 75, 75)
        
        # Multiple scale analysis
        scales = [(128, 64), (256, 128), (64, 32)]
        features = []
        
        for scale in scales:
            # Both images are resized to the same shape so SSIM can compare them directly
            img1_scaled = cv2.resize(img1, scale)
            img2_scaled = cv2.resize(img2, scale)
            
            # SSIM at different scales
            ssim_val = ssim(img1_scaled, img2_scaled)
            features.append(ssim_val)
            
            # Histogram comparison at multiple scales
            for bins in [32, 64]:
                hist1 = cv2.calcHist([img1_scaled], [0], None, [bins], [0, 256])
                hist2 = cv2.calcHist([img2_scaled], [0], None, [bins], [0, 256])
                # Min-max normalise each histogram in place to the range [0, 1]
                cv2.normalize(hist1, hist1, 0, 1, cv2.NORM_MINMAX)
                cv2.normalize(hist2, hist2, 0, 1, cv2.NORM_MINMAX)
                
                features.append(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))
                features.append(cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA))
        
        # Writer-specific features
        # 1. Slant analysis
        def calculate_slant(img):
            # Mean of the second component of the first 10 Hough lines found on the
            # Canny edge map (presumably theta from OpenCV's (rho, theta) pairs -
            # confirm against the HoughLines docs); 0 when no line is detected.
            edges = cv2.Canny(img, 50, 150)
            lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=50)
            angles = []
            if lines is not None:
                for line in lines[:10]:  # Top 10 lines
                    angle = line[0][1]
                    angles.append(angle)
            return np.mean(angles) if angles else 0
        
        # Slant and the features below use the filtered full-resolution images,
        # not the rescaled copies from the loop above
        slant1 = calculate_slant(img1)
        slant2 = calculate_slant(img2)
        features.extend([slant1, slant2, abs(slant1 - slant2)])
        
        # 2. Aspect ratio analysis
        def get_bounding_box_features(img):
            # Returns (aspect ratio, width, height) of the bounding box of the
            # largest external contour, or (1.0, 0, 0) when no contour is found.
            # NOTE(review): the image passed in is the filtered grayscale image, not
            # a thresholded one - confirm this is the intended input for findContours.
            contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            if contours:
                x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))
                return w/h, w, h
            return 1.0, 0, 0
        
        aspect1, w1, h1 = get_bounding_box_features(img1)
        aspect2, w2, h2 = get_bounding_box_features(img2)
        features.extend([aspect1, aspect2, abs(aspect1 - aspect2), abs(w1-w2), abs(h1-h2)])
        
        # 3. Pressure variation (simulated from thickness)
        def estimate_pressure_variation(img):
            # Simulate pressure from line thickness
            # Pixels at or below 127 become foreground (inverse binary threshold)
            _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
            if not binary.any():
                return 0.0
            # Std-dev of the non-zero distance-transform values of the foreground
            distances = cv2.distanceTransform(binary, cv2.DIST_L2, 5)
            valid_distances = distances[distances > 0]
            return np.std(valid_distances) if len(valid_distances) > 1 else 0.0
        
        pressure_var1 = estimate_pressure_variation(img1)
        pressure_var2 = estimate_pressure_variation(img2)
        features.extend([pressure_var1, pressure_var2, abs(pressure_var1 - pressure_var2)])
        
        # Original proven features
        orb = cv2.ORB_create()
        kp1, des1 = orb.detectAndCompute(img1, None)
        kp2, des2 = orb.detectAndCompute(img2, None)
        
        if des1 is not None and des2 is not None and len(kp1) > 0 and len(kp2) > 0:
            # Brute-force Hamming matcher with cross-checking enabled
            bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
            matches = bf.match(des1, des2)
            matches = sorted(matches, key=lambda x: x.distance)
            
            num_matches = len(matches)
            # 100 is the placeholder distance used when there are no matches
            avg_distance = sum(m.distance for m in matches) / num_matches if num_matches > 0 else 100
            match_ratio = num_matches / min(len(kp1), len(kp2)) if min(len(kp1), len(kp2)) > 0 else 0
            
            features.extend([num_matches, avg_distance, len(kp1), len(kp2), match_ratio])
        else:
            # Same five slots with placeholder values when ORB finds nothing to match
            features.extend([0, 100, 0, 0, 0])
        
        features = np.array(features, dtype=np.float64)
        features = np.nan_to_num(features, nan=0.0, posinf=100.0, neginf=-100.0)
        
        return features
        
    except Exception as e:
        # Any failure is reported and turned into None so callers can skip the pair
        print(f"Error extracting features: {e}")
        return None

# Now run the data processing with this definition

In [None]:
def extract_hybrid_signature_features_cached(img1_path, img2_path, cache_manager=None, size=(128, 64)):
    """
    Cache-aware wrapper around extract_hybrid_signature_features.

    When a (truthy) cache_manager is supplied, a stored result for the pair is
    returned directly and a message is printed. Otherwise the features are
    computed, and a non-None result is written back to the cache before being
    returned. Returns None when extraction fails.
    """
    cached_features = (
        cache_manager.get_cached_features(img1_path, img2_path) if cache_manager else None
    )
    if cached_features is not None:
        # Cache hit: skip extraction entirely
        print(f"Using cached features for {os.path.basename(img1_path)} and {os.path.basename(img2_path)}")
        return cached_features

    # Cache miss (or no cache): compute from the images
    computed = extract_hybrid_signature_features(img1_path, img2_path, size)

    # Failed extractions (None) are never stored
    if cache_manager and computed is not None:
        cache_manager.cache_features(img1_path, img2_path, computed)

    return computed

In [None]:
def extract_and_save_all_features(csv_file, img_dir, output_file="features_dataset.npz", subset_size=None):
    """
    Extract all features once and save them to disk

    The CSV is read with its first line as header and is accessed positionally:
    column 0 and column 1 are image file names relative to ``img_dir`` and
    column 2 is the integer label. Pairs whose features cannot be extracted are
    skipped.

    Args:
        csv_file: Path of the CSV listing the image pairs.
        img_dir: Directory the image file names are resolved against.
        output_file: Destination ``.npz`` archive (arrays ``features``,
            ``labels`` and ``image_pairs``).
        subset_size: If set and smaller than the CSV, a random sample of that
            many rows (``random_state=42``) is used instead of every row.

    Returns:
        ``(X, y, image_pairs)``, or ``(None, None, None)`` when no pair yielded
        features (nothing is written in that case).
    """
    print(f"Extracting features for entire dataset from {csv_file}...")

    # Load CSV file
    data = pd.read_csv(csv_file, header=0)

    if subset_size and subset_size < len(data):
        data = data.sample(subset_size, random_state=42)
        print(f"Using subset of {len(data)} rows")

    total = len(data)
    all_features = []
    all_labels = []
    image_pairs = []

    # Initialize cache manager
    cache_manager = FeatureCache()

    # Count rows by position: after .sample() the index labels are shuffled, so
    # they cannot be used as a progress counter.
    for position, (_, row) in enumerate(data.iterrows()):
        if position % 50 == 0:
            print(f"Processing pair {position}/{total}...")

        # .iloc is explicit positional access; plain row[0] on a labelled Series
        # relies on a deprecated integer-key fallback.
        name1 = str(row.iloc[0]).strip()
        name2 = str(row.iloc[1]).strip()
        label = int(row.iloc[2])

        # Extract features with caching
        features = extract_hybrid_signature_features_cached(
            os.path.join(img_dir, name1), os.path.join(img_dir, name2), cache_manager
        )
        if features is None:
            continue

        all_features.append(features)
        all_labels.append(label)
        image_pairs.append((name1, name2))

        # Reported only when the count actually advances
        if len(all_features) % 100 == 0:
            print(f"Processed {len(all_features)} valid pairs")

    if not all_features:
        # Return None rather than empty arrays: that is what the caller checks for.
        print("No valid pairs were processed; nothing saved.")
        return None, None, None

    # Convert to numpy arrays
    X = np.array(all_features)
    y = np.array(all_labels)

    # Clean data
    if np.isnan(X).any() or np.isinf(X).any():
        print("WARNING: Dataset contains NaN or infinite values. Cleaning...")
        X = np.nan_to_num(X, nan=0.0, posinf=100.0, neginf=-100.0)

    # Save everything
    np.savez_compressed(output_file,
                        features=X,
                        labels=y,
                        image_pairs=image_pairs)

    print(f"Saved {len(X)} feature vectors to {output_file}")
    return X, y, image_pairs


In [None]:
def extract_single_pair_features(img1_path, img2_path):
    """Return the (cached) feature vector for one image pair, or None on any error.

    A single FeatureCache is created on first use and kept as an attribute of
    this function, so every call shares the same cache manager.
    """
    try:
        try:
            manager = extract_single_pair_features.cache_manager
        except AttributeError:
            # First call: create the shared cache manager and remember it
            manager = extract_single_pair_features.cache_manager = FeatureCache()

        # Extraction goes through the caching wrapper
        return extract_hybrid_signature_features_cached(img1_path, img2_path, manager)
    except Exception as e:
        print(f"Error processing pair {img1_path}, {img2_path}: {e}")
        return None

In [None]:
# Cell 6
# Extract features for every CSV pair whose images exist, then save them.
# Note: this cell reassigns X and y and rewrites features_dataset.npz.
try:
    data = pd.read_csv(csv_file, header=0)
    print(f"Loaded CSV with {len(data)} rows")
    print(f"First few rows of CSV:\n{data.head()}")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    # Nothing below can work without the CSV. Continuing would either hit a
    # NameError or silently process a stale `data` left over from another cell.
    raise

pairs = []
labels = []
processed = 0
errors = 0

# If subset size is specified, take a random subset
if subset_size and subset_size < len(data):
    data = data.sample(subset_size, random_state=42)
    print(f"Using subset of {len(data)} rows")

print(f"Starting to process {len(data)} pairs...")

# pair_number counts rows by position; idx is the DataFrame label, which is
# shuffled after .sample() and therefore unsuitable as a progress counter.
for pair_number, (idx, row) in enumerate(data.iterrows()):
    if pair_number % 50 == 0:
        print(f"Processing pair {pair_number}/{len(data)}...")

    try:
        # Get image paths and label - using integer indexing to be safe
        img1_path = os.path.join(img_dir, str(row.iloc[0]).strip())
        img2_path = os.path.join(img_dir, str(row.iloc[1]).strip())
        label = int(row.iloc[2])

        # Check if files exist (only the first few problems are printed)
        if not os.path.exists(img1_path):
            if errors < 5:
                print(f"Warning: Image not found at {img1_path}")
            errors += 1
            continue

        if not os.path.exists(img2_path):
            if errors < 5:
                print(f"Warning: Image not found at {img2_path}")
            errors += 1
            continue

        # Extract features for this pair
        pair_features = extract_single_pair_features(img1_path, img2_path)
        if pair_features is not None:
            pairs.append(pair_features)
            labels.append(label)
            processed += 1

    except Exception as e:
        if errors < 5:
            print(f"Error processing pair {idx}: {e}")
        errors += 1

# Process results after the loop completes
if processed == 0:
    print("No valid pairs were processed!")
    X, y = None, None
else:
    # Convert to numpy arrays
    X = np.array(pairs)
    y = np.array(labels)

    # Clean data
    if np.isnan(X).any() or np.isinf(X).any():
        print("WARNING: Dataset contains NaN or infinite values. Cleaning...")
        X = np.nan_to_num(X, nan=0.0, posinf=100.0, neginf=-100.0)

    print(f"Successfully processed {processed} pairs out of {len(data)} total")
    print(f"Feature shape: {X.shape}")
    print(f"Errors encountered: {errors}")

    # Save features
    np.savez_compressed("features_dataset.npz",
                        features=X,
                        labels=y)

    print(f"Saved {len(X)} feature vectors to features_dataset.npz")

In [None]:
# Cell 7
# Hold out 25% of the samples for evaluation.
# NOTE(review): X and y must be arrays here; the previous cell sets them to None when
# no pair could be processed, in which case len(X) below raises TypeError.
test_size = 0.25

# Seed NumPy's global RNG so the permutation (and therefore the split) is repeatable
np.random.seed(42)
indices = np.random.permutation(len(X))
test_size_count = int(len(X) * test_size)
# The first test_size_count shuffled indices form the test set, the rest the training set
test_indices = indices[:test_size_count]
train_indices = indices[test_size_count:]

X_train = X[train_indices]
X_test = X[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

# Imputer and scaler are fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the preprocessing.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")
print(f"Genuine signatures: {sum(y_train)} / {len(y_train)} in training, {sum(y_test)} / {len(y_test)} in test")

In [None]:
# Cell 8
def test(network, X_test, y_test):
    """Tests the model using signature verification features."""
    network.eval()
    
    # Convert to PyTorch tensors if they aren't already
    if not isinstance(X_test, torch.Tensor):
        X_test = torch.tensor(X_test, dtype=torch.float32)
    if not isinstance(y_test, torch.Tensor):
        y_test = torch.tensor(y_test, dtype=torch.long)
    
    # Move to device
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    
    correct = 0
    total = 0
    
    with torch.no_grad():
        batch_size = 1000
        for i in range(0, len(X_test), batch_size):
            batch_X = X_test[i:i+batch_size]
            batch_y = y_test[i:i+batch_size]
            
            output = network(batch_X)
            pred = output.argmax(dim=1)
            correct += (pred == batch_y).sum().item()
            total += batch_y.size(0)
    
    accuracy = correct / total
    return accuracy

In [None]:
class SignatureDataAugmentor:
    """Feature-space augmentation: additive Gaussian noise and random feature zeroing.

    Randomness comes from NumPy's global RNG. ``flip_prob`` is stored on the
    instance but not used by any method of this class.
    """

    def __init__(self, noise_factor=0.01, flip_prob=0.1):
        self.noise_factor, self.flip_prob = noise_factor, flip_prob

    def add_noise(self, X):
        """Return X plus zero-mean Gaussian noise with std-dev ``noise_factor``."""
        return X + np.random.normal(0, self.noise_factor, X.shape)

    def feature_dropout(self, X, dropout_rate=0.1):
        """Return X with each entry independently zeroed with probability ``dropout_rate``."""
        keep = np.random.binomial(1, 1-dropout_rate, X.shape)
        return X * keep

    def augment_batch(self, X, y, augment_ratio=2):
        """Return the originals followed by ``augment_ratio`` rounds of augmented copies.

        Each round appends one noisy copy and then one dropout copy of X, each
        paired with the unchanged labels, so the output holds
        ``1 + 2 * augment_ratio`` times the input rows.
        """
        feature_parts, label_parts = [X], [y]

        for _ in range(augment_ratio):
            # Noise copy is drawn before the dropout copy in every round
            feature_parts += [self.add_noise(X), self.feature_dropout(X)]
            label_parts += [y, y]

        return np.vstack(feature_parts), np.hstack(label_parts)

In [None]:
def train(network, optimizer, X_train, y_train, augmentor=None, epochs=20):
    """Run mini-batch training with NLL loss for a fixed number of epochs.

    Parameters:
        - network (torch.nn.Module): The neural network; expected to output
          log-probabilities, since the loss is F.nll_loss
        - optimizer (torch.optim): The optimizer for the network
        - X_train: Training features, sliceable and accepted by torch.tensor
        - y_train: Training labels, sliceable and accepted by torch.tensor
        - augmentor: Optional object with augment_batch(X, y, augment_ratio=...);
          when truthy, the data is re-augmented at the start of every epoch
        - epochs (int): Number of passes over the (augmented) data
    """
    network.train()
    batch_size = 64

    for _ in range(epochs):
        # Fresh augmentation every epoch, or the raw data when no augmentor is given
        if augmentor:
            epoch_X, epoch_y = augmentor.augment_batch(X_train, y_train, augment_ratio=1)
        else:
            epoch_X, epoch_y = X_train, y_train

        # Sequential (unshuffled) mini-batches of 64 samples
        for start in range(0, len(epoch_X), batch_size):
            stop = start + batch_size
            features = torch.tensor(epoch_X[start:stop], dtype=torch.float32).to(device)
            targets = torch.tensor(epoch_y[start:stop], dtype=torch.long).to(device)

            optimizer.zero_grad()
            loss = F.nll_loss(network(features), targets)
            loss.backward()
            optimizer.step()

In [None]:
# Cell 9
def objective(trial, X_train, y_train, X_test, y_test):
    """Optuna objective: train one sampled network and return its best accuracy.

    A nested MLflow run records the sampled hyperparameters, the per-epoch
    accuracy and the best accuracy. Training uses class-weighted NLL loss with
    RMSprop for up to 20 epochs and stops early after 5 epochs without
    improvement.

    Args:
        trial: Optuna trial used to sample the architecture, ``lr`` and
            ``weight_decay``.
        X_train, y_train: Training features and integer labels (NumPy arrays).
        X_test, y_test: Samples the accuracy is measured on after every epoch.

    Returns:
        float: Highest accuracy observed on ``X_test`` / ``y_test``.

    Raises:
        optuna.exceptions.TrialPruned: When the study's pruner stops the trial.
    """

    with mlflow.start_run(nested=True):
        # 'balanced' weights counteract unequal class frequencies in the loss
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        class_weights = torch.FloatTensor(class_weights).to(device)

        input_size = X_train.shape[1]
        model = OptimizedSignatureNet(trial, input_size).to(device)

        # RMSprop works best based on results
        optimizer = optim.RMSprop(
            model.parameters(),
            lr=trial.suggest_float("lr", 1e-5, 1e-3, log=True),
            weight_decay=trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
        )

        # Log trial parameters. trial.params only contains values that have
        # already been suggested, so this must come after the model and the
        # optimizer have made their suggest_* calls; logging it at the top of
        # the run would record an empty dict.
        mlflow.log_params(trial.params)

        # Training loop with early stopping
        best_accuracy = 0
        patience = 5
        patience_counter = 0

        for epoch in range(20):  # Increase epochs slightly
            # test() leaves the model in eval mode, so re-enable training each epoch
            model.train()
            for i in range(0, len(X_train), 64):
                batch_X = torch.tensor(X_train[i:i+64], dtype=torch.float32).to(device)
                batch_y = torch.tensor(y_train[i:i+64], dtype=torch.long).to(device)

                optimizer.zero_grad()
                output = model(batch_X)
                loss = F.nll_loss(output, batch_y, weight=class_weights)
                loss.backward()
                optimizer.step()

            accuracy = test(model, X_test, y_test)

            # Log epoch metrics
            mlflow.log_metric("accuracy", accuracy, step=epoch)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                break

            # Let the study's pruner see the intermediate result
            trial.report(accuracy, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        # Log final metrics
        mlflow.log_metric("best_accuracy", best_accuracy)

        return best_accuracy

In [None]:
# Parent MLflow run for the whole hyperparameter search; each trial opens its own
# nested run inside objective().
with mlflow.start_run(run_name=f"signature_verification_optimization_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    # Log dataset information
    mlflow.log_params({
        "dataset_size": len(X),
        "train_size": len(X_train),
        "test_size": len(X_test),
        "feature_size": X.shape[1],
        "genuine_train": sum(y_train),
        "forgery_train": len(y_train) - sum(y_train),
        "genuine_test": sum(y_test),
        "forgery_test": len(y_test) - sum(y_test)
    })
    
    # NOTE(review): the objective scores every trial on X_test_scaled / y_test, i.e. the
    # same held-out split that is later used to report accuracy - confirm this is intended.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, X_train_scaled, y_train, X_test_scaled, y_test),
        n_trials=150  # Increase for better results
    )
    
    best_trial = study.best_trial
    print(f"Best single model accuracy: {best_trial.value}")
    
    # Log best parameters
    # The "best_" prefix keeps these keys distinct from the dataset parameters logged above
    mlflow.log_params({f"best_{k}": v for k, v in best_trial.params.items()})
    mlflow.log_metric("best_single_model_accuracy", best_trial.value)

In [None]:
# Re-print the best trial's objective value from the `study` created by the optimization cell.
print(f"Best single model accuracy: {study.best_trial.value}")

In [None]:
class EnsembleSignatureVerifier:
    """Weighted soft-voting ensemble over already-trained PyTorch classifiers."""

    def __init__(self, models, weights=None):
        """
        Store the models, move them to the available device and put them in eval mode.

        Args:
            models: List of trained models
            weights: Optional per-model weights; defaults to equal weights
                (1 / number of models). Must have one entry per model.
        """
        self.models = models
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Inference only: every member lives on the same device, in eval mode
        for member in self.models:
            member.to(self.device)
            member.eval()

        if weights is not None:
            assert len(weights) == len(models), "Number of weights must match number of models"
            self.weights = weights
        else:
            self.weights = [1.0 / len(models)] * len(models)

    def predict_proba(self, X):
        """Return the weighted sum of the members' class probabilities as a NumPy array."""
        inputs = X if isinstance(X, torch.Tensor) else torch.tensor(X, dtype=torch.float32)
        inputs = inputs.to(self.device)

        with torch.no_grad():
            # softmax turns each member's scores into probabilities; applied to
            # log-probabilities it yields those same probabilities back
            weighted = [
                torch.softmax(member(inputs), dim=1).cpu().numpy() * weight
                for member, weight in zip(self.models, self.weights)
            ]

        return np.sum(weighted, axis=0)

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_with_confidence(self, X, threshold=0.85):
        """Return (predictions, confidence); predictions below ``threshold`` become -1.

        ``confidence`` is the highest ensemble probability of each row, and -1
        marks a row as uncertain.
        """
        probas = self.predict_proba(X)
        confidence = np.max(probas, axis=1)
        labels = np.argmax(probas, axis=1)

        # Replace low-confidence decisions with the "uncertain" marker
        predictions = np.where(confidence < threshold, -1, labels)

        return predictions, confidence

In [None]:
def train_model_simple(model, optimizer, X_train, y_train, epochs=10):
    """Train ``model`` with NLL loss on sequential mini-batches of 64 samples.

    The model and the full training set are moved to the notebook-level
    ``device`` once, before the first epoch; non-tensor inputs are converted to
    float32 features and int64 labels first.
    """
    model.train()
    model.to(device)

    # Tensors are used as given; anything else is converted
    features = X_train if isinstance(X_train, torch.Tensor) else torch.tensor(X_train, dtype=torch.float32)
    targets = y_train if isinstance(y_train, torch.Tensor) else torch.tensor(y_train, dtype=torch.long)

    features = features.to(device)
    targets = targets.to(device)

    batch_size = 64
    n_samples = len(features)
    for _ in range(epochs):
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size

            optimizer.zero_grad()
            log_probs = model(features[start:stop])
            loss = torch.nn.functional.nll_loss(log_probs, targets[start:stop])
            loss.backward()
            optimizer.step()

In [None]:
def create_ensemble_from_study(study, X_train, y_train, top_n=5):
    """Train one model per top-scoring trial and bundle them into an ensemble.

    Only trials that finished (``TrialState.COMPLETE``) and carry a value are
    considered; failed or pruned trials may have incomplete ``params`` and must
    not be ranked as if they had scored 0. Trials are ranked by descending
    value, i.e. the study objective is assumed to be maximised.

    Args:
        study: Optuna study whose trials hold the hyper-parameters, including
            ``lr`` and ``weight_decay`` for the optimizer.
        X_train: 2-D training feature matrix; ``X_train.shape[1]`` is the model input size.
        y_train: Training labels.
        top_n: Maximum number of trials to turn into ensemble members.

    Returns:
        ``EnsembleSignatureVerifier`` wrapping the trained models, best trial first.

    Raises:
        ValueError: If the study has no completed trial with a value.
    """

    class TrialMock:
        """Replays stored trial parameters through the ``suggest_*`` API the model expects."""

        def __init__(self, params):
            self.params = params

        def suggest_int(self, name, *args, **kwargs):
            return self.params[name]

        def suggest_float(self, name, *args, **kwargs):
            return self.params[name]

    completed_trials = [
        trial for trial in study.trials
        if trial.state == TrialState.COMPLETE and trial.value is not None
    ]
    if not completed_trials:
        raise ValueError("Study has no completed trials to build an ensemble from")

    best_trials = sorted(completed_trials, key=lambda t: t.value, reverse=True)[:top_n]

    models = []
    for trial in best_trials:
        # Rebuild the architecture chosen by this trial
        model = OptimizedSignatureNet(TrialMock(trial.params), X_train.shape[1])
        model.to(device)

        # Optimizer hyper-parameters come from the same trial
        optimizer = optim.RMSprop(
            model.parameters(),
            lr=trial.params['lr'],
            weight_decay=trial.params['weight_decay']
        )

        train_model_simple(model, optimizer, X_train, y_train, epochs=10)
        models.append(model)

    return EnsembleSignatureVerifier(models)

# Train the ensemble from (at most) the five best trials of `study` on the scaled training features.
ensemble_model = create_ensemble_from_study(study, X_train_scaled, y_train, top_n=5)

In [None]:
# Plain accuracy of the ensemble on `X_test_scaled`: fraction of predicted
# class indices that equal `y_test`. `ensemble_acc` is reused by the MLflow cell.
ensemble_predictions = ensemble_model.predict(X_test_scaled)
ensemble_acc = np.mean(ensemble_predictions == y_test)
print(f"Ensemble accuracy: {ensemble_acc}")

In [None]:
class ConfidenceEvaluator:
    """Report the accuracy/coverage trade-off of a classifier at several confidence cut-offs.

    Args:
        model: Object exposing ``predict_proba(X)``, or ``predict_with_confidence(X, threshold)``,
            or a raw ``torch.nn.Module`` returning per-class scores.
        threshold: Additional cut-off to report alongside the fixed levels
            (the default, 0.85, is already one of them).
    """

    # Confidence cut-offs that are always reported.
    CONFIDENCE_LEVELS = (0.5, 0.7, 0.85, 0.9, 0.95)

    def __init__(self, model, threshold=0.85):
        self.model = model
        self.threshold = threshold

    def _predict_with_scores(self, X_test):
        """Return ``(predictions, confidence)`` arrays without any threshold masking."""
        model = self.model
        if hasattr(model, 'predict_proba'):
            probas = np.asarray(model.predict_proba(X_test))
        elif hasattr(model, 'predict_with_confidence'):
            # A threshold of 0.0 asks the model not to mark anything as uncertain.
            predictions, confidence = model.predict_with_confidence(X_test, 0.0)
            return np.asarray(predictions), np.asarray(confidence)
        else:
            # Raw torch module: run without autograd, otherwise .numpy() raises
            # on a tensor that requires grad.
            with torch.no_grad():
                inputs = torch.as_tensor(X_test, dtype=torch.float32).to(device)
                probas = torch.softmax(model(inputs), dim=1).cpu().numpy()
        return np.argmax(probas, axis=1), np.max(probas, axis=1)

    def evaluate_with_confidence(self, X_test, y_test):
        """Evaluate accuracy and coverage at each confidence cut-off.

        For every cut-off, only samples whose top-class probability reaches the
        cut-off are scored. Predictions are deliberately NOT pre-masked with
        ``self.threshold``: doing so would make the accuracy reported for lower
        cut-offs silently equal to the accuracy at ``self.threshold``.

        Args:
            X_test: Features passed to the model.
            y_test: True labels, one per row of ``X_test``.

        Returns:
            dict mapping cut-off -> ``{'accuracy': float, 'coverage': float}``.
            Cut-offs that no sample reaches are omitted.
        """
        predictions, confidence = self._predict_with_scores(X_test)
        y_true = np.asarray(y_test)
        n_samples = len(y_true)

        levels = set(self.CONFIDENCE_LEVELS)
        if self.threshold is not None:
            levels.add(self.threshold)

        results = {}
        for conf_thresh in sorted(levels):
            # -1 marks an abstention reported by the model itself; never score it.
            covered = (confidence >= conf_thresh) & (predictions != -1)
            n_covered = int(np.sum(covered))
            if n_covered == 0:
                continue
            accuracy = float(np.mean(predictions[covered] == y_true[covered]))
            results[conf_thresh] = {'accuracy': accuracy, 'coverage': n_covered / n_samples}

        return results

In [None]:
# Error-analysis helper; this cell must run before the cell that saves the ensemble with torch.save.
def analyze_errors(model, X_test, y_test):
    """Summarise the misclassifications of ``model`` on a labelled test set.

    Args:
        model: Object with ``predict(X)`` and, optionally, ``predict_proba(X)``.
        X_test: Features passed to the model.
        y_test: True labels (class 1 is treated as the positive class).

    Returns:
        dict with ``total_errors``, ``error_rate``, ``false_positives`` and
        ``false_negatives``. When the model exposes ``predict_proba`` it also
        holds ``avg_error_confidence`` (NaN if there are no errors) and
        ``high_conf_errors`` (errors made with confidence above 0.9).
        All values are plain Python numbers.
    """
    predictions = np.asarray(model.predict(X_test))
    probas = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None
    y_true = np.asarray(y_test)
    n_samples = len(y_true)

    # Positions of the misclassified samples
    error_indices = np.flatnonzero(predictions != y_true)
    n_errors = int(error_indices.size)

    error_analysis = {
        'total_errors': n_errors,
        # An empty test set has no errors; avoid dividing by zero.
        'error_rate': n_errors / n_samples if n_samples else 0.0,
        'false_positives': int(np.sum((predictions == 1) & (y_true == 0))),
        'false_negatives': int(np.sum((predictions == 0) & (y_true == 1)))
    }

    if probas is not None:
        if n_errors:
            # Confidence the model had in each of its wrong answers
            error_confidence = np.max(np.asarray(probas)[error_indices], axis=1)
            error_analysis['avg_error_confidence'] = float(np.mean(error_confidence))
            error_analysis['high_conf_errors'] = int(np.sum(error_confidence > 0.9))
        else:
            # The mean of nothing is undefined; report NaN without numpy's
            # "Mean of empty slice" RuntimeWarning.
            error_analysis['avg_error_confidence'] = float('nan')
            error_analysis['high_conf_errors'] = 0

    return error_analysis

In [None]:
# Accuracy/coverage of the ensemble at each confidence cut-off, computed on the
# scaled test features. `confidence_results` is reused by the MLflow cell.
evaluator = ConfidenceEvaluator(ensemble_model)
confidence_results = evaluator.evaluate_with_confidence(X_test_scaled, y_test)

# One line per cut-off for which the evaluator returned a result
print("\nConfidence-based results:")
for threshold, metrics in confidence_results.items():
    print(f"Threshold {threshold}: Accuracy={metrics['accuracy']:.3f}, Coverage={metrics['coverage']:.3f}")

# Error analysis (`errors` is reused by the MLflow cell)
errors = analyze_errors(ensemble_model, X_test_scaled, y_test)
print(f"\nError Analysis:")
print(f"Total errors: {errors['total_errors']}")
print(f"Error rate: {errors['error_rate']:.3f}")
print(f"False positives: {errors['false_positives']}")
print(f"False negatives: {errors['false_negatives']}")

In [None]:
# Checkpoint helper: call this from the training code when saving the ensemble.
def save_complete_ensemble(study, ensemble_model, scaler, imputer, filename='best_signature_ensemble.pth'):
    """Save the ensemble weights, preprocessing objects and architecture info to ``filename``.

    The checkpoint dict contains:
        * ``models``: one state dict per ensemble member
        * ``weights``: the ensemble's per-model weights
        * ``scaler`` / ``imputer``: the fitted preprocessing objects (pickled)
        * ``model_params``: hyper-parameters of the study's best trial
        * ``input_size``: number of input features of the first member
        * ``model_architectures``: per member, its exact ``layer_sizes``
          (input size followed by each layer's output size) and ``dropout_rate``.
          The members may differ in architecture, so the best trial's
          parameters alone do not describe all of them.

    Args:
        study: Optuna study; only ``study.best_trial.params`` is read.
        ensemble_model: Object exposing ``models`` and ``weights``.
        scaler: Fitted scaler to store alongside the models.
        imputer: Fitted imputer to store alongside the models.
        filename: Destination path of the checkpoint.
    """
    models = list(ensemble_model.models)

    # Record every member's own layout so each state dict can be matched to
    # the right architecture on reload.
    model_architectures = []
    for model in models:
        layer_sizes = [model.layers[0].in_features]
        layer_sizes.extend(layer.out_features for layer in model.layers)
        model_architectures.append({
            'layer_sizes': layer_sizes,
            'dropout_rate': getattr(model, 'dropout_rate', None),
        })

    checkpoint = {
        'models': [model.state_dict() for model in models],
        'weights': ensemble_model.weights,
        'scaler': scaler,
        'imputer': imputer,
        'model_params': study.best_trial.params,  # best trial only (kept for existing loaders)
        'input_size': models[0].layers[0].in_features,
        'model_architectures': model_architectures,
    }

    torch.save(checkpoint, filename)
    print(f"Complete ensemble saved to {filename}")

In [None]:
from mlflow.models import infer_signature

with mlflow.start_run(run_name=f"ensemble_evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}", nested=True):
    # One MLflow run that records the ensemble's evaluation metrics, logs the
    # ensemble as a pyfunc model, writes a local checkpoint and registers the model.
    # Relies on `ensemble_acc`, `confidence_results` and `errors` from earlier cells.
    try:
        # Log ensemble performance
        mlflow.log_metric("ensemble_accuracy", ensemble_acc)
        
        # Log confidence-based results (two metrics per confidence threshold)
        for threshold, metrics in confidence_results.items():
            mlflow.log_metric(f"accuracy_at_{threshold}_confidence", metrics['accuracy'])
            mlflow.log_metric(f"coverage_at_{threshold}_confidence", metrics['coverage'])
        
        # Log error analysis
        mlflow.log_metrics({
            "total_errors": errors['total_errors'],
            "error_rate": errors['error_rate'],
            "false_positives": errors['false_positives'],
            "false_negatives": errors['false_negatives']
        })
        
        # Create signature BEFORE moving models to CPU
        # NOTE(review): the signature is inferred from already-scaled features,
        # while EnsembleModelWrapper.predict below applies the imputer and the
        # scaler itself — confirm which representation serving callers should send.
        signature = infer_signature(X_test_scaled, ensemble_model.predict_proba(X_test_scaled))
        
        # Save model ensemble with MLflow - using a simpler approach
        class EnsembleModelWrapper(mlflow.pyfunc.PythonModel):
            """pyfunc wrapper that rebuilds each ensemble member from its state dict
            and returns the weighted sum of the members' class probabilities."""

            def __init__(self, models_state_dicts, layer_sizes_list, dropout_rates, weights, scaler, imputer):
                """Store everything needed to rebuild the members; the three
                per-model lists are zipped together in `load_context`."""
                self.models_state_dicts = models_state_dicts
                self.layer_sizes_list = layer_sizes_list
                self.dropout_rates = dropout_rates
                self.weights = weights
                self.scaler = scaler
                self.imputer = imputer
                # Filled by load_context
                self.models = None
                
            def load_context(self, context):
                """Rebuild the member models once; `context` is not used."""
                if self.models is None:
                    self.models = []
                    for state_dict, layer_sizes, dropout_rate in zip(self.models_state_dicts, self.layer_sizes_list, self.dropout_rates):
                        # Create model with exact layer sizes
                        model = self.create_model_with_exact_sizes(layer_sizes, dropout_rate)
                        model.load_state_dict(state_dict)
                        # Inference only: disable dropout, use BatchNorm running statistics
                        model.eval()
                        self.models.append(model)
            
            def create_model_with_exact_sizes(self, layer_sizes, dropout_rate):
                """Create model with exact layer sizes to match saved state dict"""
                class ExactSizeModel(nn.Module):
                    """Linear -> BatchNorm1d -> ReLU -> Dropout blocks followed by
                    a 2-unit output layer with log-softmax."""

                    def __init__(self, layer_sizes, dropout_rate):
                        super().__init__()
                        self.layers = nn.ModuleList()
                        self.batch_norms = nn.ModuleList()
                        
                        # Create layers with exact sizes: consecutive entries of
                        # layer_sizes are the (in, out) widths of each Linear
                        for i in range(len(layer_sizes) - 1):
                            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
                            self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i + 1]))
                        
                        # Output layer
                        self.output = nn.Linear(layer_sizes[-1], 2)
                        self.dropout = nn.Dropout(dropout_rate)
                    
                    def forward(self, x):
                        """Return per-class log-probabilities for the batch `x`."""
                        for layer, batch_norm in zip(self.layers, self.batch_norms):
                            x = layer(x)
                            x = batch_norm(x)
                            x = F.relu(x)
                            x = self.dropout(x)
                        x = self.output(x)
                        return F.log_softmax(x, dim=1)
                
                return ExactSizeModel(layer_sizes, dropout_rate)
                        
            def predict(self, context, model_input):
                """Impute and scale `model_input`, then return the weighted sum of
                the per-model class probabilities."""
                if self.models is None:
                    self.load_context(context)
                    
                # Preprocess
                X_imputed = self.imputer.transform(model_input)
                X_scaled = self.scaler.transform(X_imputed)
                
                # Convert to tensor
                X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
                
                # Predict with ensemble
                all_probas = []
                for model, weight in zip(self.models, self.weights):
                    with torch.no_grad():
                        output = model(X_tensor)
                        # The models emit log-probabilities; softmax over them
                        # recovers the probabilities.
                        probas = torch.softmax(output, dim=1).numpy()
                        all_probas.append(probas * weight)
                
                return np.sum(all_probas, axis=0)
        
        # Extract exact layer sizes and parameters from each model
        models_state_dicts = []
        layer_sizes_list = []
        dropout_rates = []
        
        for model in ensemble_model.models:
            # Get exact layer sizes
            layer_sizes = [model.layers[0].in_features]  # Input size
            for layer in model.layers:
                layer_sizes.append(layer.out_features)
            layer_sizes_list.append(layer_sizes)
            
            # Get dropout rate
            dropout_rates.append(model.dropout_rate)
            
            # Move to CPU and get state dict
            model.cpu()
            models_state_dicts.append(model.state_dict())
            # Move back to device
            model.to(device)
        
        # Create wrapper with exact specifications
        wrapped_model = EnsembleModelWrapper(
            models_state_dicts,
            layer_sizes_list,
            dropout_rates,
            ensemble_model.weights,
            scaler,
            imputer
        )
        
        # Test that the wrapper can load properly (load_state_dict raises on a
        # mismatch). This also populates wrapped_model.models before logging.
        wrapped_model.load_context(None)
        print(f"Successfully loaded {len(wrapped_model.models)} models in wrapper")
        
        # Build the model URI from the active run; it points at the artifact
        # path used by log_model below
        run = mlflow.active_run()
        model_uri = f"runs:/{run.info.run_id}/ensemble_signature_verifier"
        
        mlflow.pyfunc.log_model(
            "ensemble_signature_verifier",
            python_model=wrapped_model,
            signature=signature
        )
        
        # Also write a local checkpoint file
        save_complete_ensemble(study, ensemble_model, scaler, imputer, filename='best_signature_ensemble.pth')

        # Register the model using the URI
        registered_model = mlflow.register_model(
            model_uri=model_uri,
            name="ForgeryDetectionModel"
        )
        
        # Tag the run as completed; "status" is a plain tag set by this notebook.
        # NOTE(review): the first call overwrites the run name with a fresh
        # timestamp, which can differ from the name given to start_run — confirm
        # this is intended.
        mlflow.set_tag("mlflow.runName", f"ensemble_evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
        mlflow.set_tag("status", "completed")
        
        print("Model saved both locally and to MLflow!")
        
    except Exception as e:
        # Record the failure on the run, then re-raise so the cell still fails visibly
        mlflow.set_tag("status", "failed")
        mlflow.set_tag("error", str(e))
        print(f"Error occurred: {e}")
        raise

In [None]:
def load_ensemble_model_with_params(checkpoint_path, device):
    """Load an ensemble checkpoint and rebuild every member model.

    Each member is first built from the saved ``model_params`` (the best
    trial's hyper-parameters). If the resulting parameter shapes do not match
    that member's state dict, the architecture is derived from the state dict
    itself and the model is rebuilt before the weights are loaded.

    Args:
        checkpoint_path: Path of a checkpoint holding the keys ``models``,
            ``weights``, ``scaler``, ``imputer``, ``model_params`` and ``input_size``.
        device: Device the tensors are mapped to and the models are moved to.

    Returns:
        ``(models, scaler, imputer, model_weights)``; the models are in eval mode.

    Raises:
        RuntimeError: From ``load_state_dict`` if a state dict fits neither
            the saved parameters nor the derived architecture.
    """
    # The checkpoint stores the fitted scaler and imputer as pickled Python
    # objects, which weights-only loading (the default in recent PyTorch
    # releases) refuses to read.
    # SECURITY: this unpickles arbitrary objects; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    scaler = checkpoint['scaler']
    imputer = checkpoint['imputer']
    model_weights = checkpoint['weights']
    model_params = checkpoint['model_params']
    input_size = checkpoint['input_size']

    saved_num_layers = model_params['num_layers']
    saved_reductions = [
        model_params.get(f'layer_{i}_reduction', 0.8)
        for i in range(1, saved_num_layers)
    ]

    models = []
    for model_state in checkpoint['models']:
        # Create model with the saved (best-trial) parameters
        model = OptimizedSignatureNet(
            input_size=input_size,
            num_layers=saved_num_layers,
            first_layer_size=model_params['first_layer_size'],
            layer_reductions=list(saved_reductions),
            dropout_rate=model_params['dropout_rate']
        )

        if not _state_dict_matches(model, model_state):
            hidden_sizes = _hidden_sizes_from_state_dict(model_state)
            if hidden_sizes:
                # NOTE(review): assumes the constructor obtains each layer width
                # by scaling the previous width by the reduction and then
                # truncating or rounding — confirm against OptimizedSignatureNet.
                # The +0.25 keeps the product safely inside [width, width + 0.5).
                # load_state_dict below still rejects any remaining mismatch.
                derived_reductions = [
                    (nxt + 0.25) / cur
                    for cur, nxt in zip(hidden_sizes, hidden_sizes[1:])
                ]
                model = OptimizedSignatureNet(
                    input_size=input_size,
                    num_layers=len(hidden_sizes),
                    first_layer_size=hidden_sizes[0],
                    layer_reductions=derived_reductions,
                    dropout_rate=model_params['dropout_rate']
                )

        model.load_state_dict(model_state)
        model.to(device)
        model.eval()
        models.append(model)

    return models, scaler, imputer, model_weights


def _hidden_sizes_from_state_dict(model_state):
    """Return the output width of each ``layers.<i>`` entry found in a state dict."""
    sizes = []
    index = 0
    while f'layers.{index}.weight' in model_state:
        sizes.append(int(model_state[f'layers.{index}.weight'].shape[0]))
        index += 1
    return sizes


def _state_dict_matches(model, model_state):
    """Tell whether ``model_state`` has exactly the keys and tensor shapes of ``model``."""
    own_state = model.state_dict()
    if own_state.keys() != model_state.keys():
        return False
    return all(own_state[key].shape == model_state[key].shape for key in own_state)