# Signature detection with custom model

In [None]:
import torch
import torch.nn as nn, torch.nn.functional as F
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
from torchvision.models.detection.retinanet import RetinaNet_ResNet50_FPN_Weights
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms.functional as TF
from torchvision import transforms
import webdataset as wds
from PIL import Image
import numpy as np
import time
import matplotlib.pyplot as plt
import cv2
import json

In [None]:
# Report whether PyTorch can see a CUDA device and, if it can, name device 0.
_cuda_ok = torch.cuda.is_available()
print("CUDA available:", _cuda_ok)
if _cuda_ok:
    print("Device:", torch.cuda.get_device_name(0))

### Hyperparameters

In [None]:
# Tar shard locations for each split. These names are rebound to dataset
# objects further down in the notebook.
train_dataset = "datasets/custom/train-00000.tar"
val_dataset = "datasets/custom/val-00000.tar"
test_dataset = "datasets/custom/test-00000.tar"

# Prefer the GPU whenever one is visible, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model / optimisation settings.
num_classes = 2       # 1 class (signature) + background
imgsz = 512           # images are resized to imgsz x imgsz
epochs = 2
batch_size = 4
learning_rate = 0.005

### Utility functions

In [None]:
def iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``[x1, y1, x2, y2]``. Negative widths or
    heights are clamped to zero, and a zero-area union yields ``0.0``.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Overlap rectangle: innermost corners, clamped so disjoint boxes give 0.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    overlap = overlap_w * overlap_h

    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)

    union = float(area_a + area_b - overlap)
    if union > 0:
        return overlap / union
    return 0.0

In [None]:
def train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    ``model(images, targets)`` is expected to return a mapping of loss terms;
    their sum is back-propagated once per batch. Returns ``0.0`` when the
    loader yields no batches.
    """
    model.train()
    batch_losses = []
    for images, targets in loader:
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        total = sum(model(images, targets).values())

        optimizer.zero_grad()
        total.backward()
        optimizer.step()

        batch_losses.append(total.item())
    return sum(batch_losses) / max(1, len(batch_losses))

In [None]:
def validate(model, loader, device):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is called as ``model(images, targets)`` and must return a
    mapping of loss terms. The model is switched to train mode for the
    duration of the pass so that this call yields losses (per the original
    note, the model returns a list of predictions instead when in eval mode).
    Gradients are disabled, and the model's previous train/eval mode is
    restored before returning, even if a batch raises.

    NOTE(review): because the forward pass runs in train mode, any layer
    whose behaviour depends on that mode (dropout, batch-norm running
    statistics) is active here as well.

    Returns ``0.0`` when the loader yields no batches.
    """
    was_training = model.training
    model.train()
    val_loss = 0.0
    it = 0

    try:
        with torch.no_grad():
            for images, targets in loader:
                images = [img.to(device) for img in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                loss_dict = model(images, targets)

                losses = sum(loss for loss in loss_dict.values())
                val_loss += losses.item()
                it += 1
    finally:
        # Do not leak the temporary train mode to the caller.
        model.train(was_training)

    return val_loss / max(1, it)


In [None]:
def _match_detections(pred_boxes, gt_boxes, iou_th):
    """Greedily match predictions to ground truth; return ``(tp, fp, fn)``.

    Predictions are visited in the order given. Each one claims the still
    unmatched ground-truth box with the highest IoU; it counts as a true
    positive when that IoU reaches ``iou_th`` and as a false positive
    otherwise. Ground-truth boxes left unclaimed are false negatives.
    """
    tp = 0
    fp = 0
    matched_gt = set()
    for pb in pred_boxes:
        best_iou = 0
        best_j = -1
        for j, gb in enumerate(gt_boxes):
            if j in matched_gt:
                continue
            cur_iou = iou(pb, gb)
            if cur_iou > best_iou:
                best_iou = cur_iou
                best_j = j
        if best_iou >= iou_th and best_j >= 0:
            tp += 1
            matched_gt.add(best_j)
        else:
            fp += 1
    return tp, fp, len(gt_boxes) - len(matched_gt)


def evaluate_precision_recall(model, loader, device, iou_th=0.5, score_th=0.5):
    """Compute detection precision and recall of ``model`` over ``loader``.

    Every image of every batch is scored. (Previously only the first image
    of each batch was evaluated, silently dropping the rest whenever the
    loader's batch size was greater than one.)

    Args:
        model: called in eval mode as ``model(list_of_images)``; must return
            one dict per image with ``'boxes'`` and ``'scores'`` tensors.
        loader: yields ``(images, targets)`` pairs of equal length; each
            target is a dict with a ``'boxes'`` tensor of ``[x1, y1, x2, y2]``.
        device: device the images are moved to before inference.
        iou_th: minimum IoU for a prediction to count as a true positive.
        score_th: predictions scoring below this are discarded.

    Returns:
        ``(precision, recall)``; each is ``0.0`` when its denominator is zero.
    """
    model.eval()
    TP = 0
    FP = 0
    FN = 0
    with torch.no_grad():
        for images, targets in loader:
            outputs = model([img.to(device) for img in images])
            for preds, gt in zip(outputs, targets):
                pred_boxes = preds['boxes'].cpu().numpy()
                pred_scores = preds['scores'].cpu().numpy()
                gt_boxes = gt['boxes'].cpu().numpy() if gt['boxes'].size(0) > 0 else np.zeros((0, 4))

                # Drop low-confidence detections before matching.
                pred_boxes = pred_boxes[pred_scores >= score_th]

                tp, fp, fn = _match_detections(pred_boxes, gt_boxes, iou_th)
                TP += tp
                FP += fp
                FN += fn

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    return precision, recall


In [None]:
def training_loop(model, train_loader, val_loader, optimizer, lr_scheduler, device, epochs=10):
    """Train for ``epochs`` epochs, validating and reporting after each one.

    Per epoch: one training pass, one scheduler step, a validation-loss pass
    and a precision/recall pass on ``val_loader``, followed by a one-line
    summary. Any ``RuntimeError`` is re-raised; for an out-of-memory error a
    hint is printed and the CUDA cache is emptied first.

    Returns:
        ``(model, train_losses, val_losses)`` with one loss entry per
        completed epoch.
    """
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        t0 = time.time()
        try:
            train_loss = train_one_epoch(model, train_loader, optimizer, device)
            lr_scheduler.step()
            val_loss = validate(model, val_loader, device)
            prec, rec = evaluate_precision_recall(model, val_loader, device)
            print(f'Epoch {epoch+1}/{epochs} — train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}, prec: {prec:.3f}, rec: {rec:.3f}, time: {time.time()-t0:.1f}s')
            train_losses.append(train_loss)
            val_losses.append(val_loss)
        except RuntimeError as e:
            ran_out_of_memory = 'out of memory' in str(e).lower()
            if ran_out_of_memory:
                print('RuntimeError: CUDA out of memory during training.\nConsider:')
                # Release cached blocks before propagating the error.
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            # The error is never swallowed, whatever its cause.
            raise
    return model, train_losses, val_losses

### Dataset

In [None]:
# Resize to a square imgsz x imgsz image, then convert to a tensor.
# NOTE(review): preprocessSample below builds its own Resize/ToTensor and does
# not reference this object in the visible code.
transform = transforms.Compose(
    [transforms.Resize(size=(imgsz, imgsz)), transforms.ToTensor()]
)

def preprocessSample(sample):
    """Turn one decoded WebDataset sample into ``(image_tensor, target)``.

    The image is converted to RGB, resized to ``imgsz`` x ``imgsz`` and turned
    into a tensor. Each box in ``sample["json"]["boxes"]`` is unpacked as
    ``(x, y, w, h)``, converted to ``[x1, y1, x2, y2]`` and rescaled by the
    same factors as the image (presumably the source boxes are in pixels of
    the original image - confirm against the dataset writer).

    Args:
        sample: mapping holding a PIL image under a ``jpg``/``jpeg``/``png``
            key (case-insensitive) and an annotation dict under ``"json"``
            with ``"boxes"`` and ``"labels"`` entries.

    Returns:
        ``(img_tensor, target)`` where ``target`` is a shallow copy of the
        annotation dict whose ``"boxes"`` is a float32 tensor of shape
        ``(N, 4)`` (``(0, 4)`` for an image without boxes) and whose
        ``"labels"`` is an int64 tensor.

    Raises:
        ValueError: if the sample has no supported image key.
    """
    # Detect image key dynamically
    img_key = next(
        (k for k in sample if k.lower() in ("jpg", "jpeg", "png")),
        None,
    )
    if img_key is None:
        raise ValueError(f"No supported image format found in sample keys: {list(sample.keys())}")

    # Image already decoded to PIL
    img = sample[img_key]
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Get original image size before resizing
    orig_w, orig_h = img.size

    # Resize image
    img_resized = transforms.Resize((imgsz, imgsz))(img)
    new_w, new_h = img_resized.size

    # Compute scale factors
    scale_x = new_w / orig_w
    scale_y = new_h / orig_h

    # Work on a shallow copy so the decoded sample is not modified in place.
    target = dict(sample["json"])

    # Convert boxes [x, y, w, h] -> [x1, y1, x2, y2] and scale coordinates
    boxes = [
        [x * scale_x, y * scale_y, (x + w) * scale_x, (y + h) * scale_y]
        for (x, y, w, h) in target["boxes"]
    ]

    # reshape keeps the (N, 4) layout even when the image has no boxes;
    # without it an empty list would become a tensor of shape (0,).
    target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    target["labels"] = torch.as_tensor(target["labels"], dtype=torch.int64)

    # Final transform to tensor
    img_tensor = transforms.ToTensor()(img_resized)

    return img_tensor, target

In [None]:
def _open_split(shards):
    """Build the decode -> preprocess pipeline for one split.

    ``shards`` is handed straight to ``wds.WebDataset`` (use a pattern or a
    list of tar paths).
    """
    return wds.WebDataset(shards).decode("pil").map(preprocessSample)


def _collate_detection(batch):
    """Transpose ``[(img, target), ...]`` into ``((img, ...), (target, ...))``."""
    return tuple(zip(*batch))


def _make_loader(dataset):
    """Wrap ``dataset`` in a single-process DataLoader using ``batch_size``."""
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=0,
        collate_fn=_collate_detection,
    )


# The path variables are rebound to the dataset objects built from them.
# NOTE(review): no shuffle stage is applied here, so samples are consumed in
# whatever order the pipeline yields them.
train_dataset = _open_split(train_dataset)
val_dataset = _open_split(val_dataset)
test_dataset = _open_split(test_dataset)

train_loader = _make_loader(train_dataset)
val_loader = _make_loader(val_dataset)
test_loader = _make_loader(test_dataset)



## Model definition

In [None]:
class Network(nn.Module):
    """Small CNN with a classification head and a single-box regression head.

    Five 5x5 convolutions feed two parallel stacks of fully connected layers:
    one produces ``num_classes`` softmax probabilities, the other four
    sigmoid-squashed box values in ``[0, 1]``.

    The fully connected layers expect 1728 = 192 * 3 * 3 features. The final
    pooling stage therefore always reduces the feature map to 3x3, so the
    network accepts other resolutions (e.g. 512x512) and not only inputs of
    about 256x256, for which the fixed pooling alone already gives 3x3.

    Args:
        num_classes: number of outputs of the classification head
            (default 2, as in the original hard-coded layer).

    NOTE(review): ``train_one_epoch`` and ``validate`` call the model as
    ``model(images, targets)`` and sum a dict of losses, and
    ``evaluate_precision_recall`` reads ``preds['boxes']`` and
    ``preds['scores']``. This ``forward`` takes one batched tensor and
    returns ``[class_probs, box]``, so it does not satisfy that interface
    as written.
    """

    def __init__(self, num_classes=2):
        super().__init__()

        # CNNs for rgb images
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=5)
        self.conv4 = nn.Conv2d(in_channels=24, out_channels=48, kernel_size=5)
        self.conv5 = nn.Conv2d(in_channels=48, out_channels=192, kernel_size=5)

        # Connecting CNN outputs with Fully Connected layers for classification
        self.class_fc1 = nn.Linear(in_features=1728, out_features=240)
        self.class_fc2 = nn.Linear(in_features=240, out_features=120)
        self.class_out = nn.Linear(in_features=120, out_features=num_classes)

        # Connecting CNN outputs with Fully Connected layers for bounding box
        self.box_fc1 = nn.Linear(in_features=1728, out_features=240)
        self.box_fc2 = nn.Linear(in_features=240, out_features=120)
        self.box_out = nn.Linear(in_features=120, out_features=4)

    def forward(self, t):
        """Return ``[class_probs, box]`` for a batch ``t`` of shape (B, 3, H, W).

        ``class_probs`` has shape ``(B, num_classes)`` with rows summing to 1;
        ``box`` has shape ``(B, 4)`` with values in ``[0, 1]``.
        """
        # Four conv -> ReLU -> 2x2 max-pool stages.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            t = F.max_pool2d(F.relu(conv(t)), kernel_size=2, stride=2)

        t = F.relu(self.conv5(t))
        t = F.avg_pool2d(t, kernel_size=4, stride=2)
        # Force a 3x3 map so the flattened size is always 1728. This is an
        # identity when the map is already 3x3.
        t = F.adaptive_avg_pool2d(t, (3, 3))

        t = torch.flatten(t, start_dim=1)

        class_t = F.relu(self.class_fc1(t))
        class_t = F.relu(self.class_fc2(class_t))
        class_t = F.softmax(self.class_out(class_t), dim=1)

        box_t = F.relu(self.box_fc1(t))
        box_t = F.relu(self.box_fc2(box_t))
        # torch.sigmoid replaces the deprecated F.sigmoid; same result.
        box_t = torch.sigmoid(self.box_out(box_t))

        return [class_t, box_t]

## Training

In [None]:
# Instantiate the network and place it on the configured device. If the move
# raises a RuntimeError, report it and continue on the CPU instead.
model = Network()

try:
    model.to(device)
except RuntimeError as move_error:
    print('Error moving model to device — falling back to CPU.\n', move_error)
    device = torch.device('cpu')
    model.to(device)

# Bare expression: shows the module summary as the cell output.
model

In [None]:
# Release cached, unused CUDA memory before building the optimizer state.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
# Use the learning_rate hyperparameter rather than a second hard-coded 0.005,
# so changing the setting in the hyperparameter cell actually takes effect.
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=0.0005)
# NOTE(review): the scheduler decays the LR every 3 steps while epochs is set
# to 2 above, so no decay happens within the configured run.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [None]:
# Kick off training; returns the model plus the per-epoch loss histories.
print("Starting training...")
model, train_losses, val_losses = training_loop(
    model,
    train_loader,
    val_loader,
    optimizer,
    lr_scheduler,
    device,
    epochs=epochs,
)