# Object Detection

## Tasks

### Task 1

Calculate the `Jaccard similarity` (intersection over union) between `y_pred` and `y_true` separately for class 0, class 1, and class 2.  
Then average the three per-class values and round the result to 4 decimal places.

In [1]:
import torch
from torch import Tensor


y_pred = Tensor([
    [1, 1, 2, 2, 2],
    [1, 1, 2, 1, 2],
    [1, 0, 0, 0, 0],
    [2, 2, 2, 0, 0],
    [2, 1, 1, 1, 2]
])

y_true = Tensor([
    [1, 1, 1, 2, 2],
    [1, 1, 1, 2, 2],
    [1, 1, 1, 2, 2],
    [0, 0, 0, 2, 2],
    [0, 0, 0, 2, 2]
])


def jaccard_from_masks(pred_mask: Tensor, true_mask: Tensor) -> tuple[int, int, float]:
    """Return ``(intersection, union, jaccard)`` for two boolean masks.

    ``intersection`` counts positions set in both masks, ``union`` counts
    positions set in at least one of them, and ``jaccard`` is their ratio.
    When the union is empty the similarity is reported as ``0.0`` instead of
    dividing by zero.
    """
    intersection = (pred_mask & true_mask).sum().item()
    union = (pred_mask | true_mask).sum().item()
    jaccard = intersection / union if union > 0 else 0.0
    return intersection, union, jaccard


# Flatten the tensors so the masks below are compared position by position
y_pred_flat = y_pred.flatten()
y_true_flat = y_true.flatten()

# Calculate Jaccard similarity for class 0
pred_class_0 = (y_pred_flat == 0)
true_class_0 = (y_true_flat == 0)
intersection_0, union_0, jaccard_0 = jaccard_from_masks(pred_class_0, true_class_0)

# Calculate Jaccard similarity for class 1
pred_class_1 = (y_pred_flat == 1)
true_class_1 = (y_true_flat == 1)
intersection_1, union_1, jaccard_1 = jaccard_from_masks(pred_class_1, true_class_1)

# Calculate Jaccard similarity for class 2
pred_class_2 = (y_pred_flat == 2)
true_class_2 = (y_true_flat == 2)
intersection_2, union_2, jaccard_2 = jaccard_from_masks(pred_class_2, true_class_2)

# Calculate the (unweighted) average over the three classes
average_jaccard = (jaccard_0 + jaccard_1 + jaccard_2) / 3

In [2]:
# Report the per-class statistics followed by the average, one line each.
report_lines = [
    f'Class 0 - Intersection: {intersection_0}, Union: {union_0}, Jaccard: {jaccard_0:.4f}',
    f'Class 1 - Intersection: {intersection_1}, Union: {union_1}, Jaccard: {jaccard_1:.4f}',
    f'Class 2 - Intersection: {intersection_2}, Union: {union_2}, Jaccard: {jaccard_2:.4f}',
    f'Average Jaccard similarity: {average_jaccard:.4f}',
]
print('\n'.join(report_lines))

Class 0 - Intersection: 0, Union: 12, Jaccard: 0.0000
Class 1 - Intersection: 5, Union: 13, Jaccard: 0.3846
Class 2 - Intersection: 4, Union: 16, Jaccard: 0.2500
Average Jaccard similarity: 0.2115


### Task 2

Enhance the `UNET` model from the seminar, train it on the `OXFORD-PETS` dataset, and reach a pixel-wise `Accuracy` of at least 88% on the validation subset. To do this, you will need to add more downsampling and upsampling blocks, and possibly increase `base_channels`.

In [None]:
import torch.nn as nn


def conv_plus_conv(in_channels: int, out_channels: int) -> nn.Sequential:
    """Build two stacked ``Conv2d -> BatchNorm2d -> LeakyReLU(0.2)`` stages.

    Both convolutions use a 3x3 kernel with stride 1 and padding 1, so the
    spatial size of the input is preserved. The first stage maps
    ``in_channels`` to ``out_channels``; the second keeps ``out_channels``.
    """

    def stage(stage_in: int) -> list[nn.Module]:
        # One conv/norm/activation triple producing ``out_channels`` maps.
        conv = nn.Conv2d(stage_in, out_channels, kernel_size=3, stride=1, padding=1)
        norm = nn.BatchNorm2d(out_channels)
        return [conv, norm, nn.LeakyReLU(0.2)]

    layers = stage(in_channels)
    layers += stage(out_channels)
    return nn.Sequential(*layers)

In [None]:
class UNET(nn.Module):
    """U-Net with five encoder levels, a bottleneck and five decoder levels.

    Every encoder level is a ``conv_plus_conv`` block followed by 2x2 max
    pooling; its pre-pooling output is kept as a skip connection. Every decoder
    level upsamples, concatenates the skip of the same resolution along the
    channel dimension and applies another ``conv_plus_conv`` block. A final
    1x1 convolution produces one logit map per class.

    Args:
        base_channels: Number of feature maps at full resolution; deeper
            levels use 2x, 4x, 8x and 16x this value.
        in_channels: Number of channels of the input image.
        num_classes: Number of output channels (per-pixel class logits).

    Calling ``UNET()`` with no arguments builds the same architecture as
    before (32 base channels, 3 input channels, 3 classes).
    """

    def __init__(self, base_channels: int = 32, in_channels: int = 3, num_classes: int = 3):
        super().__init__()

        # Encoder: the channel count doubles at every level.
        self.down1 = conv_plus_conv(in_channels, base_channels)
        self.down2 = conv_plus_conv(base_channels, base_channels * 2)
        self.down3 = conv_plus_conv(base_channels * 2, base_channels * 4)
        self.down4 = conv_plus_conv(base_channels * 4, base_channels * 8)
        self.down5 = conv_plus_conv(base_channels * 8, base_channels * 16)

        # Decoder: each block's input is the upsampled features concatenated
        # with the skip connection of the same level, hence the doubled width.
        self.up1 = conv_plus_conv(base_channels * 2, base_channels)
        self.up2 = conv_plus_conv(base_channels * 4, base_channels)
        self.up3 = conv_plus_conv(base_channels * 8, base_channels * 2)
        self.up4 = conv_plus_conv(base_channels * 16, base_channels * 4)
        self.up5 = conv_plus_conv(base_channels * 32, base_channels * 8)

        self.bottleneck = conv_plus_conv(base_channels * 16, base_channels * 16)

        # 1x1 convolution mapping the full-resolution features to class logits.
        self.out = nn.Conv2d(in_channels=base_channels, out_channels=num_classes, kernel_size=1)

        self.downsample = nn.MaxPool2d(kernel_size=2, stride=2)

    @staticmethod
    def _upsample_and_merge(x, skip, block):
        """Resize ``x`` to ``skip``'s spatial size, concatenate and apply ``block``."""
        # Resizing to the skip's exact size (rather than a fixed factor of 2)
        # keeps the concatenation valid when pooling rounded an odd size down.
        x = nn.functional.interpolate(x, size=skip.shape[-2:])
        x = torch.cat((x, skip), dim=1)
        return block(x)

    def forward(self, x):
        """Return per-pixel class logits of shape ``(B, num_classes, H, W)``.

        ``x`` is a batch of images ``(B, in_channels, H, W)``. ``H`` and ``W``
        must each be at least 32, because the encoder halves them five times.
        Below, ``C`` stands for ``base_channels``.
        """
        residual1 = self.down1(x)        # (B, C, H, W)
        x = self.downsample(residual1)   # (B, C, H/2, W/2)

        residual2 = self.down2(x)        # (B, 2C, H/2, W/2)
        x = self.downsample(residual2)   # (B, 2C, H/4, W/4)

        residual3 = self.down3(x)        # (B, 4C, H/4, W/4)
        x = self.downsample(residual3)   # (B, 4C, H/8, W/8)

        residual4 = self.down4(x)        # (B, 8C, H/8, W/8)
        x = self.downsample(residual4)   # (B, 8C, H/16, W/16)

        residual5 = self.down5(x)        # (B, 16C, H/16, W/16)
        x = self.downsample(residual5)   # (B, 16C, H/32, W/32)

        # Latent space: spatial size is 1/32 of the input.
        x = self.bottleneck(x)           # (B, 16C, H/32, W/32)

        x = self._upsample_and_merge(x, residual5, self.up5)  # (B, 8C, H/16, W/16)
        x = self._upsample_and_merge(x, residual4, self.up4)  # (B, 4C, H/8, W/8)
        x = self._upsample_and_merge(x, residual3, self.up3)  # (B, 2C, H/4, W/4)
        x = self._upsample_and_merge(x, residual2, self.up2)  # (B, C, H/2, W/2)
        x = self._upsample_and_merge(x, residual1, self.up1)  # (B, C, H, W)

        x = self.out(x)                  # (B, num_classes, H, W)

        return x

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as T
from IPython.display import clear_output
from torch.optim import Adam
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from torchvision.datasets import OxfordIIITPet
from tqdm import tqdm

In [None]:
def set_seed(seed):
    """Seed every random number generator used here with the same value.

    Covers PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, and switches cuDNN to deterministic mode.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

In [None]:
def train(
    model: nn.Module,
    data_loader: DataLoader,
    optimizer: Optimizer,
    loss_fn,
    device: torch.device,
):
    """Run one training epoch and return ``(mean batch loss, pixel accuracy)``.

    For every ``(x, y)`` batch the target's dimension 1 is squeezed away, the
    model is applied to ``x``, and ``loss_fn`` is called with the logits
    flattened to ``(batch, classes, pixels)`` and the targets flattened to
    ``(batch, pixels)``. The number of classes is taken from the model output,
    so the function is not tied to a particular class count.

    Returns:
        A tuple of the loss averaged over batches and the fraction of target
        elements whose arg-max prediction equals the target.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model.train()

    train_loss = 0.0
    total = 0
    correct = 0

    for x, y in tqdm(data_loader, desc='Train'):
        x, y = x.to(device), y.squeeze(1).to(device)

        optimizer.zero_grad()

        output = model(x)

        # Collapse the spatial dimensions only; the class dimension keeps
        # whatever size the model produced.
        loss = loss_fn(output.flatten(start_dim=2), y.flatten(start_dim=1))

        train_loss += loss.item()

        loss.backward()

        optimizer.step()

        # Accuracy is measured on the logits computed before this step's update.
        y_pred = output.argmax(dim=1)
        total += y.numel()
        correct += (y == y_pred).sum().item()

    train_loss /= len(data_loader)
    accuracy = correct / total

    return train_loss, accuracy

In [None]:
@torch.inference_mode()
def evaluate(
    model: nn.Module, data_loader: DataLoader, loss_fn, device: torch.device
):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    The model is put into eval mode. For every ``(x, y)`` batch the target's
    dimension 1 is squeezed away and ``loss_fn`` is called with the logits
    flattened to ``(batch, classes, pixels)`` and the targets flattened to
    ``(batch, pixels)``. The number of classes is taken from the model output.

    Returns:
        A tuple of the loss averaged over batches and the fraction of target
        elements whose arg-max prediction equals the target.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model.eval()

    total_loss = 0.0
    total = 0
    correct = 0

    for x, y in tqdm(data_loader, desc='Evaluation'):
        x, y = x.to(device), y.squeeze(1).to(device)

        output = model(x)

        # Collapse the spatial dimensions only; the class dimension keeps
        # whatever size the model produced.
        loss = loss_fn(output.flatten(start_dim=2), y.flatten(start_dim=1))

        total_loss += loss.item()

        y_pred = output.argmax(dim=1)
        total += y.numel()
        correct += (y == y_pred).sum().item()

    total_loss /= len(data_loader)
    accuracy = correct / total

    return total_loss, accuracy

In [None]:
def plot_stats(
    train_loss: list[float],
    valid_loss: list[float],
    train_accuracy: list[float],
    valid_accuracy: list[float],
    title: str
):
    """Show two figures: train/valid loss, then train/valid accuracy.

    Each figure is 16x8, titled ``title`` plus a suffix naming the metric,
    and has a legend and a grid.
    """
    # (title suffix, train series, train label, valid series, valid label)
    panels = (
        (' loss', train_loss, 'Train loss', valid_loss, 'Valid loss'),
        (' accuracy', train_accuracy, 'Train accuracy', valid_accuracy, 'Valid accuracy'),
    )

    for suffix, train_series, train_label, valid_series, valid_label in panels:
        plt.figure(figsize=(16, 8))
        plt.title(title + suffix)

        plt.plot(train_series, label=train_label)
        plt.plot(valid_series, label=valid_label)
        plt.legend()
        plt.grid()

        plt.show()

In [None]:
def whole_train_valid_cycle(
    model, train_loader, valid_loader, optimizer, loss_fn, device, threshold, title,
    max_epochs: int = 100,
):
    """Alternate training and validation epochs until the accuracy target is met.

    After every epoch the loss and accuracy histories are extended, the
    previous cell output is cleared and the curves are redrawn.

    Args:
        model: Network to optimise (updated in place).
        train_loader: Batches used by ``train``.
        valid_loader: Batches used by ``evaluate``.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_fn: Loss callable passed through to ``train`` and ``evaluate``.
        device: Device the batches are moved to.
        threshold: Validation accuracy at or above which the loop stops early.
        title: Prefix for the plot titles.
        max_epochs: Upper bound on the number of epochs; defaults to 100.
    """
    train_loss_history, valid_loss_history = [], []
    train_accuracy_history, valid_accuracy_history = [], []

    for _ in range(max_epochs):
        train_loss, train_accuracy = train(
            model, train_loader, optimizer, loss_fn, device
        )
        valid_loss, valid_accuracy = evaluate(model, valid_loader, loss_fn, device)

        train_loss_history.append(train_loss)
        valid_loss_history.append(valid_loss)

        train_accuracy_history.append(train_accuracy)
        valid_accuracy_history.append(valid_accuracy)

        # Replace the previous epoch's figures instead of stacking new ones.
        clear_output(wait=True)

        plot_stats(
            train_loss_history,
            valid_loss_history,
            train_accuracy_history,
            valid_accuracy_history,
            title,
        )

        if valid_accuracy >= threshold:
            break

In [None]:
@torch.inference_mode()
def predict_segmentation(model: nn.Module, loader: DataLoader, device: torch.device):
    """Return the arg-max class map for every sample yielded by ``loader``.

    The model is switched to eval mode; each batch's inputs are moved to
    ``device``, the outputs are brought back to the CPU, reduced with
    ``argmax`` over dimension 1 and concatenated along the batch dimension.
    The second element of each loader item is ignored.
    """
    model.eval()

    per_batch = [
        model(inputs.to(device)).cpu().argmax(dim=1)
        for inputs, _ in loader
    ]

    return torch.cat(per_batch)

In [None]:
def main(model_class, threshold, title):
    """Train ``model_class`` on Oxford-IIIT Pet segmentation and save predictions.

    Builds the train split and a 200-sample subset of the test split, trains
    with Adam and cross-entropy until the validation accuracy reaches
    ``threshold`` (see ``whole_train_valid_cycle``), then writes the predicted
    class maps for the validation subset to ``prediction.pt``.
    """
    set_seed(0xDEADF00D)

    image_size = (256, 256)
    data_root = '/home/jupyter/mnt/datasets/pets'

    image_pipeline = T.Compose(
        [
            T.Resize(image_size),
            T.ToTensor(),
        ]
    )

    # NOTE(review): the mask is resized with Resize's default interpolation —
    # confirm this keeps label values valid for this dataset.
    mask_pipeline = T.Compose(
        [
            T.Resize(image_size),
            T.PILToTensor(),
            # Shift the stored labels down by one and convert to int64.
            T.Lambda(lambda mask: (mask - 1).long())
        ]
    )

    dataset_options = dict(
        transform=image_pipeline,
        download=True,
        target_transform=mask_pipeline,
        target_types='segmentation',
    )
    train_dataset = OxfordIIITPet(data_root, **dataset_options)
    test_dataset = OxfordIIITPet(data_root, split='test', **dataset_options)

    # Fixed NumPy seed so the same 200 indices are drawn on every run;
    # randint samples with replacement, so indices may repeat.
    np.random.seed(100)
    subset_indices = np.random.randint(len(test_dataset), size=200).tolist()
    valid_dataset = Subset(test_dataset, subset_indices)

    loader_options = dict(batch_size=64, num_workers=8, pin_memory=True)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_options)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    model = model_class().to(device)
    optimizer = Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    whole_train_valid_cycle(
        model, train_loader, valid_loader, optimizer, loss_fn, device, threshold, title
    )

    # One uint8 class map per validation sample, with an explicit channel axis.
    prediction = predict_segmentation(model, valid_loader, device)
    prediction = prediction.reshape([200, 1, 256, 256]).to(torch.uint8)
    torch.save(prediction, 'prediction.pt')

In [None]:
# Train the UNET until it reaches 88% validation pixel accuracy.
main(model_class=UNET, threshold=0.88, title='UNET segmentation')