<center><h1>Fast Evaluate COCO / Pascal VOC</h1>
<h2>Matthias Bartolo</h2>

</center>

#### Package Imports and Constants - Remain the same

In [None]:
import json
import random
import torch
import torchvision
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
from functools import partial

In [None]:
# Dataset root; every split path below is derived from it.
# (alternative root noted by the author: COCO-Dataset)
DIR_INPUT = '../datasets/Pascal-VOC-2012'

# Split directories. Pascal VOC does not have a test split, so the
# validation folder is reused for DIR_TEST.
DIR_TRAIN = DIR_INPUT + '/train'
DIR_VALID = DIR_INPUT + '/valid'
DIR_TEST = DIR_INPUT + '/valid'

# Names resolved relative to a split directory; the annotation file is opened
# as DIR_TRAIN/DIR_ANNOTATIONS when the class list is built.
# DIR_IMAGES is presumably the image sub-folder of each split - TODO confirm.
DIR_IMAGES = 'images'
DIR_ANNOTATIONS = '_annotations.coco.json'

# Resize target forwarded to the evaluation call
IMG_RESIZE = (800, 800)

#### Classes - Remain the same

In [None]:
# Build the class table from the categories listed in the training split's
# COCO-format annotation file.
classes_annotation_path = f'{DIR_TRAIN}/{DIR_ANNOTATIONS}'

# Load the annotation file
with open(classes_annotation_path, 'r') as f:
    coco_data = json.load(f)

# Dedicated, seeded generator so every run assigns the same colour to the same
# class, without touching the state of the global `random` module.
color_rng = random.Random(0)

# One entry per category: id, name, supercategory ("None" when the annotation
# file omits it) and an RGB colour tuple with components in 0..255.
CLASSES = [
    {
        "id": category["id"],
        "name": category["name"],
        "supercategory": category.get("supercategory", "None"),
        "color": tuple(color_rng.randint(0, 255) for _ in range(3)),
    }
    for category in coco_data["categories"]
]

# Change id = 0 to Background.
# The category is looked up by its id rather than by list position, so a file
# whose first category is not id 0 (or that has no categories at all) does not
# get a real class renamed or raise an IndexError.
for class_entry in CLASSES:
    if class_entry["id"] == 0:
        class_entry["name"] = 'Background'

# For COCO dataset: extend to 91 classes
# coco_80_91 = [
#     "zebra",
#     "toaster",     # 81
#     "sink",        # 82
#     "refrigerator",# 83
#     "book",        # 84
#     "clock",       # 85
#     "vase",        # 86
#     "scissors",    # 87
#     "teddy bear",  # 88
#     "hair drier",  # 89
#     "toothbrush"   # 90
# ]

# # add the other classes from 80 to 90
# for i in range(80, 91):
#     CLASSES.append({
#         "id": i,
#         "name": coco_80_91[i-80],
#         "supercategory": "None",
#         "color": (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
#     })

NUM_CLASSES = len(CLASSES) # Number of classes in the dataset

#### Image Means and Stds - Remain the same

In [None]:
# Per-channel image mean and standard deviation forwarded to the evaluation call.
#
# From the PyTorch documentation:
#     - mean (sequence) - Sequence of means for each channel.
#     - std (sequence) - Sequence of standard deviations for each channel.
#     https://pytorch.org/vision/0.9/_modules/torchvision/models/detection/faster_rcnn.html
#
# The statistics vary with the dataset used. For reference, torchvision's
# built-in defaults are the ImageNet statistics (not COCO-specific values):
#     - mean = [0.485, 0.456, 0.406]
#     - std = [0.229, 0.224, 0.225]
#
# The lists below carry four entries - presumably R, G, B followed by the
# Box_Mask channel; TODO confirm the channel order against the evaluate module.
#
# IMPORTANT: CHANGE THESE FOR FINAL DATASET

# COCO dataset
# img_means = [
#     0.338, 0.320, 0.292, 0.077
# ]

# img_stds = [
#     0.314, 0.304, 0.302, 0.126
# ]

# Pascal VOC dataset
img_means = [0.452, 0.431, 0.399, 0.142]
img_stds = [0.275, 0.273, 0.284, 0.216]

# A mean without a matching std (or vice versa) cannot describe a channel,
# so catch a half-edited pair of lists here instead of deep inside evaluation.
assert len(img_means) == len(img_stds), "img_means and img_stds must have one entry per channel each"

# Every privileged-information directory known to the project; each model cell
# selects its own subset in PRIVILEGED_INFORMATION_DIRS.
ALL_PRIVILEGED_INFORMATION_DIRS = [
    "Box_Mask",
]

#### Batch Size - Remain the same

In [None]:
# Data-loading settings forwarded to the evaluation call.
# (alternative values noted by the author: batch_size 8, num_workers 4)
batch_size, num_workers = 4, 0

#### Specifications - Change these for each model

#### RetinaNet Baseline Model

In [None]:
# ---- Run configuration ------------------------------------------------------
# Run directory handed to the evaluation call as SAVE_DIR
SAVE_DIR = '../runs/RetinaNet/RetinaNet_PascalVOC_baseline1'

# Privileged-information inputs for this run. The baseline uses none; the
# special constructed feature is left commented out.
PRIVILEGED_INFORMATION_DIRS = [
    # "Box_Mask",
]

# RGB (3) plus one extra channel per privileged-information directory
NUM_CHANNELS = len(PRIVILEGED_INFORMATION_DIRS) + 3

# ---- Model definition -------------------------------------------------------
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# RetinaNet (ResNet-50 FPN) initialised from torchvision's default pretrained weights
weights = torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights.DEFAULT
pre_trained_model = torchvision.models.detection.retinanet_resnet50_fpn(weights=weights)

# Note: `model` and `pre_trained_model` name the same module object
model = pre_trained_model.to(device)

# Replace the backbone's first convolution with one that accepts NUM_CHANNELS
# input channels, then give the new layer Kaiming-normal weights.
model.backbone.body.conv1 = torch.nn.Conv2d(
    NUM_CHANNELS,
    64,
    kernel_size=(7, 7),
    stride=(2, 2),
    padding=(3, 3),
    bias=False,
)
torch.nn.init.kaiming_normal_(
    model.backbone.body.conv1.weight,
    mode='fan_out',
    nonlinearity='relu',
)

# Read the existing classification head's input width and anchor count so the
# replacement head plugs into the same feature maps.
in_features, num_anchors = (
    model.head.classification_head.cls_logits.in_channels,
    model.head.classification_head.num_anchors,
)

# Fresh classification head sized for NUM_CLASSES outputs per anchor
model.head.classification_head = RetinaNetClassificationHead(
    in_channels=in_features,
    num_anchors=num_anchors,
    num_classes=NUM_CLASSES,
    norm_layer=partial(torch.nn.GroupNorm, 32),
)

# The layers created above are not yet on `device`; move the whole model again
model = model.to(device)

# Print the architecture for a visual check
print(model)

In [None]:
import evaluate

# Run the project's evaluation entry point on the model built in the previous
# cell, using the dataset paths, class table, normalisation statistics and
# loader settings defined earlier in the notebook.
evaluate.main_function(
    DIR_TEST=DIR_TEST,
    DIR_IMAGES=DIR_IMAGES,
    DIR_ANNOTATIONS=DIR_ANNOTATIONS,
    IMG_RESIZE=IMG_RESIZE,
    SAVE_DIR=SAVE_DIR,
    CLASSES=CLASSES,
    NUM_CLASSES=NUM_CLASSES,
    PRIVILEGED_INFORMATION_DIRS=PRIVILEGED_INFORMATION_DIRS,
    NUM_CHANNELS=NUM_CHANNELS,
    img_means=img_means,
    img_stds=img_stds,
    ALL_PRIVILEGED_INFORMATION_DIRS=ALL_PRIVILEGED_INFORMATION_DIRS,
    model=model,
    BATCH_SIZE=batch_size,
    NUM_WORKERS=num_workers,
    model_name='RetinaNet',
)

# `pre_trained_model` is bound to the very same module object as `model`
# (nn.Module.to() returns the module itself), so deleting `model` alone would
# keep the whole network alive. Drop both names to actually release it.
del model, pre_trained_model

# Return the now-unused cached GPU blocks to the driver before the next model
# is built.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

#### RetinaNet Teacher Model

In [None]:
# ---- Run configuration ------------------------------------------------------
# Run directory handed to the evaluation call as SAVE_DIR
SAVE_DIR = '../runs/RetinaNet/RetinaNet_PascalVOC_teacher1'

# Privileged-information inputs for this run: the teacher receives the
# special constructed Box_Mask feature in addition to RGB.
PRIVILEGED_INFORMATION_DIRS = [
    "Box_Mask",
]

# RGB (3) plus one extra channel per privileged-information directory
NUM_CHANNELS = len(PRIVILEGED_INFORMATION_DIRS) + 3

# ---- Model definition -------------------------------------------------------
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# RetinaNet (ResNet-50 FPN) initialised from torchvision's default pretrained weights
weights = torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights.DEFAULT
pre_trained_model = torchvision.models.detection.retinanet_resnet50_fpn(weights=weights)

# Note: `model` and `pre_trained_model` name the same module object
model = pre_trained_model.to(device)

# Replace the backbone's first convolution with one that accepts NUM_CHANNELS
# input channels, then give the new layer Kaiming-normal weights.
model.backbone.body.conv1 = torch.nn.Conv2d(
    NUM_CHANNELS,
    64,
    kernel_size=(7, 7),
    stride=(2, 2),
    padding=(3, 3),
    bias=False,
)
torch.nn.init.kaiming_normal_(
    model.backbone.body.conv1.weight,
    mode='fan_out',
    nonlinearity='relu',
)

# Read the existing classification head's input width and anchor count so the
# replacement head plugs into the same feature maps.
in_features, num_anchors = (
    model.head.classification_head.cls_logits.in_channels,
    model.head.classification_head.num_anchors,
)

# Fresh classification head sized for NUM_CLASSES outputs per anchor
model.head.classification_head = RetinaNetClassificationHead(
    in_channels=in_features,
    num_anchors=num_anchors,
    num_classes=NUM_CLASSES,
    norm_layer=partial(torch.nn.GroupNorm, 32),
)

# The layers created above are not yet on `device`; move the whole model again
model = model.to(device)

# Print the architecture for a visual check
print(model)

In [None]:
import evaluate

# Run the project's evaluation entry point on the teacher model built in the
# previous cell, using the dataset paths, class table, normalisation
# statistics and loader settings defined earlier in the notebook.
evaluate.main_function(
    DIR_TEST=DIR_TEST,
    DIR_IMAGES=DIR_IMAGES,
    DIR_ANNOTATIONS=DIR_ANNOTATIONS,
    IMG_RESIZE=IMG_RESIZE,
    SAVE_DIR=SAVE_DIR,
    CLASSES=CLASSES,
    NUM_CLASSES=NUM_CLASSES,
    PRIVILEGED_INFORMATION_DIRS=PRIVILEGED_INFORMATION_DIRS,
    NUM_CHANNELS=NUM_CHANNELS,
    img_means=img_means,
    img_stds=img_stds,
    ALL_PRIVILEGED_INFORMATION_DIRS=ALL_PRIVILEGED_INFORMATION_DIRS,
    model=model,
    BATCH_SIZE=batch_size,
    NUM_WORKERS=num_workers,
    model_name='RetinaNet',
)

# `pre_trained_model` is bound to the very same module object as `model`
# (nn.Module.to() returns the module itself), so deleting `model` alone would
# keep the whole network alive. Drop both names to actually release it.
del model, pre_trained_model

# Return the now-unused cached GPU blocks to the driver before the next model
# is built.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

#### RetinaNet Student Model

In [None]:
# ---- Run configuration ------------------------------------------------------
# Run directory handed to the evaluation call as SAVE_DIR
SAVE_DIR = '../runs/RetinaNet/RetinaNet_PascalVOC_student1'

# Privileged-information inputs for this run. The student uses none; the
# special constructed feature is left commented out.
PRIVILEGED_INFORMATION_DIRS = [
    # "Box_Mask",
]

# RGB (3) plus one extra channel per privileged-information directory
NUM_CHANNELS = len(PRIVILEGED_INFORMATION_DIRS) + 3

# ---- Student model definition -----------------------------------------------
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# RetinaNet (ResNet-50 FPN) initialised from torchvision's default pretrained weights
weights = torchvision.models.detection.RetinaNet_ResNet50_FPN_Weights.DEFAULT
pre_trained_model = torchvision.models.detection.retinanet_resnet50_fpn(weights=weights)

# Note: `model` and `pre_trained_model` name the same module object.
# Unlike the other model cells, the backbone's first convolution is left as loaded.
model = pre_trained_model.to(device)

# Read the existing classification head's input width and anchor count so the
# replacement head plugs into the same feature maps.
in_features, num_anchors = (
    model.head.classification_head.cls_logits.in_channels,
    model.head.classification_head.num_anchors,
)

# Fresh classification head sized for NUM_CLASSES outputs per anchor
model.head.classification_head = RetinaNetClassificationHead(
    in_channels=in_features,
    num_anchors=num_anchors,
    num_classes=NUM_CLASSES,
    norm_layer=partial(torch.nn.GroupNorm, 32),
)

# The head created above is not yet on `device`; move the whole model again
model = model.to(device)

# Print the architecture for a visual check
print(model)

In [None]:
import evaluate

# Run the project's evaluation entry point on the student model built in the
# previous cell, using the dataset paths, class table, normalisation
# statistics and loader settings defined earlier in the notebook.
evaluate.main_function(
    DIR_TEST=DIR_TEST,
    DIR_IMAGES=DIR_IMAGES,
    DIR_ANNOTATIONS=DIR_ANNOTATIONS,
    IMG_RESIZE=IMG_RESIZE,
    SAVE_DIR=SAVE_DIR,
    CLASSES=CLASSES,
    NUM_CLASSES=NUM_CLASSES,
    PRIVILEGED_INFORMATION_DIRS=PRIVILEGED_INFORMATION_DIRS,
    NUM_CHANNELS=NUM_CHANNELS,
    img_means=img_means,
    img_stds=img_stds,
    ALL_PRIVILEGED_INFORMATION_DIRS=ALL_PRIVILEGED_INFORMATION_DIRS,
    model=model,
    BATCH_SIZE=batch_size,
    NUM_WORKERS=num_workers,
    model_name='RetinaNet',
)

# `pre_trained_model` is bound to the very same module object as `model`
# (nn.Module.to() returns the module itself), so deleting `model` alone would
# keep the whole network alive. Drop both names to actually release it.
del model, pre_trained_model

# Return the now-unused cached GPU blocks to the driver.
if torch.cuda.is_available():
    torch.cuda.empty_cache()