# Training Notebook #
This notebook is used to test the training of neural networks with PyTorch.

In [None]:
import torch, os, random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from PIL import Image, ImageDraw
from IPython.display import display
from tqdm.notebook import tqdm
from utils import non_max_iou_suppression, mean_average_precision

#### Hyperparameters ####

In [None]:
# Optional: uncomment to fix the random seed
#seed = 2023
#torch.manual_seed(2023)

# Architecture built when no saved model can be loaded: "small" selects SmallNet, any other value selects Net
model_choice = "small"
# Whether a model loaded from disk should be trained further. The training cell reads this flag,
# so it must be defined here (the training cell sets it to True when no saved model is found)
do_training = False

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

learning_rate = 2e-5
batch_size = 10 # the optimizer steps after every single image, so this only affects memory usage
weight_decay = 0
epochs = 150
num_workers = 0
pin_memory = True

# loss hyperparameters (they influence each other)
lambda_coord = 5      # weight of the bounding-box coordinate term of the loss (probably needs to be high)
lambda_no_obj = 0.05  # weight of the term that penalises detecting an object where there is none
# The class term and the objectness term (the counterpart of the no-object term) have a fixed weight of 1,
# so their relative importance still depends on lambda_coord and lambda_no_obj

# inference hyperparameters
confidence_threshold = 0.6 # strongly affects detection time and the detections themselves. If the no-object term is weak this must be low, otherwise high
iou_threshold = 0.7 # affects how many "duplicate" bounding boxes are merged during non-max suppression

dataset_path = "./data/RisikoDataset"

## Dataset ##
Set up the dataset class and the dataset objects. The `__getitem__` function of the dataset returns a tuple of two tensors: one with the image values (converted to float and scaled so that every pixel value lies in [0, 1)) and one with the ground-truth labels of the objects.
This last one is a tensor of dimensions [num_of_objects, 5] and it is structured like this: [x_center, y_center, width, height, obj_class].

Alternatively, `__getitem__` can be called with `mode_plot=True`, which changes the first element of the output: the image is returned in PIL format instead of as a tensor.

In [None]:
class RisikoDataset(torch.utils.data.Dataset):
    """Image/label dataset read from `<dataset_dir>/<mode>/images` and `<dataset_dir>/<mode>/labels`.

    Images and label files are paired by sorted order and must share the same
    base name. Every row of a label file is `class x_center y_center width height`,
    separated by single spaces.

    `__getitem__` returns `(image, labels)` where each label row is
    `[x_center, y_center, width, height, class]`.
    """

    # Label tensors are padded to this fixed number of rows so that all samples have the same shape
    MAX_OBJECTS = 300
    VALID_MODES = ("train", "val", "test", "real")

    def __init__(self, dataset_dir:str, mode:str, transform=None):
        """Index the files of one split.

        Args:
            dataset_dir: root folder of the dataset.
            mode: split to load, one of "train", "val", "test", "real".
            transform: optional callable applied to the PIL image after loading.

        Raises:
            Exception: on an invalid mode, or when images and labels do not pair up.
        """
        if mode not in self.VALID_MODES:
            raise Exception("Mode value of dataset not valid")

        self.imgs_dir = dataset_dir + "/" + mode + "/images"
        self.annots_dir = dataset_dir + "/" + mode + "/labels"

        self.annotations = sorted( filter( lambda x: os.path.isfile(os.path.join(self.annots_dir, x)), os.listdir(self.annots_dir) ) )
        self.images = sorted( filter( lambda x: os.path.isfile(os.path.join(self.imgs_dir, x)), os.listdir(self.imgs_dir) ) )
        self.transform = transform

        # Corners (x0, y0, x1, y1) of every cell of a 128x72 grid, row by row, as fractions of the image size.
        # NOTE(review): not read anywhere in this class; kept because other code may rely on the attribute.
        cols = torch.arange(0, 1, 1/128, dtype=torch.float32).repeat(72)
        rows = torch.arange(0, 1, 1/72, dtype=torch.float32).repeat(128, 1).t().flatten()
        offsets_1 = torch.stack([cols, rows]).t()
        cols_end = torch.arange(1/128, 1.0001, 1/128, dtype=torch.float32).repeat(72)
        rows_end = torch.arange(1/72, 1.0001, 1/72, dtype=torch.float32).repeat(128, 1).t().flatten()
        offsets_2 = torch.stack([cols_end, rows_end]).t()
        self.grid_boxes = torch.cat([offsets_1, offsets_2], 1)

        if len(self.annotations) != len(self.images):
            raise Exception("Number of annotations is different from the number of images")

        for i, (annotation_file, image_file) in enumerate(zip(self.annotations, self.images)):
            annotation_name = os.path.splitext(os.path.basename(annotation_file))[0]
            image_name = os.path.splitext(os.path.basename(image_file))[0]
            if annotation_name != image_name:
                raise Exception("Mismatch between images and annotations at id " + str(i) + ".   imgName = " + image_name + "   labelName = " + annotation_name)

    def __len__(self) -> int:
        """Return the number of samples of the split."""
        return len(self.images)

    @staticmethod
    def _load_annotations(path:str) -> torch.Tensor:
        """Read one label file into a float32 tensor of rows [x_center, y_center, width, height, class]."""
        raw = np.genfromtxt(fname=path, delimiter=' ', dtype=np.float32)
        # genfromtxt returns a 1-D array for a file holding a single object: force one row per object
        raw = raw.reshape(-1, 5)
        classes, bboxes = raw[:, 0:1], raw[:, 1:5]
        #classes[:] = 0 #to check if there are issues with classification
        return torch.cat([torch.from_numpy(bboxes), torch.from_numpy(classes)], 1)

    def __getitem__(self, idx:int, mode_plot:bool=False) -> tuple[torch.Tensor, torch.Tensor]:
        """Load sample `idx`.

        Args:
            idx: index of the sample.
            mode_plot: when True, return the (transformed) PIL image and the
                unpadded labels instead of tensors ready for training.

        Returns:
            `(image, labels)`. By default the image is a float32 tensor with
            values in [0, 1) and the labels are padded with rows of -1 up to
            `MAX_OBJECTS` rows.

        Raises:
            ValueError: when the label file holds more than `MAX_OBJECTS` objects.
        """
        basic_annotations = self._load_annotations(self.annots_dir + "/" + self.annotations[idx])

        # the context manager closes the file once the pixels have been copied by convert()
        with Image.open(self.imgs_dir + "/" + self.images[idx]) as img_file:
            img = img_file.convert("RGB")

        if self.transform: img = self.transform(img)

        if mode_plot: return img, basic_annotations

        img = transforms.PILToTensor()(img)

        # normalize image from 0 to 1 (255 / 256 is the largest value, hence the half-open range)
        img = img.to(torch.float32) / 256

        num_objects = basic_annotations.size(0)
        if num_objects > self.MAX_OBJECTS:
            raise ValueError(f"{self.annotations[idx]} contains {num_objects} objects, at most {self.MAX_OBJECTS} are supported")

        annotations = torch.full([self.MAX_OBJECTS, 5], -1, dtype=torch.float32)
        annotations[:num_objects, ...] = basic_annotations

        return img, annotations

def shorten_annotations_tensor(annnotations:torch.Tensor) -> torch.Tensor:
    """Remove the padding rows (rows made only of -1) from a label tensor.

    Args:
        annnotations: tensor whose last dimension holds the 5 label values.

    Returns:
        A tensor of shape [num_objects, 5] with the remaining rows.
    """
    # Decide row by row: an element-wise mask would also drop a real value that happens to be -1
    # and leave a number of elements that cannot be reshaped into rows of 5
    is_real_row = (annnotations != -1).any(dim=-1)
    return annnotations[is_real_row].reshape([-1,5])


# One dataset object per split found on disk
train_set = RisikoDataset(mode="train", dataset_dir=dataset_path)
val_set = RisikoDataset(mode="val", dataset_dir=dataset_path)
test_set = RisikoDataset(mode="test", dataset_dir=dataset_path)

# Training batches are shuffled and sized by the hyperparameters above
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_memory,
    drop_last=True,
)
# Validation and test images are served one at a time, in order, by a single worker
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=pin_memory,
    drop_last=True,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=pin_memory,
    drop_last=True,
)

### Dataset check ###
Print a random image with its bounding boxes to make sure that everything is working correctly.

In [None]:
def draw_bboxes_on_image(dataset: RisikoDataset, index:int):
    """Display sample `index` of `dataset` with its labelled boxes outlined in red.

    Afterwards the same sample is fetched again in the default mode and the
    resulting image tensor and padded label tensor are printed.
    """
    picture, annotations = dataset.__getitem__(index, mode_plot=True)
    annotations = shorten_annotations_tensor(annotations)

    # labels are fractions of the image size: convert to pixels of a 1280x720 image
    pixel_boxes = annotations[..., 0:4] * torch.tensor([1280,720,1280,720])
    canvas = ImageDraw.Draw(picture)

    # every row is (x_center, y_center, width, height): draw it from its two corners
    for center_x, center_y, width, height in pixel_boxes:
        left, right = center_x - width / 2, center_x + width / 2
        top, bottom = center_y - height / 2, center_y + height / 2
        canvas.rectangle([left, top, right, bottom], outline="red")

    display(picture)

    tensor_img, padded_labels = dataset.__getitem__(index)
    print(tensor_img)
    print(padded_labels)


draw_bboxes_on_image(train_set, random.randint(0, len(train_set)-1))


## Neural Network ##
Definition of the Neural Network

In [None]:
class Net(nn.Module):
    """Fully convolutional detector predicting one box per cell of a 128x72 grid.

    The output has 17 values per cell: 12 class scores (0-11), the object
    presence probability (12), the box center (13-14) and the box size (15-16).
    The fixed grid tensors require the convolution stack to produce a 72x128
    feature map, which is what a 720x1280 input yields.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, 3)
        self.conv2 = nn.Conv2d(64, 128, 3, 3)
        self.conv3 = nn.Conv2d(128, 256, (3,5))
        self.conv4 = nn.Conv2d(256, 512, (3,5))
        self.conv5 = nn.Conv2d(512, 512, (3,5))
        self.conv6 = nn.Conv2d(512, 256, 3)
        self.conv7 = nn.Conv2d(256, 64, 3, padding=1)
        self.finalConv = nn.Conv2d(64, 12+1+4, 3, padding=1) # 12 for classes, 1 for obj presence prob. and 4 for bbox

        # The grid tensors are registered as buffers so they follow the module on .to()/.cuda()
        # instead of being pinned to a global device. They are non-persistent to keep them
        # out of the state_dict, whose keys therefore do not change.

        # per-channel multiplier: the predicted center is shrunk to the size of one cell
        scale = torch.ones(17, dtype=torch.float32)
        scale[13:15] = torch.tensor([1/128, 1/72])
        self.register_buffer("scale", scale.expand(128*72,17), persistent=False)

        # per-cell offset: top-left corner of each cell (row by row), added to the center channels
        center_offset = torch.zeros([128*72,17], dtype=torch.float32)
        center_offset[..., 13:15] = torch.stack([torch.arange(0, 1, 1/128, dtype=torch.float32).repeat(72), torch.arange(0, 1, 1/72, dtype=torch.float32).repeat(128, 1).t().flatten()]).t()
        self.register_buffer("center_offset", center_offset, persistent=False)

    def forward(self, x):
        """Return the predictions with shape [..., 128*72, 17] for the image(s) `x`."""
        x = F.leaky_relu(self.conv1(x))
        x = F.leaky_relu(self.conv2(x))
        x = F.leaky_relu(self.conv3(x))
        x = F.leaky_relu(self.conv4(x))
        x = F.leaky_relu(self.conv5(x))
        x = F.leaky_relu(self.conv6(x))
        x = F.leaky_relu(self.conv7(x))
        x = torch.sigmoid(self.finalConv(x))
        # [..., 17, 72, 128] -> [..., 72*128, 17]: one row of 17 values per grid cell
        x = x.flatten(start_dim=-2)
        x = x.transpose(-2,-1)
        # the sigmoid keeps every center inside the cell that predicts it
        x = x * self.scale + self.center_offset

        return x

class SmallNet(nn.Module):
    """Lighter variant of the grid detector: fewer channels and one convolution less.

    The output has 17 values per cell of a 128x72 grid: 12 class scores (0-11),
    the object presence probability (12), the box center (13-14) and the box
    size (15-16). The fixed grid tensors require the convolution stack to
    produce a 72x128 feature map, which is what a 720x1280 input yields.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, 3)
        self.conv2 = nn.Conv2d(64, 128, 3, 3)
        self.conv3 = nn.Conv2d(128, 128, (3,5))
        self.conv4 = nn.Conv2d(128, 256, (3,5))
        self.conv5 = nn.Conv2d(256, 128, (3,5))
        self.conv6 = nn.Conv2d(128, 64, 3)
        #self.conv7 = nn.Conv2d(128, 64, 3, padding=1)
        self.finalConv = nn.Conv2d(64, 12+1+4, 3, padding=1) # 12 for classes, 1 for obj presence prob. and 4 for bbox

        # The grid tensors are registered as buffers so they follow the module on .to()/.cuda()
        # instead of being pinned to a global device. They are non-persistent to keep them
        # out of the state_dict, whose keys therefore do not change.

        # per-channel multiplier: the predicted center is shrunk to the size of one cell
        scale = torch.ones(17, dtype=torch.float32)
        scale[13:15] = torch.tensor([1/128, 1/72])
        self.register_buffer("scale", scale.expand(128*72,17), persistent=False)

        # per-cell offset: top-left corner of each cell (row by row), added to the center channels
        center_offset = torch.zeros([128*72,17], dtype=torch.float32)
        center_offset[..., 13:15] = torch.stack([torch.arange(0, 1, 1/128, dtype=torch.float32).repeat(72), torch.arange(0, 1, 1/72, dtype=torch.float32).repeat(128, 1).t().flatten()]).t()
        self.register_buffer("center_offset", center_offset, persistent=False)

    def forward(self, x):
        """Return the predictions with shape [..., 128*72, 17] for the image(s) `x`."""
        x = F.leaky_relu(self.conv1(x))
        x = F.leaky_relu(self.conv2(x))
        x = F.leaky_relu(self.conv3(x))
        x = F.leaky_relu(self.conv4(x))
        x = F.leaky_relu(self.conv5(x))
        x = F.leaky_relu(self.conv6(x))
        #x = F.leaky_relu(self.conv7(x))
        x = torch.sigmoid(self.finalConv(x))
        # [..., 17, 72, 128] -> [..., 72*128, 17]: one row of 17 values per grid cell
        x = x.flatten(start_dim=-2)
        x = x.transpose(-2,-1)
        # the sigmoid keeps every center inside the cell that predicts it
        x = x * self.scale + self.center_offset

        return x

# Untrained SmallNet instance on the selected device, used by the forward-pass check in the next cell
net = SmallNet().to(device)

### Testing forward function ###

In [None]:
# Smoke test: feed the first training image (unbatched) to the untrained network
img, labels = train_set[0]
labels = shorten_annotations_tensor(labels)
img = img.to(device)

output = net(img)
print(output.size())
# channels from 12 onwards: object presence probability followed by the 4 box values
print(output[..., 12:])

## Loss Function ##


In [None]:
class CustomLoss(nn.Module):
    """Sum-of-squares detection loss for the 128x72 grid predictions of a single image.

    loss = lambda_coord * box_loss + obj_loss + lambda_no_obj * no_obj_loss + class_loss

    Only the cells that contain the center of a labelled object contribute to
    the box, objectness and class terms; all the other cells contribute to the
    no-object term. Neither `predictions` nor `target` is modified.
    """

    def __init__(self, lambda_coord:float, lambda_no_obj:float):
        super(CustomLoss, self).__init__()
        self.lambda_coord = lambda_coord
        self.lambda_no_obj = lambda_no_obj
        self.mse = nn.MSELoss(reduction="sum")

        # number of grid cells along x and y; moved to the device of the inputs inside forward()
        self.scale_center = torch.tensor([128,72], dtype=torch.float32)

    def forward(self, predictions:torch.Tensor, target:torch.Tensor):
        """Compute the loss of one image.

        Args:
            predictions: network output with 17 values per cell; every leading
                dimension is flattened, giving 72*128 rows.
            target: labels of shape [num_objects, 5], rows being
                [x_center, y_center, width, height, class]. Must be on the same
                device as `predictions`.

        Returns:
            The scalar loss tensor.
        """
        predictions = predictions.flatten(end_dim=-2)
        grid = self.scale_center.to(device=target.device, dtype=target.dtype)

        # cell holding the center of each object; clamped so that a coordinate of exactly 1.0
        # falls into the last cell instead of indexing outside the grid
        cell = torch.mul(target[..., 0:2], grid).floor().long()
        flat_id = cell[..., 0].clamp(0, 127) + cell[..., 1].clamp(0, 71) * 128

        predicted_targets = predictions[flat_id]

        # ==================== #
        #       BOX LOSS       #
        # ==================== #
        # centers are compared in units of grid cells
        center_loss = self.mse(predicted_targets[..., 13:15] * grid, target[..., 0:2] * grid)
        # sizes are compared through their square root; the epsilon avoids the infinite
        # gradient of sqrt at 0 (its derivative is 1 / (2 * sqrt(x)))
        size_predictions = torch.sqrt(predicted_targets[..., 15:17] + 1e-6) * 10
        size_targets = torch.sqrt(target[..., 2:4]) * 10
        box_loss = center_loss + self.mse(size_predictions, size_targets)

        # ==================== #
        #       OBJ LOSS       #
        # ==================== #
        obj_confidence = predicted_targets[..., 12]
        obj_loss = self.mse(obj_confidence, torch.ones_like(obj_confidence))

        # ==================== #
        #     NO OBJ LOSS      #
        # ==================== #
        # select the empty cells with a mask rather than zeroing the caller's predictions in place
        is_empty_cell = torch.ones(predictions.size(0), dtype=torch.bool, device=predictions.device)
        is_empty_cell[flat_id] = False
        no_obj_confidence = predictions[is_empty_cell, 12]
        no_obj_loss = self.mse(no_obj_confidence, torch.zeros_like(no_obj_confidence))

        # ==================== #
        #      CLASS LOSS      #
        # ==================== #
        class_predictions = predicted_targets[..., :12]
        class_target = torch.zeros_like(class_predictions)
        class_target[torch.arange(0, class_target.size(0), device=class_target.device), target[..., 4].long()] = 1
        class_loss = self.mse(class_predictions, class_target)

        loss = self.lambda_coord * box_loss + obj_loss + self.lambda_no_obj * no_obj_loss + class_loss

        return loss

### Testing Loss function ###

In [None]:
loss_function = CustomLoss(lambda_coord=1.0, lambda_no_obj=1).cuda()

# two synthetic objects: [x_center, y_center, width, height, class]
emu_target = torch.tensor(
    [[0.1, 0.1, 0.0, 0.0, 10], [0.9999999, 0.9999999, 0.0, 0.0, 5]],
    dtype=torch.float32,
)

# index of the grid cell that holds each object center (same computation as in the loss)
scale_center = torch.tensor([128,72], dtype=torch.float32)
target_center_cell_id = (emu_target[..., 0:2] * scale_center).floor().int()
flat_id = (target_center_cell_id * torch.tensor([1,128], dtype=torch.int32)).sum(1)

# emulate a network output that predicts both objects: same box, presence 1, right class
emu_output = torch.zeros([128*72,12+1+4])
emu_output[flat_id, 13:17] = emu_target[:, 0:4]
emu_output[flat_id, 12] = 1
emu_output[flat_id, emu_target[:, 4].long()] = 1

# class scores in cells without objects: correct (no effect on the result since these cells do not contain any object)
emu_output[[0,1,2], [3,4,5]] = 1
#emu_output[[0,1,2], 12] = 1

emu_output = emu_output.to(device)
emu_target = emu_target.to(device)

#print(emu_output[flat_id])
#print(emu_target)

loss = loss_function(emu_output,emu_target)
print(loss)

## Training Function ##

In [None]:
def train_function(train_loader, model, optimizer, loss_function):
    """Train `model` for one epoch, taking one optimizer step per image.

    Args:
        train_loader: iterable yielding (images, padded labels) batches.
        model: network called on one image at a time.
        optimizer: optimizer stepped after every image.
        loss_function: callable returning the loss for (prediction, labels).

    Returns:
        The mean loss over all processed images, or None when the loader
        yielded no image.
    """
    loop = tqdm(train_loader, leave=True)
    losses = []

    for x, y in loop:
        x, y = x.to(device), y.to(device)

        # the batch is only a transfer unit: every image is optimised on its own
        for sample_img, sample_labels in zip(x, y):
            sample_labels = shorten_annotations_tensor(sample_labels)

            optimizer.zero_grad()
            loss = loss_function(model(sample_img), sample_labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        #update progress-bar
        if losses:
            loop.set_postfix(loss=losses[-1])

    # an empty loader (e.g. fewer samples than batch_size with drop_last=True) must not crash the run
    if not losses:
        print("No training sample was processed")
        return None

    mean_loss = sum(losses)/len(losses)
    print(f"Mean loss was {mean_loss}")
    return mean_loss

## Train ##

In [None]:
model_to_load = "models/Net_165epochs_5-05"
model_to_save = "models/latest_model"

if os.path.isfile(model_to_load):
    # The file stores the whole pickled module: weights_only=False is required on PyTorch versions
    # where it defaults to True, and map_location lets a model saved on GPU load on a CPU-only machine.
    # NOTE: unpickling can execute arbitrary code, only load files you created yourself.
    model = torch.load(model_to_load, map_location=device, weights_only=False)
    model.eval()
    # a loaded model is only trained further when the hyperparameters explicitly ask for it
    do_training = globals().get("do_training", False)
else:
    if model_choice == "small":
        model = SmallNet().to(device)
    else:
        model = Net().to(device)
    do_training = True

if do_training:
    # create the output folder first, so a missing directory cannot make the final save fail after training
    os.makedirs(os.path.dirname(model_to_save), exist_ok=True)

    model.train()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_function = CustomLoss(lambda_coord=lambda_coord,lambda_no_obj=lambda_no_obj).cuda()

    for epoch in range(epochs):
        print("Training Epoch " + str(epoch+1))
        train_function(train_loader, model, optimizer, loss_function)

        #if (epoch+1) % 15 == 0:
        #    model_name = "models/150epochs_" + str(epoch+1)
        #    torch.save(model, model_name)

    torch.save(model, model_to_save)
    model.eval()

### Evaluation Function ###

In [None]:
def get_boxes_without_low_conf(net_output:torch.Tensor, confidence_threshold:float) -> list[list]:
    """Turn the raw network output into a list of confident boxes.

    Rows whose object presence value (channel 12) is not above
    `confidence_threshold` are discarded; this is the first step of non-max
    suppression and is faster on tensors than on lists.

    Returns:
        One list per remaining box: [class, confidence, x0, y0, x1, y1].
    """
    threshold = torch.tensor(confidence_threshold, dtype=torch.float32)
    confident = net_output[net_output[..., 12] > threshold]

    centers = confident[..., 13:15]
    half_size = confident[..., 15:] * 0.5

    columns = [
        confident[..., 0:12].argmax(1).unsqueeze(1),  # keep only the best scoring class
        confident[..., 12:13],
        centers - half_size,                           # top-left corner
        centers + half_size,                           # bottom-right corner
    ]

    return torch.cat([column.to(torch.float32) for column in columns], dim=1).tolist()

def get_boxes_from_labels(labels:torch.Tensor) -> list[list]:
    """Convert label rows [x_center, y_center, width, height, class] into box lists.

    Returns:
        One list per label: [class, 1, x0, y0, x1, y1], i.e. the same layout as
        the predicted boxes with a confidence of 1.
    """
    centers = labels[..., 0:2]
    half_size = labels[..., 2:4] * 0.5
    class_ids = labels[..., 4:5]

    rows = torch.cat(
        [class_ids, torch.ones_like(class_ids), centers - half_size, centers + half_size],
        dim=1,
    )

    return rows.to(torch.float32).tolist()

def get_boxes(model, loader, iou_threshold:float, confidence_threshold:float) -> tuple[list,list]:
    """Collect predicted and ground-truth boxes for every image served by `loader`.

    Predictions are filtered by confidence and then passed through non-max
    suppression. Every returned box is prefixed with the running index of the
    image it belongs to.

    Returns:
        (all_pred_boxes, all_true_boxes), two lists of lists.
    """
    all_pred_boxes, all_true_boxes = [], []
    sample_id = 0

    for x, labels in loader:
        x = x.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(x)

        for sample_pred, sample_labels in zip(predictions, labels):
            true_bboxes = get_boxes_from_labels(shorten_annotations_tensor(sample_labels))
            candidates = get_boxes_without_low_conf(sample_pred, confidence_threshold)
            kept = non_max_iou_suppression(candidates, iou_threshold=iou_threshold)

            all_pred_boxes.extend([sample_id] + box for box in kept)
            all_true_boxes.extend([sample_id] + box for box in true_bboxes)

            sample_id += 1

    return all_pred_boxes, all_true_boxes

In [None]:
# get boxes into list of lists. Also apply nms on predicted values
#pred_boxes, target_boxes = get_boxes(model=model, loader=test_loader, iou_threshold=iou_threshold, confidence_threshold=confidence_threshold)

# perform mAP
#mean_avg_prec = mean_average_precision(pred_boxes, target_boxes, iou_threshold=iou_threshold, num_classes=12)
#print(f"Test mAP: {mean_avg_prec}")

In [None]:
# Show the detections of the model on one random image of the chosen dataset
dataset = test_set

index = random.randint(0, len(dataset)-1)
img, labels = dataset.__getitem__(index)
img = img.to(device)

# inference only: no autograd graph is needed
with torch.no_grad():
    prediction = model(img).to("cpu")

bboxes = get_boxes_without_low_conf(prediction, confidence_threshold)

# reshape keeps a [0, 6] tensor when nothing was detected, so the scaling below still broadcasts
bboxes = torch.tensor(non_max_iou_suppression(bboxes, iou_threshold=iou_threshold), dtype=torch.float32).reshape(-1, 6)

img, labels = dataset.__getitem__(index, mode_plot=True)
img_draw = ImageDraw.Draw(img)
# drop class and confidence, then convert the corners from fractions to pixels of a 1280x720 image
bboxes = bboxes[...,2:] * torch.tensor([1280,720,1280,720])

for x0, y0, x1, y1 in bboxes.tolist():
    img_draw.rectangle([x0, y0, x1, y1], outline="red")

display(img)

In [None]:
# Images of the "real" split are resized to 720x1280 (height x width) when they are loaded
img_resize = transforms.Compose([transforms.Resize([720,1280])])
real_set = RisikoDataset(mode="real", dataset_dir=dataset_path, transform=img_resize)
real_loader = torch.utils.data.DataLoader(
    real_set,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=pin_memory,
    drop_last=True,
)

def evaluate(model, dataset, dataloader):
    """Print the mAP of `model` on `dataloader` and display its detections on one random image of `dataset`.

    The global `iou_threshold` is used both for non-max suppression and for the
    mAP computation; `confidence_threshold` filters the predictions.
    """
    # get boxes into list of lists. Also apply nms on predicted values
    pred_boxes, target_boxes = get_boxes(model=model, loader=dataloader, iou_threshold=iou_threshold, confidence_threshold=confidence_threshold)

    # perform mAP
    mean_avg_prec = mean_average_precision(pred_boxes, target_boxes, iou_threshold=iou_threshold, num_classes=12)
    print(f"mAP: {mean_avg_prec}")

    index = random.randint(0, len(dataset)-1)
    img, labels = dataset.__getitem__(index)
    print(img.size())

    # inference only: no autograd graph is needed
    with torch.no_grad():
        prediction = model(img.to(device)).to("cpu")

    bboxes = get_boxes_without_low_conf(prediction, confidence_threshold=confidence_threshold)

    # reshape keeps a [0, 6] tensor when nothing was detected, so the scaling below still broadcasts
    bboxes = torch.tensor(non_max_iou_suppression(bboxes, iou_threshold=iou_threshold), dtype=torch.float32).reshape(-1, 6)

    img, labels = dataset.__getitem__(index, mode_plot=True)
    img_draw = ImageDraw.Draw(img)
    # drop class and confidence, then convert the corners from fractions to pixels of a 1280x720 image
    bboxes = bboxes[...,2:] * torch.tensor([1280,720,1280,720])

    for x0, y0, x1, y1 in bboxes.tolist():
        img_draw.rectangle([x0, y0, x1, y1], outline="red")

    display(img)

# Evaluate the model on the test split first, then on the resized "real" images
evaluate(model, test_set, test_loader)
evaluate(model, real_set, real_loader)