In [1]:
from PIL import Image
import os
import cv2
import numpy as np
import xml.etree.ElementTree as ET
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
import torch.nn.functional as F
import torchvision.ops as ops 
from torchvision.ops import roi_pool
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


In [2]:
data = {
    'filename': [],
    'width': [],
    'height': [],
    'class': [],
    'xmin': [],
    'ymin': [],
    'xmax': [],
    'ymax': []
}

In [3]:
def get_file_image_dimensions(file_path):
    if not os.path.isfile(file_path):
        return None, None
    with Image.open(file_path) as img:
        width, height = img.size
    return width, height

def get_xml_image_dimensions(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    size = root.find('size')
    if size is not None:
        width = size.find('width').text
        height = size.find('height').text
        if width and height:
            return int(width), int(height)
    return 0, 0  


def get_image_dimensions(xml_file, image_file_path):
    width, height = get_xml_image_dimensions(xml_file)
    
    if width == 0 or height == 0:
        width, height = get_file_image_dimensions(image_file_path)
        
    return width, height


def parse_xml(xml_file, image_file_path):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    filename = root.find('filename').text
    
    width, height = get_image_dimensions(xml_file, image_file_path)


    for obj in root.iter('object'):
        obj_class = obj.find('name').text
        bbox = obj.find('bndbox')
        xmin = int(bbox.find('xmin').text)
        ymin = int(bbox.find('ymin').text)
        xmax = int(bbox.find('xmax').text)
        ymax = int(bbox.find('ymax').text)

        data['filename'].append(filename)
        data['width'].append(width)
        data['height'].append(height)
        data['class'].append(obj_class)
        data['xmin'].append(xmin)
        data['ymin'].append(ymin)
        data['xmax'].append(xmax)
        data['ymax'].append(ymax)


In [4]:
class FruitDataset(Dataset):
    def __init__(self, data_dir, transforms=None, image_size=(224, 224)):
        self.data_dir = data_dir
        self.transforms = transforms
        self.image_size = image_size 
        
        self.images = [f for f in os.listdir(data_dir) if f.endswith('.jpg')]
        
        for image_file in self.images:
            xml_file = image_file.replace('.jpg', '.xml')
            xml_path = os.path.join(data_dir, xml_file)
            image_path = os.path.join(data_dir, image_file)
            if os.path.exists(xml_path):
                parse_xml(xml_path, image_path)
        
        self.dataframe = pd.DataFrame(data)

    def __len__(self):
        return len(self.images)
    
    def class_to_label(self, class_name):
        class_mapping = {'apple': 0, 'banana': 1, 'orange': 2}
        return class_mapping.get(class_name, 0) 
    
    def __getitem__(self, idx):
        image_name = self.images[idx]
        image_path = os.path.join(self.data_dir, image_name)

    # Завантажуємо зображення
        image = cv2.imread(image_path)
    
    # Перетворюємо в RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

        boxes = []
        labels = []
    
        image_data = self.dataframe[self.dataframe['filename'] == image_name]
        for _, row in image_data.iterrows():
            xmin = row['xmin']
            ymin = row['ymin']
            xmax = row['xmax']
            ymax = row['ymax']
            label = self.class_to_label(row['class'])
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(label)
    
        orig_height, orig_width = image.shape[:2]
    
    # Приведення всіх зображень до одного розміру
        image = cv2.resize(image, self.image_size)
    
    # Пропорційне масштабування bounding boxes
        scale_x = self.image_size[0] / orig_width
        scale_y = self.image_size[1] / orig_height
        boxes = [[xmin * scale_x, ymin * scale_y, xmax * scale_x, ymax * scale_y] for xmin, ymin, xmax, ymax in boxes]
    
        boxes = [[xmin / self.image_size[0], ymin / self.image_size[1], xmax / self.image_size[0], ymax / self.image_size[1]] for xmin, ymin, xmax, ymax in boxes]

        if self.transforms:
            transformed = self.transforms(image=image, bboxes=boxes, labels=labels)
            image = transformed['image']
            boxes = torch.as_tensor(transformed['bboxes'], dtype=torch.float32)

        labels = torch.as_tensor(labels, dtype=torch.int64)
    
        target = {"boxes": boxes, "labels": labels}
    
        return image, target

In [5]:
transform = A.Compose([
    A.Resize(224, 224),
    #A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(p=1.0),
], bbox_params=A.BboxParams(format='albumentations', label_fields=['labels']))

dataset = FruitDataset(data_dir='/Users/matvejzasadko/Downloads/All/Study/NNetworks/Lb1/archive/train_zip/train', transforms=transform, image_size=(224, 224))

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

for images, targets in dataloader:
    print(images[0].shape, targets[0]['boxes'])

torch.Size([3, 224, 224]) tensor([[0.0752, 0.2995, 0.8213, 0.7448],
        [0.3545, 0.2747, 0.9893, 0.9818],
        [0.0361, 0.2435, 0.7744, 0.6315]])
torch.Size([3, 224, 224]) tensor([[0.0623, 0.0806, 0.9466, 0.9403]])
torch.Size([3, 224, 224]) tensor([[0.5188, 0.3262, 0.9000, 0.7088],
        [0.1312, 0.5325, 0.4913, 0.8637],
        [0.2425, 0.3625, 0.5962, 0.6812]])
torch.Size([3, 224, 224]) tensor([[0.2594, 0.2350, 0.6450, 0.7517],
        [0.6187, 0.3083, 0.9862, 0.8117]])
torch.Size([3, 224, 224]) tensor([[0.5446, 0.0688, 0.9638, 0.5081]])
torch.Size([3, 224, 224]) tensor([[0.4953, 0.3741, 0.8453, 0.9281],
        [0.0750, 0.4365, 0.6578, 0.8825],
        [0.1266, 0.2566, 0.7125, 0.6115]])
torch.Size([3, 224, 224]) tensor([[0.0133, 0.5699, 0.2867, 0.8853],
        [0.2367, 0.5591, 0.4800, 0.8280],
        [0.1467, 0.1505, 0.9133, 0.4946],
        [0.6367, 0.6344, 0.9500, 0.9606]])
torch.Size([3, 224, 224]) tensor([[0.3267, 0.0757, 0.7040, 0.5917],
        [0.0015, 0.0688, 0.28



tensor([[0.1463, 0.2129, 0.4642, 0.7479],
        [0.5884, 0.3595, 0.8821, 0.7905],
        [0.2653, 0.4634, 0.5568, 0.8756],
        [0.7358, 0.3152, 0.9884, 0.7717]])
torch.Size([3, 224, 224]) tensor([[0.1100, 0.1026, 0.8800, 0.8462]])
torch.Size([3, 224, 224]) tensor([[0.5168, 0.1902, 0.9804, 0.9498]])
torch.Size([3, 224, 224]) tensor([[0.1233, 0.1182, 0.8490, 1.0000]])
torch.Size([3, 224, 224]) tensor([[0.3313, 0.2281, 0.7083, 0.7984]])
torch.Size([3, 224, 224]) tensor([[0.4593, 0.3480, 0.9971, 1.0000]])
torch.Size([3, 224, 224]) tensor([[0.0833, 0.1400, 0.9167, 0.9900]])
torch.Size([3, 224, 224]) tensor([[9.7173e-02, 4.3042e-01, 8.0654e-01, 9.1450e-01],
        [8.8339e-04, 5.8962e-04, 3.5954e-01, 3.6380e-01],
        [9.1873e-02, 2.3585e-03, 5.5124e-01, 3.2842e-01]])
torch.Size([3, 224, 224]) tensor([[0.1900, 0.4540, 0.5240, 0.8080],
        [0.4580, 0.4380, 0.8120, 0.7520],
        [0.3380, 0.2560, 0.6960, 0.5940]])
torch.Size([3, 224, 224]) tensor([[0.1625, 0.4916, 0.3675, 0.77



In [6]:
class YOLO(nn.Module):
    def __init__(self, in_channels=3, split_size=10, num_boxes=2, num_classes=3):
        super(YOLO, self).__init__()
        self.S = split_size
        self.B = num_boxes
        self.C = num_classes

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(192, 128, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            
            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            *[
                nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(256),
                nn.LeakyReLU(0.1),
                
                nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(512),
                nn.LeakyReLU(0.1)
            ] * 4,
            
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )

        self._calculate_conv_output()

        self.fcs = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.conv_output_size, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, self.S * self.S * (self.C + self.B * 5))
        )

    def _calculate_conv_output(self):
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 224, 224)
            conv_output = self.conv_layers(dummy_input)
            self.conv_output_size = conv_output.view(-1).size(0)

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fcs(x)
        return x

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

S, B, C = 7, 1, 3

model = YOLO(in_channels=3, split_size=S, num_boxes=B, num_classes=C).to(device)

dummy_input = torch.randn(1, 3, 224, 224).to(device)

output = model(dummy_input)
print(output.shape) 

torch.Size([1, 392])


In [8]:
def intersection_over_union(boxes_preds, boxes_labels, box_format='midpoint'):
 
    if box_format == 'midpoint':
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    if box_format == 'corners':
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4] 
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)
    
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
    
    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [10]:
class YoloLoss(nn.Module):
    def __init__(self, S=7, B=1, C=3, lambda_coord=5):
        super(YoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = lambda_coord

        self.mse = nn.MSELoss(reduction="sum")

    def forward(self, predictions, target):
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        if self.B >= 2:
            iou_b1 = intersection_over_union(predictions[..., self.C + 1:self.C + 5], target[..., self.C + 1:self.C + 5])
            iou_b2 = intersection_over_union(predictions[..., self.C + 6:self.C + 10], target[..., self.C + 1:self.C + 5])
            ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
            iou_maxes, bestbox = torch.max(ious, dim=0)
        else:
            iou_b1 = intersection_over_union(predictions[..., self.C + 1:self.C + 5], target[..., self.C + 1:self.C + 5])
            iou_maxes = iou_b1
            bestbox = torch.zeros_like(iou_b1, dtype=torch.bool)  
     
        exists_box = target[..., self.C].unsqueeze(3)

        box_predictions = exists_box * (
            (bestbox * predictions[..., self.C + 6:self.C + 10] if self.B >= 2 else predictions[..., self.C + 1:self.C + 5])
        )
        box_targets = exists_box * target[..., self.C + 1:self.C + 5]

        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )
ї
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :self.C], end_dim=-2),
            torch.flatten(exists_box * target[..., :self.C], end_dim=-2),
        )

        return class_loss, self.lambda_coord * box_loss


In [11]:
LEARNING_RATE = 0.00001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
WEIGHT_DECAY = 0.0003
EPOCHS = 20
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "model.pth"

S, B, C = 7, 1, 3

model = YOLO(in_channels=3, split_size=S, num_boxes=B, num_classes=C).to(DEVICE)
criterion = YoloLoss(S=S, B=B, C=C)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    total_class_loss = 0
    total_box_loss = 0

    for batch in dataloader:
        images, targets = batch  
        
        if isinstance(images, tuple):
            images = torch.stack(images)
        
        images = images.to(DEVICE)
        
        target_tensor = torch.zeros((len(images), S, S, C + B * 5), device=DEVICE)
        for i, target in enumerate(targets):
            for box, label in zip(target['boxes'], target['labels']):
                x_center = (box[0] + box[2]) / 2 * S  
                y_center = (box[1] + box[3]) / 2 * S
                grid_x, grid_y = int(x_center), int(y_center)

                target_tensor[i, grid_y, grid_x, :2] = torch.tensor([x_center - grid_x, y_center - grid_y], device=DEVICE)
                target_tensor[i, grid_y, grid_x, 2:4] = box[2:] - box[:2]
                target_tensor[i, grid_y, grid_x, 4] = 1.0  
                target_tensor[i, grid_y, grid_x, 5 + int(label.item())] = 1.0  

        predictions = model(images)
        
        class_loss, box_loss = criterion(predictions, target_tensor)
        
        optimizer.zero_grad()
        (class_loss + box_loss).backward()  
        optimizer.step()

        total_class_loss += class_loss.item()
        total_box_loss += box_loss.item()
    
    avg_class_loss = total_class_loss / len(dataloader)
    avg_box_loss = total_box_loss / len(dataloader)

    print(f"Epoch [{epoch+1}/{EPOCHS}], Class Loss: {avg_class_loss:.4f}, Box Loss: {avg_box_loss:.4f}")




Epoch [1/20], Class Loss: 1.9155, Box Loss: 25.0652




Epoch [2/20], Class Loss: 1.2205, Box Loss: 16.6648




Epoch [3/20], Class Loss: 1.0526, Box Loss: 15.2889




Epoch [4/20], Class Loss: 0.9167, Box Loss: 13.3029




Epoch [5/20], Class Loss: 0.8916, Box Loss: 13.1018




Epoch [6/20], Class Loss: 0.8692, Box Loss: 12.4108




Epoch [7/20], Class Loss: 0.8041, Box Loss: 11.1631




Epoch [8/20], Class Loss: 0.7315, Box Loss: 10.6619




Epoch [9/20], Class Loss: 0.7399, Box Loss: 9.4557




Epoch [10/20], Class Loss: 0.6507, Box Loss: 9.2914




Epoch [11/20], Class Loss: 0.6352, Box Loss: 9.1030




Epoch [12/20], Class Loss: 0.6046, Box Loss: 8.5568




Epoch [13/20], Class Loss: 0.6352, Box Loss: 8.0774




Epoch [14/20], Class Loss: 0.5534, Box Loss: 7.9955




Epoch [15/20], Class Loss: 0.5722, Box Loss: 7.6664




Epoch [16/20], Class Loss: 0.5458, Box Loss: 7.6368




Epoch [17/20], Class Loss: 0.5584, Box Loss: 6.9575




Epoch [18/20], Class Loss: 0.5322, Box Loss: 7.3211




Epoch [19/20], Class Loss: 0.5602, Box Loss: 7.4001




Epoch [20/20], Class Loss: 0.5671, Box Loss: 7.5399


In [12]:
def collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images, 0)  
    return images, targets 

test_dataset = FruitDataset(
    data_dir='/Users/matvejzasadko/Downloads/All/Study/NNetworks/Lb1/archive/test_zip/test', 
    transforms=transform,
    image_size=(224, 224)
)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

model.eval()  
total_correct_classifications = 0
total_objects = 0
total_box_loss = 0
num_batches = 0

with torch.no_grad():  
    for images, targets in test_loader:
        images = images.to(DEVICE)
        
        target_tensor = torch.zeros((len(images), S, S, C + B * 5), device=DEVICE)
        for i, target in enumerate(targets):
            for box, label in zip(target['boxes'], target['labels']):
                x_center = (box[0] + box[2]) / 2 * S  
                y_center = (box[1] + box[3]) / 2 * S
                grid_x, grid_y = int(x_center), int(y_center)

                target_tensor[i, grid_y, grid_x, :2] = torch.tensor([x_center - grid_x, y_center - grid_y], device=DEVICE)
                target_tensor[i, grid_y, grid_x, 2:4] = box[2:] - box[:2]
                target_tensor[i, grid_y, grid_x, 4] = 1.0  
                target_tensor[i, grid_y, grid_x, 5 + int(label.item())] = 1.0  

        predictions = model(images).view(-1, S, S, C + B * 5)
        
        for i in range(len(images)):
            for grid_y in range(S):
                for grid_x in range(S):
                    if target_tensor[i, grid_y, grid_x, 4] == 1: 
                        true_class = torch.argmax(target_tensor[i, grid_y, grid_x, 5:])
                        predicted_class = torch.argmax(predictions[i, grid_y, grid_x, 5:])
                        
                        if predicted_class == true_class:
                            total_correct_classifications += 1
                        total_objects += 1
        
        _, box_loss = criterion(predictions, target_tensor)
        total_box_loss += box_loss.item()
        num_batches += 1

accuracy = (total_correct_classifications / total_objects) * 100 if total_objects > 0 else 0
avg_box_loss = total_box_loss / num_batches

print(f"Test Accuracy: {accuracy:.2f}%, Test Box Loss: {avg_box_loss:.4f}")


Test Accuracy: 66.67%, Test Box Loss: 10.2388




Tuning

In [26]:
class YOLO_Tune(nn.Module):
    def __init__(self, in_channels=3, split_size=10, num_boxes=2, num_classes=3,
                 kernel_sizes=[7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3],
                 strides=[2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1],
                 paddings=[3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1],
                 num_filters=[64, 192, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024],
                 dropout_rate=0.5):
        super(YOLO_Tune, self).__init__()
        self.S = split_size
        self.B = num_boxes
        self.C = num_classes
        
        layers = []
        in_channels_current = in_channels
        input_size = 224  

        for i in range(len(kernel_sizes)):
            layers.append(nn.Conv2d(in_channels_current, num_filters[i], kernel_size=kernel_sizes[i], 
                                    stride=strides[i], padding=paddings[i]))
            layers.append(nn.BatchNorm2d(num_filters[i]))
            layers.append(nn.LeakyReLU(0.1))

            input_size = (input_size + 2 * paddings[i] - kernel_sizes[i]) // strides[i] + 1

            if input_size > 2 and (i + 1) % 2 == 0: 
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                input_size //= 2

            if input_size <= 1:
                print("Skipping additional layers to prevent 1x1 spatial dimensions.")
                break

            in_channels_current = num_filters[i]

        self.conv_layers = nn.Sequential(*layers)

        self._calculate_conv_output()

        self.fcs = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.conv_output_size, 4096),
            nn.Dropout(dropout_rate),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, self.S * self.S * (self.C + self.B * 5))
        )

    def _calculate_conv_output(self):
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 224, 224)
            conv_output = self.conv_layers(dummy_input)
            self.conv_output_size = conv_output.view(-1).size(0)

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fcs(x)
        return x


In [27]:
import random
import torch
import torch.optim as optim

param_grid = {
    'kernel_sizes': [[7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3], [5, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3]],
    'strides': [[2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]],
    'paddings': [[3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1], [2, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1]],
    'num_filters': [[64, 192, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024], [32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024]],
    'dropout_rate': [0.3, 0.5]
}

num_trials = 10
best_loss = float("inf")
best_params = None

EPOCHS = 3

num_trials = 10
best_loss = float("inf")
best_params = None

for trial in range(num_trials):
    params = {
        'kernel_sizes': random.choice([[7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3], [5, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3]]),
        'strides': random.choice([[2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]]),
        'paddings': random.choice([[3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1], [2, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1]]),
        'num_filters': random.choice([[64, 192, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024], [32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024]]),
        'dropout_rate': random.choice([0.3, 0.5])
    }

    model = YOLO_Tune(
        in_channels=3,
        split_size=S,
        num_boxes=B,
        num_classes=C,
        kernel_sizes=params['kernel_sizes'],
        strides=params['strides'],
        paddings=params['paddings'],
        num_filters=params['num_filters'],
        dropout_rate=params['dropout_rate']
    ).to(DEVICE)
    
    try:
        model._calculate_conv_output()
    except ValueError:
        print("Skipped configuration due to incompatible spatial dimensions.")
        continue  

    criterion = YoloLoss(S=S, B=B, C=C)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(EPOCHS):
        model.train()
        total_class_loss = 0
        total_box_loss = 0

        for batch in dataloader:
            images, targets = batch  
            
            if isinstance(images, tuple):
                images = torch.stack(images)
            
            images = images.to(DEVICE)
            
            target_tensor = torch.zeros((len(images), S, S, C + B * 5), device=DEVICE)
            for i, target in enumerate(targets):
                for box, label in zip(target['boxes'], target['labels']):
                    x_center = (box[0] + box[2]) / 2 * S  
                    y_center = (box[1] + box[3]) / 2 * S
                    grid_x, grid_y = int(x_center), int(y_center)

                    target_tensor[i, grid_y, grid_x, :2] = torch.tensor([x_center - grid_x, y_center - grid_y], device=DEVICE)
                    target_tensor[i, grid_y, grid_x, 2:4] = box[2:] - box[:2]
                    target_tensor[i, grid_y, grid_x, 4] = 1.0  
                    target_tensor[i, grid_y, grid_x, 5 + int(label.item())] = 1.0  

            predictions = model(images)
            
            class_loss, box_loss = criterion(predictions, target_tensor)
            
            optimizer.zero_grad()
            (class_loss + box_loss).backward()  
            optimizer.step()

            total_class_loss += class_loss.item()
            total_box_loss += box_loss.item()
        
        avg_class_loss = total_class_loss / len(dataloader)
        avg_box_loss = total_box_loss / len(dataloader)
        total_loss = avg_class_loss + avg_box_loss

        print(f"Trial [{trial+1}/{num_trials}], Epoch [{epoch+1}/{EPOCHS}], Class Loss: {avg_class_loss:.4f}, Box Loss: {avg_box_loss:.4f}, Total Loss: {total_loss:.4f}")

    if total_loss < best_loss:
        best_loss = total_loss
        best_params = params

print("Best parameters found:")
print(best_params)
print("Best total loss:", best_loss)

Skipping additional layers to prevent 1x1 spatial dimensions.




Trial [1/10], Epoch [1/3], Class Loss: 3.3731, Box Loss: 31.2203, Total Loss: 34.5934




Trial [1/10], Epoch [2/3], Class Loss: 1.7951, Box Loss: 20.5161, Total Loss: 22.3112




Trial [1/10], Epoch [3/3], Class Loss: 1.3063, Box Loss: 18.0510, Total Loss: 19.3572




Trial [2/10], Epoch [1/3], Class Loss: 2.2033, Box Loss: 23.1581, Total Loss: 25.3614




Trial [2/10], Epoch [2/3], Class Loss: 1.2882, Box Loss: 14.5840, Total Loss: 15.8722




Trial [2/10], Epoch [3/3], Class Loss: 0.9645, Box Loss: 11.9238, Total Loss: 12.8882




Trial [3/10], Epoch [1/3], Class Loss: 2.0121, Box Loss: 23.6266, Total Loss: 25.6387




Trial [3/10], Epoch [2/3], Class Loss: 1.0120, Box Loss: 12.8223, Total Loss: 13.8343




Trial [3/10], Epoch [3/3], Class Loss: 0.7606, Box Loss: 10.5672, Total Loss: 11.3278
Skipping additional layers to prevent 1x1 spatial dimensions.




Trial [4/10], Epoch [1/3], Class Loss: 2.1161, Box Loss: 28.1050, Total Loss: 30.2211




Trial [4/10], Epoch [2/3], Class Loss: 1.2176, Box Loss: 16.0849, Total Loss: 17.3025




Trial [4/10], Epoch [3/3], Class Loss: 0.9924, Box Loss: 13.8509, Total Loss: 14.8432
Skipping additional layers to prevent 1x1 spatial dimensions.




Trial [5/10], Epoch [1/3], Class Loss: 2.7224, Box Loss: 31.3687, Total Loss: 34.0910




Trial [5/10], Epoch [2/3], Class Loss: 1.7107, Box Loss: 20.3930, Total Loss: 22.1037




Trial [5/10], Epoch [3/3], Class Loss: 1.3340, Box Loss: 17.6060, Total Loss: 18.9400




Trial [6/10], Epoch [1/3], Class Loss: 1.8628, Box Loss: 22.4842, Total Loss: 24.3470




Trial [6/10], Epoch [2/3], Class Loss: 1.0108, Box Loss: 12.7928, Total Loss: 13.8037




Trial [6/10], Epoch [3/3], Class Loss: 0.7556, Box Loss: 10.1569, Total Loss: 10.9125




Trial [7/10], Epoch [1/3], Class Loss: 2.2052, Box Loss: 25.5661, Total Loss: 27.7713




Trial [7/10], Epoch [2/3], Class Loss: 1.2455, Box Loss: 15.3820, Total Loss: 16.6276




Trial [7/10], Epoch [3/3], Class Loss: 0.9895, Box Loss: 13.1469, Total Loss: 14.1364
Skipping additional layers to prevent 1x1 spatial dimensions.




Trial [8/10], Epoch [1/3], Class Loss: 2.1114, Box Loss: 26.5045, Total Loss: 28.6159




Trial [8/10], Epoch [2/3], Class Loss: 1.2101, Box Loss: 16.1041, Total Loss: 17.3142




Trial [8/10], Epoch [3/3], Class Loss: 0.9172, Box Loss: 12.8537, Total Loss: 13.7710




Trial [9/10], Epoch [1/3], Class Loss: 2.1078, Box Loss: 25.7052, Total Loss: 27.8130




Trial [9/10], Epoch [2/3], Class Loss: 1.3360, Box Loss: 16.4837, Total Loss: 17.8197




Trial [9/10], Epoch [3/3], Class Loss: 1.0679, Box Loss: 14.2637, Total Loss: 15.3315




Trial [10/10], Epoch [1/3], Class Loss: 1.6315, Box Loss: 21.7490, Total Loss: 23.3805




Trial [10/10], Epoch [2/3], Class Loss: 0.9003, Box Loss: 12.8504, Total Loss: 13.7507




Trial [10/10], Epoch [3/3], Class Loss: 0.6609, Box Loss: 10.4107, Total Loss: 11.0716
Best parameters found:
{'kernel_sizes': [7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3], 'strides': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1], 'paddings': [3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1], 'num_filters': [32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024], 'dropout_rate': 0.3}
Best total loss: 10.912496860325337


In [55]:
EPOCHS = 10
LEARNING_RATE = 0.0001
# model = YOLO_Tune(
#         in_channels=3,
#         split_size=S,
#         num_boxes=B,
#         num_classes=C,
#         kernel_sizes=[7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3],
#         strides=[2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1],
#         paddings=[3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1],
#         num_filters=[32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024],
#         dropout_rate=0.3
#     ).to(DEVICE)

model2 = YOLO_Tune(
        in_channels=3,
        split_size=S,
        num_boxes=B,
        num_classes=C,
        kernel_sizes=[7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3],
        strides=[2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1],
        paddings=[3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1],
        num_filters=[32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024],
        dropout_rate=0.3
    ).to(DEVICE)

In [56]:
criterion = YoloLoss(S=S, B=B, C=C)
optimizer = optim.Adam(model2.parameters(), lr=LEARNING_RATE)

In [57]:
for epoch in range(EPOCHS):
        model2.train()
        total_class_loss = 0
        total_box_loss = 0

        for batch in dataloader:
            images, targets = batch  
            
            if isinstance(images, tuple):
                images = torch.stack(images)
            
            images = images.to(DEVICE)
            
            target_tensor = torch.zeros((len(images), S, S, C + B * 5), device=DEVICE)
            for i, target in enumerate(targets):
                for box, label in zip(target['boxes'], target['labels']):
                    x_center = (box[0] + box[2]) / 2 * S  
                    y_center = (box[1] + box[3]) / 2 * S
                    grid_x, grid_y = int(x_center), int(y_center)

                    target_tensor[i, grid_y, grid_x, :2] = torch.tensor([x_center - grid_x, y_center - grid_y], device=DEVICE)
                    target_tensor[i, grid_y, grid_x, 2:4] = box[2:] - box[:2]
                    target_tensor[i, grid_y, grid_x, 4] = 1.0  
                    target_tensor[i, grid_y, grid_x, 5 + int(label.item())] = 1.0  

            predictions = model2(images)
            
            class_loss, box_loss = criterion(predictions, target_tensor)
            
            optimizer.zero_grad()
            (class_loss + box_loss).backward()  
            optimizer.step()

            total_class_loss += class_loss.item()
            total_box_loss += box_loss.item()
        avg_class_loss = total_class_loss / len(dataloader)
        avg_box_loss = total_box_loss / len(dataloader)
        total_loss = avg_class_loss + avg_box_loss

        print(f"Epoch [{epoch+1}/{EPOCHS}], Class Loss: {avg_class_loss:.4f}, Box Loss: {avg_box_loss:.4f}, Total Loss: {total_loss:.4f}")
        



Epoch [1/10], Class Loss: 1.2698, Box Loss: 30.8198, Total Loss: 32.0896




Epoch [2/10], Class Loss: 1.0786, Box Loss: 26.8031, Total Loss: 27.8817




Epoch [3/10], Class Loss: 0.8770, Box Loss: 20.1121, Total Loss: 20.9891




Epoch [4/10], Class Loss: 0.8538, Box Loss: 18.0183, Total Loss: 18.8721




Epoch [5/10], Class Loss: 0.6843, Box Loss: 13.8234, Total Loss: 14.5076




Epoch [6/10], Class Loss: 0.6485, Box Loss: 13.5138, Total Loss: 14.1623




Epoch [7/10], Class Loss: 0.6085, Box Loss: 11.6768, Total Loss: 12.2852




Epoch [8/10], Class Loss: 0.7660, Box Loss: 13.4749, Total Loss: 14.2409




Epoch [9/10], Class Loss: 0.6541, Box Loss: 10.9532, Total Loss: 11.6072




Epoch [10/10], Class Loss: 0.5667, Box Loss: 9.8841, Total Loss: 10.4508


In [58]:
import torch
from collections import defaultdict

def collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images, 0)  
    return images, targets 

test_dataset = FruitDataset(
    data_dir='/Users/matvejzasadko/Downloads/All/Study/NNetworks/Lb1/archive/test_zip/test', 
    transforms=transform,
    image_size=(224, 224)
)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

model2.eval()  
total_correct_classifications = 0
total_objects = 0
total_box_loss = 0
num_batches = 0

correct_per_class = defaultdict(int)
total_per_class = defaultdict(int)

with torch.no_grad():  
    for images, targets in test_loader:
        images = images.to(DEVICE)
        
        target_tensor = torch.zeros((len(images), S, S, C + B * 5), device=DEVICE)
        for i, target in enumerate(targets):
            for box, label in zip(target['boxes'], target['labels']):
                x_center = (box[0] + box[2]) / 2 * S  
                y_center = (box[1] + box[3]) / 2 * S
                grid_x, grid_y = int(x_center), int(y_center)

                target_tensor[i, grid_y, grid_x, :2] = torch.tensor([x_center - grid_x, y_center - grid_y], device=DEVICE)
                target_tensor[i, grid_y, grid_x, 2:4] = box[2:] - box[:2]
                target_tensor[i, grid_y, grid_x, 4] = 1.0  
                target_tensor[i, grid_y, grid_x, 5 + int(label.item())] = 1.0  

        predictions = model(images).view(-1, S, S, C + B * 5)
        
        for i in range(len(images)):
            for grid_y in range(S):
                for grid_x in range(S):
                    if target_tensor[i, grid_y, grid_x, 4] == 1: 
                        true_class = torch.argmax(target_tensor[i, grid_y, grid_x, 5:])
                        predicted_class = torch.argmax(predictions[i, grid_y, grid_x, 5:])
                        
                        if predicted_class == true_class:
                            total_correct_classifications += 1
                            correct_per_class[true_class.item()] += 1
                        total_objects += 1
                        total_per_class[true_class.item()] += 1
        
        _, box_loss = criterion(predictions, target_tensor)
        total_box_loss += box_loss.item()
        num_batches += 1

accuracy = (total_correct_classifications / total_objects) * 100 if total_objects > 0 else 0
avg_box_loss = total_box_loss / num_batches

print(f"Test Accuracy: {accuracy:.2f}%, Test Box Loss: {avg_box_loss:.4f}")

for class_id in range(C):
    if total_per_class[class_id] > 0:
        class_accuracy = (correct_per_class[class_id] / total_per_class[class_id]) * 100
        print(f"Accuracy for class {class_id}: {class_accuracy:.2f}%")
    else:
        print(f"Accuracy for class {class_id}: No samples")

Test Accuracy: 49.12%, Test Box Loss: 15.7766
Accuracy for class 0: 68.57%
Accuracy for class 1: 45.95%
Accuracy for class 2: 35.71%




Нажаль, модель типу YOLO не дуже справилась з завданням object detection, на такій маленькій кількості даних
Точність на тестових даних склала всього 66%
Після підбору гіперпараметрів схоже, відбувся overfitting, модель показала значно кращі результати на тренувальних даних, проте на тестових даних точністі стала меншою
Найбільше модель помиляється на даних класу 2(банани), це через те, що зображення бананівв найбільш різноманітні, і не вистачає картинок, щоб модель до цього пристосувалась
Найкращий сет гіперпараметрів {'kernel_sizes': [7, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3], 'strides': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1], 'paddings': [3, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1], 'num_filters': [32, 64, 128, 256, 256, 512, 512, 1024, 1024, 1024, 1024, 1024], 'dropout_rate': 0.3};

Kernel size - розмір згорткового ядра;   
Strides - крок згортки;  
Peddings - скільки пікселів буде додано навколо меж зображення перед застосуванням згортки;  
num_filters - скільки різних фільтрів буде застосовано до вхідного зображення або попереднього шару для виділення різних ознак;

Strides - визначає ймовірність "відключення" випадкових нейронів у шарі під час навчання, щоб запобігти перенавчанню;    
    
    
    