# Library imports



In [33]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader
from torchvision.ops import batched_nms

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython.display import display

from torch.amp import GradScaler
from torch.amp import autocast

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import cv2
from PIL import Image
import os
from tqdm import tqdm
import glob

# Config

In [34]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CHECKPOINT = "dr_bee_checkpoint.pth.tar"

IMAGE_SIZE=640
#ANCHORS는 image 단위에서 [0,1]의 값을 가짐
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]
STRIDE=[32, 16, 8]
GRID_SIZE=[ (IMAGE_SIZE//s) for s in STRIDE]
SCALED_ANCHORS = (
    torch.tensor(ANCHORS)
    * torch.tensor(GRID_SIZE).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
).to(DEVICE)

#LEARNING HYPERPARAMETER
EPOCHS=1
BATCH_SIZE=2#16
NUM_CLASSES = 3
LEARNING_RATE=1e-4
WEIGHT_DECAY=5e-4
NMS_THRESHOLD=0.2
CONFIDENCE_THRESHOLD=0.7

#LOSS HYPERPARAMETER
LAMBDA_COORD = 5.0
LAMBDA_NO_OBJ = 0.5


LOAD_MODEL=False
SAVE_MODEL=True

CLASS_LABEL={0:'normal',1:'mite',2:'virus'}

#DIRECTORY
IMG_TRAIN_DIR='./kaggle/input/dr-bee/images/train'
LABEL_TRAIN_DIR='./kaggle/input/dr-bee/labels/train'

IMG_TEST_DIR='./kaggle/input/dr-bee/images/val'
LABEL_TEST_DIR='./kaggle/input/dr-bee/labels/val'

OUTPUT_DIR='./kaggle/working/output_images'

# Dataset

## Dataset define

In [35]:
train_transform = A.Compose(
    [
        #이미 image는 640x640크기에 검은 패딩이 추가 된 상태로 들어옴

        # Random color jittering
        A.ColorJitter(
            brightness=0.5, contrast=0.5,
            saturation=0.5, hue=0.5, p=0.5
        ),
        #0.5 확률로 수평 반전
        A.HorizontalFlip(p=0.5),

        # 0.5 확률로 수직 반전
        A.VerticalFlip(p=0.5),

        #  # 랜덤 밝기 대비 조정
        # A.RandomBrightnessContrast(
        #     brightness_limit=0.2,
        #     contrast_limit=0.2, p=0.5
        # ),

        # Normalize
        A.Normalize(
            mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255
        ),
        # Convert the image to PyTorch tensor
        ToTensorV2()
    ],
    # Augmentation for bounding boxes
    bbox_params=A.BboxParams(
                    # yolo 형식은 x_center, y_center, widht, height로 넣어줘야함
                    format="yolo",
                    #가시성이 떨어지는 애들은 제거
                    min_visibility=0.4,
                    label_fields=[]
                )
)

# Transform for testing
# 오류가 뜰 수 있지만 크게 상관 없음
# 오류는 바운딩 박스를 처리하려는 옵션은 설정했지만, 이를 처리할 변환(transform)이 정의 되지 않아서 그런 것. 하지만 test data는 정규화 외에 어떤 작업도 하면 안됨
test_transform = A.Compose(
    [
        #이미 image는 640x640크기에 검은 패딩이 추가 된 상태로 들어옴

        # Normalize
        A.Normalize(
            mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255
        ),
        # Convert the image to PyTorch tensor
        ToTensorV2()
    ],
    # Augmentation for bounding boxes
    bbox_params=A.BboxParams(
                    format="yolo",
                    min_visibility=0.4,
                    label_fields=[]
                )
)

In [36]:
class YoloDataset(Dataset):
    def __init__(self, label_dir: str, img_dir: str, image_size: int = IMAGE_SIZE, anchors: list = ANCHORS,
                 grid_size: list = GRID_SIZE, num_classes: int = NUM_CLASSES, transform=None):
        self.label_dir = label_dir
        self.img_dir = img_dir
        self.image_size = image_size
        self.grid_size = grid_size
        self.num_classes = num_classes
        # Anchor Box 크기, 이미 normalized됨 [(w1, h1), (w2, h2), ...]
        self.anchors = anchors
        self.transform = transform
        self.image_ids = [os.path.splitext(os.path.basename(f))[0] for f in glob.glob(os.path.join(img_dir, '*'))
                          if os.path.splitext(f)[1].lower() in ['.jpg', '.png', '.jpeg']]

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        label_path = os.path.join(self.label_dir, f"{image_id}.txt")

        bboxes = np.roll(np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), shift=-1, axis=1).tolist()
        img_path = os.path.join(self.img_dir, f"{image_id}.jpg")
        image = np.array(Image.open(img_path).convert("RGB"))
        bboxes = [([min(max(coord, 1e-6), 1.0) for coord in bbox[:4]] + [bbox[4]]) for bbox in bboxes]

        if self.transform:
            try:
                transformed = self.transform(image=image, bboxes=bboxes)
                image = transformed["image"]
                bboxes = transformed["bboxes"]
            except ValueError as e:
                print(f"{image_id} 이미지에서 {e} 발생")
                image = A.Compose([A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
                                   ToTensorV2()])(image=image)["image"]

        targets = [torch.zeros((len(self.anchors[scale_idx]), size, size, 5 + self.num_classes)) for scale_idx, size in
                   enumerate(self.grid_size)]

        for bbox in bboxes:
            x_center, y_center, width, height, class_id = bbox
            for scale_idx, grid_size in enumerate(self.grid_size):
                grid_x = int(x_center * grid_size)
                grid_y = int(y_center * grid_size)

                anchor_boxes = torch.tensor(self.anchors[scale_idx], dtype=torch.float32)
                box_wh = torch.tensor([width, height], dtype=torch.float32)
                iou_scores = self.compute_iou(box_wh, anchor_boxes)
                best_anchor = torch.argmax(iou_scores)

                targets[scale_idx][best_anchor, grid_x, grid_y, 0] = 1  # Confidence
                targets[scale_idx][best_anchor, grid_x, grid_y, 1:5] = torch.tensor([
                    (x_center * grid_size) - grid_x,
                    (y_center * grid_size) - grid_y,
                    width * grid_size,
                    height * grid_size
                ])
                targets[scale_idx][best_anchor, grid_x, grid_y, (5 + int(class_id))] = 1

        return image, targets, image_id

    @staticmethod
    def compute_iou(box_wh, anchor_boxes):
        intersection = torch.clamp(
            torch.min(box_wh[0], anchor_boxes[:, 0]) * torch.min(box_wh[1], anchor_boxes[:, 1]), min=0.0)
        box_area = box_wh[0] * box_wh[1]
        anchor_areas = anchor_boxes[:, 0] * anchor_boxes[:, 1]
        union = box_area + anchor_areas - intersection
        return intersection / union


## Dataset TEST

In [37]:
# # 데이터셋 객체 생성
# check_dataset = YoloDataset(
#     label_dir=LABEL_TEST_DIR,
#     img_dir=IMG_TEST_DIR,
#     image_size=IMAGE_SIZE,
#     anchors=ANCHORS,
#     grid_size=GRID_SIZE,
#     num_classes=NUM_CLASSES,
#     transform=test_transform
# )
#
# # DataLoader 생성
# #batch size가 1
# check_dataset_loader = DataLoader(
#     check_dataset,
#     batch_size=1,
#     shuffle=True
# )
#
# check_data_iter = iter(check_dataset_loader)
# image, targets = next(check_data_iter)
# # targets = [#scale,batch_size, #anchor, grid_size,grid_size,[...]]
#
# """
# 이미지와 바운딩 박스를 시각화
# """
# fig, ax = plt.subplots(1)
# #batch size가 있기 때문에, image[0]을 해서 1번째 batch의 img를 가져와야함
# ax.imshow(image[0].permute(1, 2, 0))  # (C, H, W) -> (H, W, C)
#
# # 각 스케일에서 타겟 박스 그리기
# print("3개의 scale에서 모두 동일한 bbox의 이미지가 뜨면 성공!")
# figures = []
# for scale_idx in range(len(GRID_SIZE)):
#     #batch size가 1이므로 [scale_idx][0] 인덱스를 불러옴
#     target=targets[scale_idx][0]
#     # 타겟에서 활성화된 박스만 가져오기
#     active_boxes = torch.nonzero(target[..., 0])  # Confidence가 1인 경우
#     for box in active_boxes:
#         anchor_idx, grid_x, grid_y = box[:3]  # 앵커, 그리드 좌표 추출
#
#         cx, cy, w, h = target[anchor_idx, grid_x, grid_y, 1:5]  # 바운딩 박스 정보
#
#         # 실제 이미지 좌표로 변환
#         cx = (grid_x + cx.item()) * STRIDE[scale_idx]
#         cy = (grid_y + cy.item()) * STRIDE[scale_idx]
#         w = w.item() * STRIDE[scale_idx]
#         h = h.item() * STRIDE[scale_idx]
#         # 바운딩 박스 그리기
#         rect = patches.Rectangle(
#             (cx - w / 2, cy - h / 2), w, h,
#             linewidth=2, edgecolor="r", facecolor="none"
#         )
#         ax.add_patch(rect)
#     figures.append(fig)
# for fig in figures:
#     display(fig)
# plt.close()

# Model Architecture

## Blocks

In [38]:
# Basic Conv Block 정의
class CNNBlock(nn.Module):
    def __init__(self, in_channels:int, out_channels:int, kernel_size:int, stride:int=1,padding:int=0):
        super().__init__()

        self.conv = nn.Sequential(
            # TODO : (Conv, BatchNorm, LeakyReLU) 스펙 보고 구현
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=False,stride=stride,padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1)
        )

    def forward(self, x:torch.tensor):
        return self.conv(x)


# ResidualBlock 정의
class ResidualBlock(nn.Module):
    def __init__(self, in_channels:int, num_repeats:int=1): # residual block은 input channel 수와 output channel 수가 동일하다.
        super().__init__()
        res_layers=[]
        for _ in range(num_repeats):
            res_layers.append(nn.Sequential(
            CNNBlock(in_channels,in_channels//2,kernel_size=1,stride=1,padding=0),
            CNNBlock(in_channels//2,in_channels,kernel_size=3,stride=1,padding=1),
        ))
        self.layers = nn.ModuleList(res_layers)


    def forward(self, x:torch.tensor):
        for layer in self.layers:
            skip_connection = x
            x = layer(x)
            x+=skip_connection
        return x



# DarkNet53 정의
class Darknet53(nn.Module):
    def __init__(self,in_channels:int=3):
        super().__init__()
        # TODO : define darknet53 (위에서 정의한 Conv block과 Res block 활용)
        self.block1 = nn.Sequential(
            CNNBlock(in_channels, 32, kernel_size=3, stride=1, padding=1),
            CNNBlock(32, 64, kernel_size=3, stride=2, padding=1),
            ResidualBlock(64, num_repeats=1),
            CNNBlock(64, 128, kernel_size=3, stride=2, padding=1),
            ResidualBlock(128, num_repeats=2),
            CNNBlock(128, 256, kernel_size=3, stride=2, padding=1),
            ResidualBlock(256, num_repeats=8),
        )
        self.block2 = nn.Sequential(
            CNNBlock(256, 512, kernel_size=3, stride=2, padding=1),
            ResidualBlock(512, num_repeats=8),
        )
        self.block3 = nn.Sequential(
            CNNBlock(512, 1024, kernel_size=3, stride=2, padding=1),
            ResidualBlock(1024, num_repeats=4),
        )

    def forward(self, x:torch.tensor):
        # TODO : Darknet53에서 output으로 나오는 세가지 feature map 생산
        high_feature_map = self.block1(x)
        medium_feature_map = self.block2(high_feature_map)
        low_feature_map = self.block3(medium_feature_map)
        return high_feature_map, medium_feature_map, low_feature_map

class UpSampling(nn.Module):
    def __init__(self, in_channels:int, out_channels:int):
        super().__init__()

        self.upsample = nn.Sequential(
            # TODO : YOLO Network Architecture에서 Upsampling에 사용
            CNNBlock(in_channels, out_channels, kernel_size=1, stride=1, padding=0),
            nn.Upsample(scale_factor=2)
        )
    def forward(self, x:torch.tensor):
        return self.upsample(x)


class YoloBlock(nn.Module):
    def __init__(self,in_channels:int,out_channels:int):
        super().__init__()
        self.route_conv = nn.Sequential(
            # TODO : define route conv & output conv
            CNNBlock(in_channels, out_channels, kernel_size=1, stride=1, padding=0),
            CNNBlock(out_channels, out_channels*2, kernel_size=3, stride=1, padding=1),
            CNNBlock(out_channels*2, out_channels, kernel_size=1, stride=1, padding=0),
            CNNBlock(out_channels, out_channels*2, kernel_size=3, stride=1, padding=1),
            CNNBlock(out_channels*2, out_channels, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x:torch.tensor):
        route = self.route_conv(x)
        return route        #DetectionLayer로 전달


class DetectionLayer(nn.Module):
    def __init__(self, in_channels:int, num_classes:int):
        super().__init__()
        self.num_classes = num_classes
        # TODO : YOLO Network에서 output 된 결과를 이용하여 prediction

        self.pred=nn.Sequential(
            CNNBlock(in_channels, in_channels*2, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels*2,(num_classes+5)*3 , kernel_size=1,stride=1,padding=0)
        )

    def forward(self, x:torch.tensor):
        output = self.pred(x)
        # [batch size, (#bounding box) * (predicted bounding box), cell_width, cell_height ]
        output = output.view(x.size(0), 3, self.num_classes + 5, x.size(2), x.size(3))
        # [batch size, #bounding box,predicted bounding box, cell_width, cell_height ]
        output = output.permute(0, 1, 3, 4, 2)
        # [batch size, 동일한 scale에서의 #anchor box, grid_cell_size, grid_cell_size ,[confidence, grid_cell_x_center, grid_cell_y_center, grid_cell_width,grid_cell_height,P(class0), P(class1),P(class2)]]
        return output

## yolov3 architecture

In [39]:
class YOLOv3(nn.Module):
    def __init__(self, in_channels:int=3,num_classes:int= 3):
        super().__init__()

        self.num_classes = num_classes

        self.darknet = Darknet53(in_channels=in_channels)

        self.yolo_block_01 = YoloBlock(1024,512)
        self.detectlayer_01 = DetectionLayer(512, num_classes)
        self.upsample_01 = UpSampling(512, 256)

        # input_channels : darknet53 feature map 02 채널(512) + upsampling 채널(256)
        self.yolo_block_02 = YoloBlock(512 + 256, 256)
        self.detectlayer_02 = DetectionLayer(256, num_classes)
        self.upsample_02 = UpSampling(256, 128)

        # input_channels : darknet53 feature map 01 채널(256) + upsampling 채널(128)
        self.yolo_block_03 = YoloBlock(256 + 128, 128)
        self.detectlayer_03 = DetectionLayer(128, num_classes)

    def forward(self, x:torch.tensor):
        high_feature_map, medium_feature_map, low_feature_map =self.darknet(x)

        x= self.yolo_block_01(low_feature_map)
        output_01 = self.detectlayer_01(x)
        x = self.upsample_01(x)

        x = self.yolo_block_02(torch.cat([x,medium_feature_map], dim=1))
        output_02 = self.detectlayer_02(x)
        x = self.upsample_02(x)

        x = self.yolo_block_03(torch.cat([x, high_feature_map], dim=1))
        output_03 = self.detectlayer_03(x)

        return output_01, output_02, output_03

## Model Architecture TEST

In [40]:
# # Creating model and testing output shapes
# test_model = YOLOv3(num_classes=NUM_CLASSES)
# x = torch.randn((1, 3, IMAGE_SIZE, IMAGE_SIZE))
# out = test_model(x)
# print(out[0].shape)
# print(out[1].shape)
# print(out[2].shape)
#
# # Asserting output shapes
# assert test_model(x)[0].shape == (1, 3, IMAGE_SIZE // 32, IMAGE_SIZE // 32, NUM_CLASSES + 5) # B, RGB, cell size, cell size, (c, x, y, w, h) + classes_prob
# assert test_model(x)[1].shape == (1, 3, IMAGE_SIZE // 16, IMAGE_SIZE // 16, NUM_CLASSES + 5)
# assert test_model(x)[2].shape == (1, 3, IMAGE_SIZE // 8, IMAGE_SIZE // 8, NUM_CLASSES + 5)
# print("Output shapes are correct!")
#
# # torch summary
# summary(test_model, input_size=(2, 3, IMAGE_SIZE, IMAGE_SIZE), device=DEVICE)

# Define Util & Loss function
참고 자료 : https://www.geeksforgeeks.org/yolov3-from-scratch-using-pytorch/

## Utils

In [41]:
def giou(box1:torch.tensor, box2:torch.tensor)->torch.tensor:
    # [x_center,y_center,width,height]
    # batch를 고려하여 차원 유지
    box1_x1 = (box1[..., 0:1] - box1[..., 2:3]) * 0.5
    box1_y1 = (box1[..., 1:2] - box1[..., 3:4]) * 0.5
    box1_x2 = (box1[..., 0:1] + box1[..., 2:3]) * 0.5
    box1_y2 = (box1[..., 1:2] + box1[..., 3:4]) * 0.5

    box2_x1 = (box2[..., 0:1] - box2[..., 2:3]) * 0.5
    box2_y1 = (box2[..., 1:2] - box2[..., 3:4]) * 0.5
    box2_x2 = (box2[..., 0:1] + box2[..., 2:3]) * 0.5
    box2_y2 = (box2[..., 1:2] + box2[..., 3:4]) * 0.5

    #intersection 계산
    inter_x1 = torch.max(box1_x1, box2_x1)
    inter_x2 = torch.min(box1_x2, box2_x2)
    inter_y1 = torch.max(box1_y1, box2_y1)
    inter_y2 = torch.min(box1_y2, box2_y2)

    inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
    box1_area = (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
    box2_area = (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
    union_area = box1_area + box2_area - inter_area

    iou = inter_area / torch.clamp(union_area, min=1e-6)

    enclosing_x1 = torch.min(box1_x1, box2_x1)
    enclosing_y1 = torch.min(box1_y1, box2_y1)
    enclosing_x2 = torch.max(box1_x2, box2_x2)
    enclosing_y2 = torch.max(box1_y2, box2_y2)
    enclosing_area = torch.clamp(enclosing_x2 - enclosing_x1, min=0) * torch.clamp(enclosing_y2 - enclosing_y1, min=0)

    giou = iou - (enclosing_area - union_area) / torch.clamp(enclosing_area, min=1e-6)

    return giou

In [42]:
def convert_cells_to_bboxes(pred :torch.tensor ,scaled_anchors:torch.tensor ):
    #Scale은 벗겨진 상태로 들어옴
    #[#batch,#anchor, grid_size, grid_size,[confidence, x, y, width, height, P(class1),P(class2),P(class3)]]
    batch_size = pred.shape[0]
    grid_size = pred.shape[2] #혹은 pred.shape[3]
    num_anchors = len(scaled_anchors)
    scaled_anchors = scaled_anchors.to(pred.device).reshape(1, num_anchors, 1, 1, 2)


    box_predictions = pred[..., 1:5]

    box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
    box_predictions[..., 2:4] = torch.exp(box_predictions[..., 2:4]) * scaled_anchors

    pred_confidence = torch.sigmoid(pred[..., 0:1])
    pred_best_class = torch.argmax(pred[..., 5:], dim=-1).unsqueeze(-1)

    #0~(grid_size-1)의 값을 같는 tensor 생성
    grid_indices = torch.arange(grid_size, device=pred.device).repeat(batch_size, num_anchors, grid_size, 1).unsqueeze(-1)

    #grid_size로 나눠줘서 normalized 시킴
    pred_x_center = (box_predictions[..., 0:1] + grid_indices)/grid_size
    pred_y_center = (box_predictions[..., 1:2] + grid_indices.permute(0, 1, 3, 2, 4))/grid_size
    pred_width_height = box_predictions[..., 2:4]/grid_size

    converted_bboxes = torch.cat((pred_confidence, pred_x_center, pred_y_center, pred_width_height,pred_best_class), dim=-1)

    # Grid Cell이 의미가 없으므로 중간을 풀어줌
    converted_bboxes = converted_bboxes.reshape(batch_size, num_anchors * grid_size * grid_size, 6)

    return converted_bboxes


In [43]:
def convert_coordinate_yolo_to_pascal(yolo_cordi:torch.tensor,img_size:int=IMAGE_SIZE):
    """
    yolo_cordi의 formatting
    [(#scale)*(#anchors)*grid_size*grid_size, [x_center,y_center,width,height]]
    """
    img_scale_yolo_cordi = yolo_cordi[...,0:4]*IMAGE_SIZE
    x_y_1= img_scale_yolo_cordi[...,0:2] - img_scale_yolo_cordi[...,2:4]/2
    x_y_2= img_scale_yolo_cordi[...,0:2] + img_scale_yolo_cordi[...,2:4]/2
    result_cordi=torch.cat((x_y_1,x_y_2),dim=-1)
    return result_cordi

## Model checkpoint

In [44]:
def save_checkpoint(model, optimizer, filename = CHECKPOINT):
    print("==> Saving checkpoint")
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)

# Function to load checkpoint
def load_checkpoint(checkpoint_file, model, optimizer, lr=LEARNING_RATE, device=DEVICE):
    print("==> Loading checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

## Loss function

In [45]:
class YoloLoss(nn.Module):
    def __init__(self, lambda_coord:float = LAMBDA_COORD, lambda_no_obj:float=LAMBDA_NO_OBJ):
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_no_obj = lambda_no_obj
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()
        self.cross_entropy = nn.CrossEntropyLoss()
        self.sigmoid = nn.Sigmoid()

    def forward(self, pred:torch.tensor, target:torch.tensor, scaled_anchors:torch.tensor):
        # Identifying which cells in target have objects and which have no objects
        obj = target[..., 0] == 1
        no_obj = target[..., 0] == 0

        # Calculating No object loss
        no_object_loss = self.bce(
            (pred[..., 0:1][no_obj]), (target[..., 0:1][no_obj]),
        )

        # Reshaping anchors to match predictions
        # bw, bh를 계산하기 위해서는 pw, ph 값이 필요하다.
        num_anchors = len(scaled_anchors)
        scaled_anchors = scaled_anchors.to(pred.device).reshape(1, num_anchors, 1, 1, 2)

        # 모델을 통해서 Predict된 결과는 tx, ty, tw, th 이르모 bx, by, bw, bh fromat으로 맞춰준다.
        # bx = sigmoid(tx) + cx 이지만 cx term을 여기서 더해줄 필요는 없다
        # 이는 상대 좌표를 기반으로 손실 계산을 하기 때문이며 계산이 이미 그리드 셀 내 상대적 offset을 다루는데 초점이 맞춰져 있기 때문이다.
        box_preds = torch.cat([self.sigmoid(pred[..., 1:3]),
                               torch.exp(pred[..., 3:5]) * scaled_anchors], dim = -1)

        # Calculating IoU for prediction and target
        # iou_term = torchvision.ops.box_iou(box_preds[obj], target[..., 1:5][obj]).detach()

        # torchvision.ops.box_iou는 box format을 (x1, y1, x2, y2)로 받기 때문에 (cx, cy, w, h)로 처리한 우리 tensor를 바로 넣을 수는 없다
        # 해결방식 1. iou 직접 짜기
        # 해결방식 2. formatting 해서 집어넣기
        iou_term = giou(box_preds[obj], target[..., 1:5][obj]).detach()

        # Calculating Object loss
        # 논문에서는 object loss를 계산할 때 존재/비존재 모두 BCE를 쓴다고 나와있지만
        # 실제로는 object가 존재할 경우 iou_term을 곱해주고 MSE 방식을 씀으로써
        # 수렴 안정성, iou를 곱함으로써 객체의 정확한 위치와 겹침 정도 고려, 계산의 효율성을 챙길 수 있다.
        # pred에 sigmoid를 씌우는 이유는 0과 1 사이 값으로 조절하기 위해서이다.
        # target은 존재하는 경우 1로 고정이므로 iou_term을 곱해주면 0~1 사이의 값이 된다.
        object_loss = self.mse(self.sigmoid(pred[..., 0:1][obj]),
                               iou_term * target[..., 0:1][obj])

        # Predicted box coordinates
        # bx = sigmoid(tx)
        pred[..., 1:3] = self.sigmoid(pred[..., 1:3])

        # Target box coordinates
        # bw = pw * exp(tw) 이므로 tw = log(pw / bw)
        target[..., 3:5] = torch.log(torch.clamp(target[..., 3:5] / scaled_anchors, min=1e-6) )

        # Calculating box coordinates
        # 논문에서는 box coordinate loss를 계산할 때 'sum of squared error'를 사용한다고 나와있다.
        # pred[...,1:5] format은 [tx, ty, tw, th]이고 target[..., 1:5] format은 [bx, by, bw, bh]이므로
        # format을 통일 시켜줘야 한다.
        # 논문에서는 target에 역함수를 취하여 pred[..., 1:5] format에 맞추어 계산한 것 같으나 위의 처리 후 아래 계산과 동일하다.
        # 우리 format은 [bx, by, tw, th]로 맞춰주고 loss 계산
        box_loss = self.mse(pred[..., 1:5][obj], target[..., 1:5][obj])

        # Calculating class loss
        # 논문에서는 아래와 같이 softmax(cross-entropy에서 사용됨)를 사용해보았을때 좋은 성능이 안나온다고 말했다.
        # class_loss = self.ce((pred[..., 5:][obj]), target[..., 5:][obj].long())
        # 논문을 따라가기 위해 우리도 logistic classifier(BCE에서 사용)을 사용해보자
        class_loss = self.bce(pred[..., 5:][obj], target[..., 5:][obj].float())


        return (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_no_obj * no_object_loss
            + class_loss
        )

# MODEL TRAIN

In [46]:
def training_loop(loader, model, optimizer, loss_fn, scaler, scaled_anchors:torch.tensor=SCALED_ANCHORS):
    # Creating a progress bar
    # tqdm 통해서 진행도 표시
    progress_bar = tqdm(loader, leave=True)

    # Initializing a list to store the losses
    # iteration 당 loss 값 기록
    losses = []

    # Iterating over the training data
    for _, (images, targets, _) in enumerate(progress_bar):
        # images는 이미지
        # target은 grid_cell 단위의 label 데이터
        # 이 두 개를 GPU나 CPU에 옮기기
        images = images.to(DEVICE)
        targets = [t.to(DEVICE) for t in targets]

        # autocast()를 사용할 경우 모델이 더 빠르고 효율적으로 계산하도록 자동으로 숫자 크기를 줄여줌
        with autocast(DEVICE):
            # 만든 모델에 이미지 입력데이터(image)를 넣고 얻어낸 결과 <- grid_cell 단위, 크기 별로 pred[0], pred[1], pred[2] 존재
            pred  = model(images)

            # Calculating the loss at each scale
            # 모델의 loss를 scale별로 계산하고 더하기
            loss = 0
            for i in range(3):
                loss += loss_fn(pred[i], targets[i], scaled_anchors[i])

        # Add the loss to the list
        losses.append(loss.item())

        # Reset gradients
        optimizer.zero_grad()

        """
        - 아래부터 사용되는 scaler는 torch.cuda.amp.GradScaler로
        모델 훈련을 더 빠르고 효율적으로 하기 위해 사용
        - 혼합 정밀도 문제를 해결
        """
        # Backpropagate the loss
        # 손실 값 스케일링을 통해서 float 16의 소수점 손실 문제 해결
        scaler.scale(loss).backward()

        # Optimization step
        # 스케일링된 손실 값을 기반으로 모델 가중치 업데이트
        scaler.step(optimizer)

        # Update the scaler for next iteration
        # 스케일링 값 조정
        scaler.update()

        # update progress bar with loss
        # 손실 값들의 평균 계산
        mean_loss = sum(losses) / len(losses)
        # 진행 바에 평균 손실 값 출력
        progress_bar.set_postfix(loss=mean_loss)

In [47]:
# # 데이터셋 객체 생성
# train_dataset = YoloDataset(
#     label_dir=LABEL_TRAIN_DIR,
#     img_dir=IMG_TRAIN_DIR,
#     image_size=IMAGE_SIZE,
#     anchors=ANCHORS,
#     grid_size=GRID_SIZE,
#     num_classes=NUM_CLASSES,
#     transform=train_transform
# )
#
# # DataLoader 생성
# train_loader = DataLoader(
#     train_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=True
# )
#
# model = YOLOv3(in_channels=3,num_classes=NUM_CLASSES).to(DEVICE)
#
# optimizer = optim.AdamW (model.parameters(), lr = LEARNING_RATE, weight_decay = WEIGHT_DECAY)
#
# loss_fn = YoloLoss(LAMBDA_COORD,LAMBDA_NO_OBJ)
# scaler = GradScaler()
#
# for e in range(1, EPOCHS+1):
#     print("Epoch:", e)
#     training_loop(
#         loader=train_loader,
#         model=model,
#         optimizer=optimizer,
#         loss_fn=loss_fn,
#         scaler=scaler,
#         scaled_anchors=SCALED_ANCHORS
#     )
#
#     # Saving the model
#     if SAVE_MODEL:
#         save_checkpoint(model, optimizer, filename=CHECKPOINT)

# MODEL TEST


In [48]:
# Setting the load_model to True
LOAD_MODEL = True

# Defining the model, optimizer, loss function, and scaler
model = YOLOv3(in_channels=3, num_classes=NUM_CLASSES).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_fn = YoloLoss()
scaler = GradScaler()

# Loading the checkpoint if needed
if LOAD_MODEL:
    load_checkpoint(CHECKPOINT, model, optimizer, LEARNING_RATE)

# Defining the test dataset and data loader
test_dataset = YoloDataset(
    label_dir=LABEL_TEST_DIR,
    img_dir=IMG_TEST_DIR,
    image_size=IMAGE_SIZE,
    anchors=ANCHORS,
    grid_size=GRID_SIZE,
    num_classes=NUM_CLASSES,
    transform=test_transform,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create the directory if it doesn't exist

#BatchNorm에서 평가 모드와, 학습모드가 다름# Set the model to evaluation mode
# Set the model to evaluation mode
model.eval()
with torch.no_grad():
    for batch_idx, (images, targets, image_ids) in enumerate(test_loader):
        # Move the input to the appropriate device
        images = images.to(DEVICE)

        # Getting the model predictions
        preds = model(images)

        # Getting bounding boxes for each scale
        total_bboxes = []
        for scale_idx in range(3):
            boxes_by_scale = convert_cells_to_bboxes(preds[scale_idx], SCALED_ANCHORS[scale_idx])
            total_bboxes.append(boxes_by_scale)

        # Concatenate boxes across scales
        total_bboxes = torch.cat(total_bboxes, dim=1)

        for batch in range(images.shape[0]):
            # Applying non-max suppression to remove overlapping bounding boxes
            pascal_coords = convert_coordinate_yolo_to_pascal(total_bboxes[batch, ..., 1:5])
            scores = total_bboxes[batch, ..., 0]
            class_idxs = total_bboxes[batch, ..., 5]

            nms_output_indices = batched_nms(
                boxes=pascal_coords,
                scores=scores,
                idxs=class_idxs,
                iou_threshold=NMS_THRESHOLD
            )

            # Filtering the boxes after NMS and applying confidence threshold
            nms_output_bboxes = pascal_coords[nms_output_indices]
            nms_scores = scores[nms_output_indices]
            nms_class_idxs = class_idxs[nms_output_indices]

            # Converting images for OpenCV
            image_np = images[batch].permute(1, 2, 0).detach().cpu().numpy()  # Convert to [H, W, C]
            image_np = (image_np * 255).astype(np.uint8)  # Convert to uint8 format (0-255 range)

            # Ensure color channels are in BGR format for OpenCV
            try:
                if image_np.shape[-1] == 3:
                    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
                else:
                    print(f"Unexpected number of channels in image: {image_np.shape[-1]}")
                    continue
            except Exception as e:
                print(f"Error converting color format: {e}")
                continue

            # Plotting bounding boxes on the image
            for bbox, score, class_idx in zip(nms_output_bboxes, nms_scores, nms_class_idxs):
                if score < CONFIDENCE_THRESHOLD:
                    continue
                x1, y1, x2, y2 = map(int, bbox)
                if int(class_idx) == 0:
                    color = (0, 255, 0)
                elif int(class_idx) == 1:
                    color = (0, 0, 255)
                elif int(class_idx) == 2:
                    color = (255, 0, 0)
                else:
                    color = (255, 255, 255)

                cv2.rectangle(image_np, (x1, y1), (x2, y2), color, 2)

                text = f"{CLASS_LABEL[int(class_idx)]}: {score:.2f}"
                cv2.putText(image_np, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

            # Save the image to /output_images directory with the original image ID in the filename
            output_filename = f"{image_ids[batch]}.jpg"
            output_path = os.path.join(OUTPUT_DIR, output_filename)

            # Attempt to save the image and provide detailed error information
            try:
                save_status = cv2.imwrite(output_path, image_np)
                if not save_status:
                    print(f"Failed to save: {output_path}")
                    print(f"Image shape: {image_np.shape}, dtype: {image_np.dtype}, path: {output_path}")
            except Exception as e:
                print(f"Error saving {output_path}: {e}")

# Set the model back to training mode
model.train()


==> Loading checkpoint


  checkpoint = torch.load(checkpoint_file, map_location=device)


YOLOv3(
  (darknet): Darknet53(
    (block1): Sequential(
      (0): CNNBlock(
        (conv): Sequential(
          (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): LeakyReLU(negative_slope=0.1)
        )
      )
      (1): CNNBlock(
        (conv): Sequential(
          (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): LeakyReLU(negative_slope=0.1)
        )
      )
      (2): ResidualBlock(
        (layers): ModuleList(
          (0): Sequential(
            (0): CNNBlock(
              (conv): Sequential(
                (0): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
               