In [52]:
import numpy as np
import os 
import pandas as pd
import cv2
import torch
import matplotlib.pyplot as plt
from ipywidgets import interact
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torchvision
from torch import nn
import torchsummary
from torch.utils.data import DataLoader
from collections import defaultdict
from torchvision.utils import make_grid

In [53]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Utils

In [54]:
CLASS_NAME_TO_ID = {'BS': 0, 'SCRATCH': 1}
CLASS_ID_TO_NAME = {0: 'BS', 1: 'SCRATCH'}
BOX_COLOR = {'BS':(200, 0, 0), 'SCRATCH':(0, 0, 200)}
TEXT_COLOR = (255, 255, 255)

def save_model(model_state, model_name, save_dir="./trained_model"):
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model_state, os.path.join(save_dir, model_name))


def visualize_bbox(image, bbox, class_name, color=BOX_COLOR, thickness=2):
    x_center, y_center, w, h = bbox
    x_min = int(x_center - w/2)
    y_min = int(y_center - h/2)
    x_max = int(x_center + w/2)
    y_max = int(y_center + h/2)
    
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color=color[class_name], thickness=thickness)
    
    ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)    
    cv2.rectangle(image, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), color[class_name], -1)
    cv2.putText(
        image,
        text=class_name,
        org=(x_min, y_min - int(0.3 * text_height)),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=0.35, 
        color=TEXT_COLOR, 
        lineType=cv2.LINE_AA,
    )
    return image


def visualize(image, bboxes, category_ids):
    img = image.copy()
    for bbox, category_id in zip(bboxes, category_ids):
#         print('category_id: ',category_id)
        class_name = CLASS_ID_TO_NAME[category_id.item()]
        img = visualize_bbox(img, bbox, class_name)
    return img

# Datasets

In [55]:
class PET_dataset():
    def __init__(self,part,neck_dir,body_dir,phase, transformer=None):
        self.neck_dir=neck_dir
        self.body_dir=body_dir
        self.part=part
        self.phase=phase
        self.transformer=transformer
        if(self.part=="body"):
            self.image_files = sorted([fn for fn in os.listdir(self.body_dir+"/"+self.phase+"/image") if fn.endswith("jpg")])
            self.label_files= sorted([lab for lab in os.listdir(self.body_dir+"/"+self.phase+"/label") if lab.endswith("txt")])
        elif(self.part=="neck"):
            self.image_files = sorted([fn for fn in os.listdir(self.neck_dir+"/"+self.phase+"/image") if fn.endswith("jpg")])
            self.label_files= sorted([lab for lab in os.listdir(self.neck_dir+"/"+self.phase+"/label") if lab.endswith("txt")])
            
        
    def __getitem__(self,index):
        filename, image = self.get_image(self.part, index)
        bboxes, class_ids = self.get_label(self.part, index)
        
        if(self.transformer):
            transformed_data=self.transformer(image=image, bboxes=bboxes, class_ids=class_ids)
            image = transformed_data['image']
            bboxes = np.array(transformed_data['bboxes'])
            class_ids = np.array(transformed_data['class_ids'])
            
        
        target = {}
#         print(f'bboxes:{bboxes}\nclass_ids:{class_ids}\nlen_bboxes:{len(bboxes)}\nlen_class_ids:{len(class_ids)}')
#         print(f'filename: {filename}')
        target["boxes"] = torch.Tensor(bboxes).float()
        target["labels"] = torch.Tensor(class_ids).long()
        
        ###
        bboxes=torch.Tensor(bboxes).float()
        class_ids=torch.Tensor(class_ids).long()
        target = np.concatenate((bboxes, class_ids[:, np.newaxis]), axis=1)
        ###
        return image, target, filename
    
    def __len__(self, ):
        return len(self.image_files)
    
    def get_image(self, part, index): # 이미지 불러오는 함수
        filename = self.image_files[index]
        if(part=="body"):
#             print(f"body called!-> {self.part}")
            image_path = self.body_dir+"/"+self.phase+"/image/"+filename
        elif(part=="neck"):
#             print(f"neck called!-> {self.part}")
            image_path = self.neck_dir+"/"+self.phase+"/image/"+filename
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return filename, image
    
    def get_label(self, part, index): # label (box좌표, class_id) 불러오는 함수
        label_filename=self.label_files[index]
        if(part=="body"):
#             print(f"body label called!-> {self.part}")
            label_path = self.body_dir+"/"+self.phase+"/label/"+label_filename
        elif(part=="neck"):
#             print(f"neck label called!-> {self.part}")
            label_path = self.neck_dir+"/"+self.phase+"/label/"+label_filename
        with open(label_path, 'r') as file:
            labels = file.readlines()
        
        class_ids=[]
        bboxes=[]
        for label in labels:
            label=label.replace("\n", "")
            obj=label.split(' ')[0]
            coor=label.split(' ')[1:]
            obj=int(obj)
            coor=list(map(float, coor))
            class_ids.append(obj)
            bboxes.append(coor)
            
        return bboxes, class_ids
    

In [56]:
IMAGE_SIZE = 448

transformer = A.Compose([ 
        # bounding box의 변환, augmentation에서 albumentations는 Detection 학습을 할 때 굉장히 유용하다. 
        A.Resize(height=IMAGE_SIZE, width=IMAGE_SIZE),
        A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
        # albumentations 라이브러리에서는 Normalization을 먼저 진행해 주고 tensor화를 진행해 주어야한다.
    ],
    # box 위치에 대한 transformation도 함께 진행된다. 
    bbox_params=A.BboxParams(format='yolo', label_fields=['class_ids']),
)

def collate_fn(batch):
    image_list = []
    target_list = []
    filename_list = []
    
    for a,b,c in batch:
        image_list.append(a)
        target_list.append(b)
        filename_list.append(c)

    return torch.stack(image_list, dim=0), target_list, filename_list


In [57]:
def build_dataloader(part, NECK_PATH, BODY_PATH, batch_size=2):
    transformer = A.Compose([
            A.Resize(height=IMAGE_SIZE, width=IMAGE_SIZE),
            A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),
            ToTensorV2(),
        ],
        bbox_params=A.BboxParams(format='yolo', label_fields=['class_ids']),
    )
    
    dataloaders = {}
#     train_dataset = Detection_dataset(data_dir=data_dir, phase="train", transformer=transformer)
    train_dataset=PET_dataset(part ,neck_dir=NECK_PATH,body_dir=BODY_PATH,phase='train', transformer=transformer)
    dataloaders["train"] = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

#     val_dataset = Detection_dataset(data_dir=data_dir, phase="val", transformer=transformer)
    val_dataset=PET_dataset(part ,neck_dir=NECK_PATH,body_dir=BODY_PATH,phase='valid', transformer=transformer)
    dataloaders["val"] = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
    return dataloaders

In [58]:
NECK_PATH = '/home/host_data/PET_data/Neck'
BODY_PATH = '/home/host_data/PET_data/Body'
trainset=PET_dataset(part='body',neck_dir=NECK_PATH,body_dir=BODY_PATH,phase='train', transformer=transformer)
trainset_no_trans=PET_dataset(part='body',neck_dir=NECK_PATH,body_dir=BODY_PATH,phase='train', transformer=None)


In [59]:
# target 모양을 (x_cen, y_cen, w, h ,class_id)로 변경
image, target, filename = trainset[0]
print(f"image.shape:{image.shape}")

print(f"target:{target}")

image.shape:torch.Size([3, 448, 448])
target:[[0.58172178 0.55863488 0.06113537 0.04489235 1.        ]
 [0.92389268 0.30393952 0.01996257 0.01603298 0.        ]
 [0.80255771 0.09001374 0.01996257 0.01786532 0.        ]
 [0.0246413  0.60444343 0.02557704 0.09253321 1.        ]
 [0.74422956 0.08131012 0.0205864  0.01603298 0.        ]
 [0.20586401 0.51007789 0.02495321 0.05863491 1.        ]
 [0.22181535 0.16869904 0.12494074 0.03748969 1.        ]]


In [60]:
image, target, filename = trainset_no_trans[0]
print(f"image.shape:{image.shape}")
# print(f"image.type:{image.type}") #numpy

print(f"target:{target}")

image.shape:(2183, 1603, 3)
target:[[0.58172178 0.55863488 0.06113537 0.04489235 1.        ]
 [0.92389268 0.30393952 0.01996257 0.01603298 0.        ]
 [0.80255771 0.09001374 0.01996257 0.01786532 0.        ]
 [0.0246413  0.60444343 0.02557704 0.09253321 1.        ]
 [0.74422956 0.08131012 0.0205864  0.01603298 0.        ]
 [0.20586401 0.51007789 0.02495321 0.05863491 1.        ]
 [0.22181535 0.16869904 0.12494074 0.03748969 1.        ]]


In [61]:
len(trainset)

117

In [62]:
@interact(index=(0, len(trainset_no_trans)-1))

def show_sample(index=0):
    image, target, filename = trainset_no_trans[index]
#     image=image.permute(1,2,0).numpy()
    img_H, img_W, _ = image.shape
    print(image.shape)

#     bboxes = target['boxes']
#     class_ids = target["labels"]
    
    ###
    bboxes = target[:, 0:4]
    class_ids = target[:, 4]
    ###
    bboxes[:, [0,2]] *= img_W
    bboxes[:, [1,3]] *= img_H

    canvas = visualize(image, bboxes, class_ids)
    plt.figure(figsize=(6,6))
    plt.imshow(canvas)
    plt.axis('off')
    plt.show()

# show_sample()

interactive(children=(IntSlider(value=0, description='index', max=116), Output()), _dom_classes=('widget-inter…

In [63]:
@interact(index=(0, len(trainset)-1))

def show_sample(index=0):
    image, target, filename = trainset[index]
    image=image.permute(1,2,0).numpy()
    img_H, img_W, _ = image.shape
    print(image.shape)

#     bboxes = target['boxes']
#     class_ids = target["labels"]
    ###
    bboxes = target[:, 0:4]
    class_ids = target[:, 4]
    ###
    bboxes[:, [0,2]] *= img_W
    bboxes[:, [1,3]] *= img_H

    canvas = visualize(image, bboxes, class_ids)
    plt.figure(figsize=(6,6))
    plt.imshow(canvas)
    plt.axis('off')
    plt.show()

# show_sample()

interactive(children=(IntSlider(value=0, description='index', max=116), Output()), _dom_classes=('widget-inter…

## Model

In [64]:
class YOLO_SWIN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        self.num_classes = num_classes
        self.num_bboxes = 2
        self.grid_size = 7

#         resnet18 = torchvision.models.resnet18(pretrained = True)
        swin_s=torchvision.models.swin_v2_t(weights='IMAGENET1K_V1')
        layers = [m for m in swin_s.children()] #Resnet에서 Yolo에서 가져올수 있을만한 layer만 선별적으로 가져오기 위해서

        # 기존 Resnet18의 layer들중에서 맨 뒤에 두개만 제외하고 다 가져와서 Backbone으로 사용
        self.backbone = nn.Sequential(*layers[:-3]) 

        # self.neck = nn.Sequential(
        #     nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1, padding=0, bias=False), #Channel의 수만 변경
        #     nn.BatchNorm2d(1024),
        #     nn.ReLU(inplace=True),
        #     nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False), #spatial 손실이 없게끔 padding 설정
        #     nn.BatchNorm2d(1024),
        #     nn.ReLU(inplace=True),
        #     nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False),
        #     nn.BatchNorm2d(1024),
        #     nn.ReLU(inplace=True),
        #     nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False),
        #     nn.BatchNorm2d(1024),
        #     nn.ReLU(inplace=True)
        #     # 여기 위에를 통과하는 동안에는 feature에 대한 정보만 바뀌고, resolution에 대한 정보는 바뀌지x
        # )

        # neck을 통과해서 feature map을 가져오고, head 부분에서 output depth와 grid size를 조절해서 뱉게해준다.

        # self.head = nn.Sequential(
        #     nn.Conv2d(in_channels=1024, out_channels=(4+1)*self.num_bboxes+num_classes, kernel_size=1, padding=0, bias=False),
        #     nn.AdaptiveAvgPool2d(output_size=(self.grid_size, self.grid_size))
        # )
        self.head = nn.Sequential(
                nn.Conv2d(in_channels=768, out_channels=1024, kernel_size=1, padding=0,bias=False),
                nn.BatchNorm2d(1024),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False),
                nn.BatchNorm2d(1024),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False),
                nn.BatchNorm2d(1024),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1,bias=False),
                nn.BatchNorm2d(1024),
                nn.ReLU(inplace=True),

                nn.Conv2d(in_channels=1024, out_channels=(4+1)*self.num_bboxes+num_classes, kernel_size=1, padding=0, bias=False),
                nn.AdaptiveAvgPool2d(output_size=(self.grid_size, self.grid_size))
            )

    def forward(self, x):
        out = self.backbone(x)
        # out = self.neck(out)
        out = self.head(out) # input (batch, 3, 448, 448) -> output feature (batch, 12, 7, 7)
        return out


In [66]:
NUM_CLASSES = 2
model = YOLO_SWIN(num_classes=NUM_CLASSES)
model.to(device)

YOLO_SWIN(
  (backbone): Sequential(
    (0): Sequential(
      (0): Sequential(
        (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
        (1): Permute()
        (2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      )
      (1): Sequential(
        (0): SwinTransformerBlockV2(
          (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
          (attn): ShiftedWindowAttentionV2(
            (qkv): Linear(in_features=96, out_features=288, bias=True)
            (proj): Linear(in_features=96, out_features=96, bias=True)
            (cpb_mlp): Sequential(
              (0): Linear(in_features=2, out_features=512, bias=True)
              (1): ReLU(inplace=True)
              (2): Linear(in_features=512, out_features=3, bias=False)
            )
          )
          (stochastic_depth): StochasticDepth(p=0.0, mode=row)
          (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (0): Linear(in_features=96, out_f

In [67]:
torchsummary.summary(model, (3,448,448))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 96, 112, 112]           4,704
           Permute-2         [-1, 112, 112, 96]               0
         LayerNorm-3         [-1, 112, 112, 96]             192
            Linear-4          [-1, 15, 15, 512]           1,536
              ReLU-5          [-1, 15, 15, 512]               0
            Linear-6            [-1, 15, 15, 3]           1,536
ShiftedWindowAttentionV2-7         [-1, 112, 112, 96]               0
         LayerNorm-8         [-1, 112, 112, 96]             192
   StochasticDepth-9         [-1, 112, 112, 96]               0
           Linear-10        [-1, 112, 112, 384]          37,248
             GELU-11        [-1, 112, 112, 384]               0
          Dropout-12        [-1, 112, 112, 384]               0
           Linear-13         [-1, 112, 112, 96]          36,960
          Dropout-14         [-1,

In [68]:
x = torch.randn(1, 3, 448, 448).to(device)
with torch.no_grad():
    y = model(x)
print(y.shape)

torch.Size([1, 12, 7, 7])


# Loss func

In [69]:
class YOLO_LOSS():
    def __init__(self, num_classes, device, lambda_coord=5., lambda_noobj=0.5):
        self.num_classes = num_classes
        self.device = device
        self.grid_size = 7
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.mse_loss = nn.MSELoss(reduction="sum")

    def __call__(self, predictions, targets):
        self.batch_size, _, _, _ = predictions.shape
        groundtruths = self.build_batch_target_grid(targets)
        groundtruths = groundtruths.to(self.device)
        
        with torch.no_grad():
            iou1 = self.get_IoU(predictions[:, 1:5, ...], groundtruths[:, 1:5, ...])
            iou2 = self.get_IoU(predictions[:, 6:10, ...], groundtruths[:, 1:5, ...])

        ious = torch.stack([iou1, iou2], dim=1)
        max_iou, best_box = ious.max(dim=1, keepdim=True)
        max_iou = torch.cat([max_iou, max_iou], dim=1)
        best_box = torch.cat([best_box.eq(0), best_box.eq(1)], dim=1)

        predictions_ = predictions[:, :5*2, ...].reshape(self.batch_size, 2, 5, self.grid_size, self.grid_size)
        obj_pred = predictions_[:, :, 0, ...]
        xy_pred = predictions_[:, :, 1:3, ...]
        wh_pred = predictions_[:, :, 3:5, ...]
        cls_pred = predictions[:, 5*2:, ...]

        groundtruths_ = groundtruths[:, :5, ...].reshape(self.batch_size, 1, 5, self.grid_size, self.grid_size)
        obj_target = groundtruths_[:, :, 0, ...]
        xy_target = groundtruths_[:, :, 1:3, ...]
        wh_target= groundtruths_[:, :, 3:5, ...]
        cls_target = groundtruths[:, 5:, ...]
        
        positive = obj_target * best_box

        obj_loss = self.mse_loss(positive * obj_pred, positive * ious)
        noobj_loss = self.mse_loss((1 - positive) * obj_pred, ious*0)
        xy_loss = self.mse_loss(positive.unsqueeze(dim=2) * xy_pred, positive.unsqueeze(dim=2) * xy_target)
        wh_loss = self.mse_loss(positive.unsqueeze(dim=2) * (wh_pred.sign() * (wh_pred.abs() + 1e-8).sqrt()),
                           positive.unsqueeze(dim=2) * (wh_target + 1e-8).sqrt())
        cls_loss = self.mse_loss(obj_target * cls_pred, cls_target)
        
        obj_loss /= self.batch_size
        noobj_loss /= self.batch_size
        bbox_loss = (xy_loss+wh_loss) / self.batch_size
        cls_loss /= self.batch_size
        
        total_loss = obj_loss + self.lambda_noobj*noobj_loss + self.lambda_coord*bbox_loss + cls_loss
        return total_loss, (obj_loss.item(), noobj_loss.item(), bbox_loss.item(), cls_loss.item())
    
    def build_target_grid(self, target):
        target_grid = torch.zeros((1+4+self.num_classes, self.grid_size, self.grid_size), device=self.device)

        for gt in target:
            xc, yc, w, h, cls_id = gt
            xn = (xc % (1/self.grid_size))
            yn = (yc % (1/self.grid_size))
            cls_id = int(cls_id)

            i_grid = int(xc * self.grid_size)
            j_grid = int(yc * self.grid_size)
            target_grid[0, j_grid, i_grid] = 1
            target_grid[1:5, j_grid, i_grid] = torch.Tensor([xn,yn,w,h])
#             print(5+cls_id, j_grid, i_grid)
            target_grid[5+cls_id, j_grid, i_grid] = 1

        return target_grid
    
    def build_batch_target_grid(self, targets):
        target_grid_batch = torch.stack([self.build_target_grid(target) for target in targets], dim=0)
        return target_grid_batch
    
    def get_IoU(self, cbox1, cbox2):
        box1 = self.xywh_to_xyxy(cbox1)
        box2 = self.xywh_to_xyxy(cbox2)

        x1 = torch.max(box1[:, 0, ...], box2[:, 0, ...])
        y1 = torch.max(box1[:, 1, ...], box2[:, 1, ...])
        x2 = torch.min(box1[:, 2, ...], box2[:, 2, ...])
        y2 = torch.min(box1[:, 3, ...], box2[:, 3, ...])

        intersection = (x2-x1).clamp(min=0) * (y2-y1).clamp(min=0)
        union = abs(cbox1[:, 2, ...]*cbox1[:, 3, ...]) + \
                abs(cbox2[:, 2, ...]*cbox2[:, 3, ...]) - intersection

        intersection[intersection.gt(0)] = intersection[intersection.gt(0)] / union[intersection.gt(0)]
        return intersection
    
    def generate_xy_normed_grid(self):
        y_offset, x_offset = torch.meshgrid(torch.arange(self.grid_size), torch.arange(self.grid_size))
        xy_grid = torch.stack([x_offset, y_offset], dim=0)
        xy_normed_grid = xy_grid / self.grid_size
        return xy_normed_grid.to(self.device)

    def xywh_to_xyxy(self, bboxes):
        xy_normed_grid = self.generate_xy_normed_grid()
        xcyc = bboxes[:,0:2,...] + xy_normed_grid.tile(self.batch_size, 1,1,1)
        wh = bboxes[:,2:4,...]
        x1y1 = xcyc - (wh/2)
        x2y2 = xcyc + (wh/2)
        return torch.cat([x1y1, x2y2], dim=1)

# Train

In [70]:
def train_one_epoch(dataloaders, model, criterion, optimizer, device):
    train_loss = defaultdict(float)
    val_loss = defaultdict(float)
    
    for phase in ["train", "val"]:
        if phase == "train":
            model.train()
        else:
            model.eval()
        
        running_loss = defaultdict(float)
        for index, batch in enumerate(dataloaders[phase]):
            images = batch[0].to(device)
            targets = batch[1]
            filenames = batch[2]
            
            with torch.set_grad_enabled(phase == "train"): # phase가 train 일때만 gradient 추적기능을 킨다.
                predictions = model(images) #prediction shape=> B,12,7,7
#             print(f"predictions:{predictions}, \ntargets: {targets}\n")
            loss, (obj_loss, noobj_loss, bbox_loss, cls_loss) = criterion(predictions, targets)
#             print(f"loss:{loss}, obj_loss:{obj_loss}, noobj_loss:{noobj_loss}\nbbox_loss:{bbox_loss}, cls_loss:{cls_loss}\n--------------\n")
            if phase == "train":
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                # 현재 epoch단계에서 loss가 얼마인지 running loss 가출력
                running_loss["total_loss"] += loss.item()
                running_loss["obj_loss"] += obj_loss
                running_loss["noobj_loss"] += noobj_loss
                running_loss["bbox_loss"] += bbox_loss
                running_loss["cls_loss"] += cls_loss
                
                train_loss["total_loss"] += loss.item()
                train_loss["obj_loss"] += obj_loss
                train_loss["noobj_loss"] += noobj_loss
                train_loss["bbox_loss"] += bbox_loss
                train_loss["cls_loss"] += cls_loss
                
                if (index > 0) and (index % VERBOSE_FREQ) == 0:
                    text = f"<<<iteration:[{index}/{len(dataloaders[phase])}] - "
                    for k, v in running_loss.items():
                        text += f"{k}: {v/VERBOSE_FREQ:.4f}  "
                        running_loss[k] = 0.
                    print(text)
            else:
                val_loss["total_loss"] += loss.item()
                val_loss["obj_loss"] += obj_loss
                val_loss["noobj_loss"] += noobj_loss
                val_loss["bbox_loss"] += bbox_loss
                val_loss["cls_loss"] += cls_loss

    for k in train_loss.keys():
        train_loss[k] /= len(dataloaders["train"])
        val_loss[k] /= len(dataloaders["val"])
    return train_loss, val_loss

In [71]:
# data_dir = "/content/drive/MyDrive/fastCamMedicalProj/DATASET/DATASET/Detection/"
NECK_PATH = '/home/host_data/PET_data/Neck'
BODY_PATH = '/home/host_data/PET_data/Body'
is_cuda = True

NUM_CLASSES = 2
IMAGE_SIZE = 448
BATCH_SIZE = 1
VERBOSE_FREQ = 20
LR=0.0001
BACKBONE="YOLO_SWIN_T"
PART="body"
num_epochs = 100
# DEVICE = torch.device('cuda' if torch.cuda.is_available and is_cuda else 'cpu')

dataloaders = build_dataloader(part=PART,NECK_PATH=NECK_PATH,BODY_PATH=BODY_PATH,batch_size=BATCH_SIZE)
model = YOLO_SWIN(num_classes=NUM_CLASSES)
model = model.to(device)
criterion = YOLO_LOSS(num_classes=NUM_CLASSES, device=device)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

In [72]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="yolo_swin",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": LR,
    "batch_size": BATCH_SIZE,
    "architecture": BACKBONE,
    "dataset": "BODY",
    "epochs": num_epochs,
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mgomduribo[0m ([33murp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [73]:
num_epochs = 100

best_epoch = 0
best_score = float('inf')
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    train_loss, val_loss = train_one_epoch(dataloaders, model, criterion, optimizer, device)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
#     train_loss["obj_loss"] += obj_loss
#     train_loss["noobj_loss"] += noobj_loss
#     train_loss["bbox_loss"] += bbox_loss
#     train_loss["cls_loss"] += cls_loss
    wandb.log({"Train Loss": train_loss['total_loss'],
               "Train obj Loss":train_loss["obj_loss"],
               "Train bbox Loss":train_loss["bbox_loss"],
               "Train class Loss":train_loss["cls_loss"],
               "Val Loss": val_loss['total_loss'],
               "Val obj Loss":val_loss["obj_loss"],
               "Val bbox Loss":val_loss["bbox_loss"],
               "Val class Loss":val_loss["cls_loss"],})
    print(f"\nepoch:{epoch+1}/{num_epochs} - Train Loss: {train_loss['total_loss']:.4f}, Val Loss: {val_loss['total_loss']:.4f}\n")
    
    if (epoch+1) % 10 == 0:
        save_model(model.state_dict(), f'model_{epoch+1}.pth', save_dir=f"./trained_model/{BACKBONE}_{PART}_LR{LR}")
wandb.finish()

<<<iteration:[20/117] - total_loss: 24.2522  obj_loss: 0.3188  noobj_loss: 4.2417  bbox_loss: 3.7197  cls_loss: 3.2139  
<<<iteration:[40/117] - total_loss: 50.3951  obj_loss: 0.1571  noobj_loss: 4.2987  bbox_loss: 8.8497  cls_loss: 3.8402  
<<<iteration:[60/117] - total_loss: 36.9247  obj_loss: 1.4886  noobj_loss: 15.4979  bbox_loss: 4.9622  cls_loss: 2.8763  
<<<iteration:[80/117] - total_loss: 16.9519  obj_loss: 0.0231  noobj_loss: 1.2643  bbox_loss: 2.7769  cls_loss: 2.4122  
<<<iteration:[100/117] - total_loss: 12.9996  obj_loss: 0.0337  noobj_loss: 0.7079  bbox_loss: 2.0449  cls_loss: 2.3876  

epoch:1/100 - Train Loss: 26.0890, Val Loss: 8.0821

<<<iteration:[20/117] - total_loss: 9.5633  obj_loss: 0.0345  noobj_loss: 0.4513  bbox_loss: 1.3280  cls_loss: 2.6632  
<<<iteration:[40/117] - total_loss: 14.9126  obj_loss: 0.0258  noobj_loss: 0.4205  bbox_loss: 2.3190  cls_loss: 3.0814  
<<<iteration:[60/117] - total_loss: 10.8487  obj_loss: 0.0182  noobj_loss: 0.3880  bbox_loss: 1.65

<<<iteration:[80/117] - total_loss: 4.2970  obj_loss: 0.0268  noobj_loss: 0.0296  bbox_loss: 0.3479  cls_loss: 2.5160  
<<<iteration:[100/117] - total_loss: 3.6144  obj_loss: 0.0600  noobj_loss: 0.0296  bbox_loss: 0.1584  cls_loss: 2.7479  

epoch:13/100 - Train Loss: 3.6866, Val Loss: 3.1932

<<<iteration:[20/117] - total_loss: 4.2148  obj_loss: 0.0414  noobj_loss: 0.0282  bbox_loss: 0.3376  cls_loss: 2.4713  
<<<iteration:[40/117] - total_loss: 3.8532  obj_loss: 0.0897  noobj_loss: 0.0269  bbox_loss: 0.2900  cls_loss: 2.3002  
<<<iteration:[60/117] - total_loss: 2.8435  obj_loss: 0.0566  noobj_loss: 0.0219  bbox_loss: 0.1441  cls_loss: 2.0554  
<<<iteration:[80/117] - total_loss: 3.9249  obj_loss: 0.0449  noobj_loss: 0.0280  bbox_loss: 0.3559  cls_loss: 2.0863  
<<<iteration:[100/117] - total_loss: 6.4896  obj_loss: 0.0483  noobj_loss: 0.0285  bbox_loss: 0.6991  cls_loss: 2.9316  

epoch:14/100 - Train Loss: 4.2246, Val Loss: 2.8350

<<<iteration:[20/117] - total_loss: 4.6304  obj_lo

<<<iteration:[40/117] - total_loss: 4.6165  obj_loss: 0.0688  noobj_loss: 0.0261  bbox_loss: 0.3518  cls_loss: 2.7757  
<<<iteration:[60/117] - total_loss: 3.3568  obj_loss: 0.0505  noobj_loss: 0.0192  bbox_loss: 0.1737  cls_loss: 2.4282  
<<<iteration:[80/117] - total_loss: 2.3584  obj_loss: 0.0413  noobj_loss: 0.0219  bbox_loss: 0.1185  cls_loss: 1.7135  
<<<iteration:[100/117] - total_loss: 4.3806  obj_loss: 0.0565  noobj_loss: 0.0172  bbox_loss: 0.3520  cls_loss: 2.5554  

epoch:26/100 - Train Loss: 3.5284, Val Loss: 5.6974

<<<iteration:[20/117] - total_loss: 2.8523  obj_loss: 0.0355  noobj_loss: 0.0246  bbox_loss: 0.1117  cls_loss: 2.2462  
<<<iteration:[40/117] - total_loss: 3.0272  obj_loss: 0.0371  noobj_loss: 0.0177  bbox_loss: 0.1172  cls_loss: 2.3955  
<<<iteration:[60/117] - total_loss: 2.8808  obj_loss: 0.0590  noobj_loss: 0.0204  bbox_loss: 0.0873  cls_loss: 2.3750  
<<<iteration:[80/117] - total_loss: 2.7042  obj_loss: 0.0345  noobj_loss: 0.0152  bbox_loss: 0.0888  cls_

<<<iteration:[100/117] - total_loss: 2.8980  obj_loss: 0.0605  noobj_loss: 0.0182  bbox_loss: 0.1003  cls_loss: 2.3270  

epoch:38/100 - Train Loss: 3.2640, Val Loss: 3.1508

<<<iteration:[20/117] - total_loss: 2.7185  obj_loss: 0.0550  noobj_loss: 0.0183  bbox_loss: 0.1034  cls_loss: 2.1375  
<<<iteration:[40/117] - total_loss: 2.9162  obj_loss: 0.0571  noobj_loss: 0.0190  bbox_loss: 0.1041  cls_loss: 2.3291  
<<<iteration:[60/117] - total_loss: 2.7210  obj_loss: 0.0328  noobj_loss: 0.0176  bbox_loss: 0.0865  cls_loss: 2.2468  
<<<iteration:[80/117] - total_loss: 4.6603  obj_loss: 0.0444  noobj_loss: 0.0180  bbox_loss: 0.4442  cls_loss: 2.3857  
<<<iteration:[100/117] - total_loss: 6.0289  obj_loss: 0.0544  noobj_loss: 0.0174  bbox_loss: 0.7459  cls_loss: 2.2361  

epoch:39/100 - Train Loss: 3.6605, Val Loss: 3.4128

<<<iteration:[20/117] - total_loss: 5.9227  obj_loss: 0.0420  noobj_loss: 0.0171  bbox_loss: 0.7249  cls_loss: 2.2479  
<<<iteration:[40/117] - total_loss: 2.5492  obj_lo

<<<iteration:[60/117] - total_loss: 3.0233  obj_loss: 0.0632  noobj_loss: 0.0208  bbox_loss: 0.0789  cls_loss: 2.5554  
<<<iteration:[80/117] - total_loss: 2.7194  obj_loss: 0.0451  noobj_loss: 0.0212  bbox_loss: 0.0918  cls_loss: 2.2046  
<<<iteration:[100/117] - total_loss: 3.2279  obj_loss: 0.0275  noobj_loss: 0.0179  bbox_loss: 0.0894  cls_loss: 2.7447  

epoch:51/100 - Train Loss: 2.7606, Val Loss: 3.0774

<<<iteration:[20/117] - total_loss: 2.9690  obj_loss: 0.0573  noobj_loss: 0.0169  bbox_loss: 0.1067  cls_loss: 2.3698  
<<<iteration:[40/117] - total_loss: 2.6636  obj_loss: 0.0725  noobj_loss: 0.0148  bbox_loss: 0.1514  cls_loss: 1.8266  
<<<iteration:[60/117] - total_loss: 2.8389  obj_loss: 0.0453  noobj_loss: 0.0234  bbox_loss: 0.0670  cls_loss: 2.4469  
<<<iteration:[80/117] - total_loss: 2.3365  obj_loss: 0.0562  noobj_loss: 0.0161  bbox_loss: 0.0816  cls_loss: 1.8644  
<<<iteration:[100/117] - total_loss: 3.0238  obj_loss: 0.0287  noobj_loss: 0.0219  bbox_loss: 0.1463  cls


epoch:63/100 - Train Loss: 2.6927, Val Loss: 2.8187

<<<iteration:[20/117] - total_loss: 3.3863  obj_loss: 0.0988  noobj_loss: 0.0190  bbox_loss: 0.1643  cls_loss: 2.4566  
<<<iteration:[40/117] - total_loss: 2.1267  obj_loss: 0.0466  noobj_loss: 0.0223  bbox_loss: 0.0679  cls_loss: 1.7294  
<<<iteration:[60/117] - total_loss: 2.6702  obj_loss: 0.0230  noobj_loss: 0.0147  bbox_loss: 0.0652  cls_loss: 2.3139  
<<<iteration:[80/117] - total_loss: 2.9018  obj_loss: 0.0803  noobj_loss: 0.0178  bbox_loss: 0.1251  cls_loss: 2.1869  
<<<iteration:[100/117] - total_loss: 2.8251  obj_loss: 0.0719  noobj_loss: 0.0249  bbox_loss: 0.0877  cls_loss: 2.3022  

epoch:64/100 - Train Loss: 2.7202, Val Loss: 2.9436

<<<iteration:[20/117] - total_loss: 3.3048  obj_loss: 0.0497  noobj_loss: 0.0172  bbox_loss: 0.1409  cls_loss: 2.5420  
<<<iteration:[40/117] - total_loss: 2.5271  obj_loss: 0.0327  noobj_loss: 0.0187  bbox_loss: 0.0835  cls_loss: 2.0676  
<<<iteration:[60/117] - total_loss: 2.5160  obj_los

<<<iteration:[80/117] - total_loss: 3.2362  obj_loss: 0.0612  noobj_loss: 0.0222  bbox_loss: 0.1669  cls_loss: 2.3294  
<<<iteration:[100/117] - total_loss: 2.7340  obj_loss: 0.0611  noobj_loss: 0.0202  bbox_loss: 0.0816  cls_loss: 2.2546  

epoch:76/100 - Train Loss: 2.6064, Val Loss: 3.0505

<<<iteration:[20/117] - total_loss: 2.1120  obj_loss: 0.0521  noobj_loss: 0.0241  bbox_loss: 0.0659  cls_loss: 1.7182  
<<<iteration:[40/117] - total_loss: 2.7303  obj_loss: 0.0897  noobj_loss: 0.0270  bbox_loss: 0.0773  cls_loss: 2.2405  
<<<iteration:[60/117] - total_loss: 2.6587  obj_loss: 0.0572  noobj_loss: 0.0214  bbox_loss: 0.1073  cls_loss: 2.0540  
<<<iteration:[80/117] - total_loss: 2.8480  obj_loss: 0.0496  noobj_loss: 0.0261  bbox_loss: 0.1370  cls_loss: 2.1001  
<<<iteration:[100/117] - total_loss: 2.7800  obj_loss: 0.0555  noobj_loss: 0.0184  bbox_loss: 0.0986  cls_loss: 2.2225  

epoch:77/100 - Train Loss: 2.6570, Val Loss: 3.4851

<<<iteration:[20/117] - total_loss: 3.0703  obj_lo

<<<iteration:[40/117] - total_loss: 2.4953  obj_loss: 0.0542  noobj_loss: 0.0166  bbox_loss: 0.0801  cls_loss: 2.0325  
<<<iteration:[60/117] - total_loss: 3.0976  obj_loss: 0.0481  noobj_loss: 0.0213  bbox_loss: 0.1821  cls_loss: 2.1286  
<<<iteration:[80/117] - total_loss: 2.0816  obj_loss: 0.0602  noobj_loss: 0.0264  bbox_loss: 0.0507  cls_loss: 1.7545  
<<<iteration:[100/117] - total_loss: 2.5406  obj_loss: 0.0531  noobj_loss: 0.0166  bbox_loss: 0.1102  cls_loss: 1.9280  

epoch:89/100 - Train Loss: 2.7122, Val Loss: 3.8466

<<<iteration:[20/117] - total_loss: 2.7086  obj_loss: 0.0691  noobj_loss: 0.0264  bbox_loss: 0.1456  cls_loss: 1.8984  
<<<iteration:[40/117] - total_loss: 3.0871  obj_loss: 0.0565  noobj_loss: 0.0185  bbox_loss: 0.0919  cls_loss: 2.5619  
<<<iteration:[60/117] - total_loss: 2.7795  obj_loss: 0.0652  noobj_loss: 0.0190  bbox_loss: 0.1175  cls_loss: 2.1173  
<<<iteration:[80/117] - total_loss: 3.0779  obj_loss: 0.0539  noobj_loss: 0.0179  bbox_loss: 0.1050  cls_

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train Loss,▅▃█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train bbox Loss,▅▃█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train class Loss,█▅▅▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▃▂▂▂▂▁
Train obj Loss,█▁▁▁▂▁▁▂▂▁▂▂▂▂▂▁▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
Val Loss,▆█▆▅▃▁▁▂▃▁▄▂▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▁▁▂▂▁▂▃▁▂▁▁▁▁
Val bbox Loss,▆█▆▄▃▁▁▁▄▁▃▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▃▁▂▁▁▁▁
Val class Loss,▄▂▃▄▂▂▂▃▁▂█▂▂▂▁▄▁▂▂▄▂▁▁▂▂▂▅▁▁▄▃▁▆▅▁▄▁▃▂▃
Val obj Loss,▃▃▂▁▃▇▆▁▂▆▄▆▇▆▆▁▄▁▃▃▄█▃▅▅▂▂▇▄▄▂▅▃▁▆▁▇▇▅▇

0,1
Train Loss,2.44607
Train bbox Loss,0.08766
Train class Loss,1.93767
Train obj Loss,0.05947
Val Loss,3.19651
Val bbox Loss,0.08837
Val class Loss,2.6582
Val obj Loss,0.07039


# Inference

In [74]:
import numpy as np
import os 
import pandas as pd
import cv2
import torch
import matplotlib.pyplot as plt
from ipywidgets import interact
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torchvision
from torch import nn
import torchsummary
from torch.utils.data import DataLoader
from collections import defaultdict
from torchvision.utils import make_grid

In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [76]:
def load_model(ckpt_path, num_classes, device):
    checkpoint = torch.load(ckpt_path, map_location=device)
    model = YOLO_SWIN(num_classes=num_classes)
    model.load_state_dict(checkpoint)
    model = model.to(device)
    model.eval()
    return model

In [77]:
IMAGE_SIZE=448
transformer = A.Compose([
    A.Resize(height=IMAGE_SIZE, width=IMAGE_SIZE),
    A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

In [85]:
ckpt_path="./trained_model/YOLO_SWIN_T_body_LR0.0001/model_90.pth"
model = load_model(ckpt_path, NUM_CLASSES, device)

In [86]:
@torch.no_grad()
def model_predict(image, model, conf_thres=0.2, iou_threshold=0.1):
    predictions = model(image)
    prediction = predictions.detach().cpu().squeeze(dim=0)
    
    grid_size = prediction.shape[-1]
    y_grid, x_grid = torch.meshgrid(torch.arange(grid_size), torch.arange(grid_size))
    stride_size = IMAGE_SIZE/grid_size

    conf = prediction[[0,5], ...].reshape(1, -1)
    xc = (prediction[[1,6], ...] * IMAGE_SIZE + x_grid*stride_size).reshape(1,-1)
    yc = (prediction[[2,7], ...] * IMAGE_SIZE + y_grid*stride_size).reshape(1,-1)
    w = (prediction[[3,8], ...] * IMAGE_SIZE).reshape(1,-1)
    h = (prediction[[4,9], ...] * IMAGE_SIZE).reshape(1,-1)
    cls = torch.max(prediction[10:, ...].reshape(NUM_CLASSES, -1), dim=0).indices.tile(1,2)
    
    x_min = xc - w/2
    y_min = yc - h/2
    x_max = xc + w/2
    y_max = yc + h/2

    prediction_res = torch.cat([x_min, y_min, x_max, y_max, conf, cls], dim=0)
    prediction_res = prediction_res.transpose(0,1)

    # x_min과 y_min이 음수가 되지않고, x_max와 y_max가 이미지 크기를 넘지 않게 제한
    prediction_res[:, 2].clip(min=0, max=image.shape[1]) 
    prediction_res[:, 3].clip(min=0, max=image.shape[0])
        
    pred_res = prediction_res[prediction_res[:, 4] > conf_thres]
    nms_index = torchvision.ops.nms(boxes=pred_res[:, 0:4], scores=pred_res[:, 4], iou_threshold=iou_threshold)
    pred_res_ = pred_res[nms_index].numpy()
    
    n_obj = pred_res_.shape[0]
    bboxes = np.zeros(shape=(n_obj, 4), dtype=np.float32)
    bboxes[:, 0:2] = (pred_res_[:, 0:2] + pred_res_[:, 2:4]) / 2
    bboxes[:, 2:4] = pred_res_[:, 2:4] - pred_res_[:, 0:2]
    scores = pred_res_[:, 4]
    class_ids = pred_res_[:, 5]
    
    # 이미지 값이 들어가면 모델을 통해서, 후처리까지 포함된 yolo 포멧의 box좌표, 그 좌표에 대한 confidence score
    # 그리고 class id를 반환
    return bboxes, scores, class_ids

In [87]:
pred_images = []
pred_labels =[]

for index, batch in enumerate(dataloaders["val"]):
    images = batch[0].to(device)
    bboxes, scores, class_ids = model_predict(images, model, conf_thres=0.1, iou_threshold=0.1)
    
    if len(bboxes) > 0:
        prediction_yolo = np.concatenate([bboxes, scores[:, np.newaxis], class_ids[:, np.newaxis]], axis=1)
    else:
        prediction_yolo = np.array([])
    
    # 텐서형의 이미지를 다시 unnormalize를 시키고, 다시 chw를 hwc로 바꾸고 넘파이로 바꾼다.
    np_image = make_grid(images[0], normalize=True).cpu().permute(1,2,0).numpy()
    pred_images.append(np_image)
    pred_labels.append(prediction_yolo)

    

In [88]:
from ipywidgets import interact

@interact(index=(0,len(pred_images)-1))
def show_result(index=0):
    print(pred_labels[index])
    if len(pred_labels[index]) > 0:
        result = visualize(pred_images[index], pred_labels[index][:, 0:4], pred_labels[index][:, 5])
    else:
        result = pred_images[index]
        
    plt.figure(figsize=(6,6))
    plt.imshow(result)
    plt.show()

interactive(children=(IntSlider(value=0, description='index', max=19), Output()), _dom_classes=('widget-intera…