# Dataset 준비

In [None]:
# !pip install torchsummary -q

In [None]:
import os
import cv2
import PIL
import numpy as np
from tqdm.notebook import tqdm
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
from torchvision import transforms
import torchsummary

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the selected device.
device

In [None]:
# Preview: print the first 10 image/annotation pairs found in the dataset folder.
path = "/kaggle/input/image-localization-dataset/training_images"
count = 0

for file in os.listdir(path):
    if not file.endswith(".jpg"):
        continue
    image_path = os.path.join(path, file)
    # Swap only the extension; str.replace would also rewrite ".jpg" inside the stem.
    xml_path = os.path.splitext(image_path)[0] + ".xml"
    # The image name came from the directory listing, so only the XML needs checking.
    if os.path.exists(xml_path):
        print(f"image:{image_path}\nxml:{xml_path}\n")
        count += 1
        if count >= 10:
            # Preview is complete; no need to scan the rest of the directory.
            break

## XML 파일 읽기 및 구조

`xml.etree.ElementTree` 모듈을 사용하면 XML 파일에서 원하는 데이터를 쉽게 추출할 수 있습니다.  
  
XML 파일은 객체 탐지를 위한 라벨링 정보를 포함하고 있습니다. 이 파일에는 이미지의 경로, 크기, 객체의 이름, 그리고 바운딩 박스 정보(객체의 좌표) 등이 담겨 있습니다.

```xml
<annotation>
	<folder>single cucumber</folder>
	<filename>cucumber_31.jpg</filename>
	<path>C:\Users\Muhammed Buyukkinaci\Downloads\single cucumber\cucumber_31.jpg</path>
	<source>
		<database>Unknown</database>
	</source>
	<size>
		<width>227</width>
		<height>227</height>
		<depth>3</depth>
	</size>
	<segmented>0</segmented>
	<object>
		<name>cucumber</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>36</xmin>
			<ymin>11</ymin>
			<xmax>215</xmax>
			<ymax>207</ymax>
		</bndbox>
	</object>
</annotation>
```


이 중 주목해야 할 주요 요소는 다음과 같습니다.

- `<filename>`: 이미지 파일 이름 (예: cucumber_31.jpg)
- `<size>`: 이미지의 크기 (width, height, depth)
- `<object>`: 탐지할 객체의 정보 (예: cucumber)
- `<bndbox>`: 객체의 좌표 정보 (xmin, ymin, xmax, ymax)

In [None]:
# Annotation file of the sample image inspected in the following cells
xml_path = "/kaggle/input/image-localization-dataset/training_images/eggplant_35.xml"

# Parse the XML document and keep its root element for the lookups below
tree = ET.parse(xml_path)
root = tree.getroot()

In [None]:
# Extract the file name, the recorded image path and the image size.
# Tag names follow the <annotation> layout: filename, path, size/width, size/height.
filename = root.find("filename").text
image_path = root.find("path").text
size = root.find("size")
# Element text is a string; convert so the values can be used for scaling later.
width = int(size.find("width").text)
height = int(size.find("height").text)

print(f"filename: {filename}\nimage_path: {image_path}\nwidth: {width}\nheight: {height}")

In [None]:
# Extract the object annotation: class name and bounding-box corners.
# root.find returns the first <object> element only.
obj = root.find("object")
label = obj.find("name").text
bndbox = obj.find("bndbox")
# Corner coordinates are stored as text; convert them to integers (pixels).
xmin = int(bndbox.find("xmin").text)
ymin = int(bndbox.find("ymin").text)
xmax = int(bndbox.find("xmax").text)
ymax = int(bndbox.find("ymax").text)

print(f"label: {label}\nxmin: {xmin}\nymin: {ymin}\nxmax: {xmax}\nymax: {ymax}")

## 데이터 확인

In [None]:
image_path = "/kaggle/input/image-localization-dataset/training_images/eggplant_35.jpg"

# cv2.imread returns None (it does not raise) when the file cannot be read,
# so fail here with a clear message instead of inside cvtColor.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV loads images as BGR; matplotlib expects RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

plt.imshow(image)
plt.show()

In [None]:
def draw_bbox_cywh_normalized(image_path):
    """Load an image and its XML annotation, then show the image with its bounding box.

    The annotation is expected next to the image, with the same stem and an
    ``.xml`` extension. The corner box read from the XML is converted to a
    normalized (center-x, center-y, width, height) box and then mapped back to
    pixel corners for drawing.

    Args:
        image_path: Path to the image file.

    Raises:
        FileNotFoundError: If the image cannot be read by OpenCV.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Read the image size and the first object's corner box from the annotation.
    xml_path = os.path.splitext(image_path)[0] + ".xml"
    root = ET.parse(xml_path).getroot()
    size = root.find("size")
    width = int(size.find("width").text)
    height = int(size.find("height").text)
    bndbox = root.find("object").find("bndbox")
    xmin, ymin, xmax, ymax = (
        int(bndbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
    )

    # Corner box -> normalized center/size box.
    cx = (xmin + xmax) / 2 / width
    cy = (ymin + ymax) / 2 / height
    w = (xmax - xmin) / width
    h = (ymax - ymin) / height

    # Normalized center/size box -> pixel corners on the image actually loaded.
    img_h, img_w = image.shape[:2]
    x1 = int(round((cx - w / 2) * img_w))
    y1 = int(round((cy - h / 2) * img_h))
    x2 = int(round((cx + w / 2) * img_w))
    y2 = int(round((cy + h / 2) * img_h))
    cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    plt.imshow(image)
    plt.show()

In [None]:
# Visual check on the sample eggplant image.
draw_bbox_cywh_normalized("/kaggle/input/image-localization-dataset/training_images/eggplant_35.jpg")

### Scaled Bounding Box로 이미지 그리기

In [None]:
# Class name <-> integer id lookup tables.
class2label = dict(mushroom=0, eggplant=1, cucumber=2)
label2class = dict(zip(class2label.values(), class2label.keys()))

# Divide the pixel corners by the image size to get fractions of width/height.
sample_bbox = torch.div(
    torch.tensor([xmin, ymin, xmax, ymax]).float(),
    torch.tensor([width, height, width, height]).float(),
)

# Integer class id of the sample, wrapped in a tensor.
sample_label = torch.tensor(class2label[label])

print(f"bbox: {sample_bbox}\nlabel: {sample_label}")

In [None]:
# Multiply by the image size again to recover the pixel coordinates (displayed as cell output).
sample_bbox * torch.tensor([width, height, width, height]).float()

In [None]:
def draw_bbox(image, bbox, label):
    """Draw a normalized bounding box on an image and show it with its class as title.

    Args:
        image: H x W x C uint8 array; shown with ``plt.imshow``, so RGB channel
            order is assumed.
        bbox: Four values (xmin, ymin, xmax, ymax) given as fractions of the
            image width/height (list, array or 1-D tensor).
        label: Class id (int or 0-dim tensor) looked up in ``label2class``.

    Returns:
        A copy of ``image`` with the rectangle drawn on it.
    """
    # Scale the box to the image size and convert it to integer pixel coordinates.
    height, width = image.shape[:2]
    scale = (width, height, width, height)
    xmin, ymin, xmax, ymax = (
        int(round(float(value) * factor)) for value, factor in zip(bbox, scale)
    )

    # Draw on a C-contiguous copy: OpenCV needs that layout, and the caller's
    # array stays untouched.
    canvas = np.ascontiguousarray(image).copy()
    cv2.rectangle(canvas, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)

    # Use the class name as the title; int() makes tensor labels usable as dict keys.
    class_id = int(label)
    plt.title(label2class.get(class_id, str(class_id)))
    plt.imshow(canvas)
    plt.show()
    return canvas

In [None]:
# Draw the normalized sample box on the sample image loaded earlier.
draw_bbox(image, sample_bbox, sample_label)

## Dataset & DataLoader 구성

In [None]:
class customDataset(Dataset):
    """Images paired with same-named XML annotation files in one directory.

    Each sample is ``(image, bbox, label)``:
      * ``image``: the RGB PIL image, passed through ``transform`` when given;
      * ``bbox``: float32 tensor ``[xmin, ymin, xmax, ymax]`` divided by the
        image width/height recorded in the XML;
      * ``label``: integer tensor with the class id from ``self.classes``.

    Only the first ``<object>`` of each annotation is used.
    """

    def __init__(self, root_dir, transform=None):
        """Collect every ``.jpg`` in ``root_dir`` that has a matching ``.xml``.

        Args:
            root_dir: Directory holding the images and annotation files.
            transform: Optional callable applied to the PIL image.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = []
        self.annotation_files = []
        self.classes = {"mushroom": 0, "eggplant": 1, "cucumber": 2}

        # Pair images with XML files; sorted so the sample order does not
        # depend on the order os.listdir happens to return.
        for file in sorted(os.listdir(self.root_dir)):
            if not file.endswith(".jpg"):
                continue
            image_path = os.path.join(self.root_dir, file)
            xml_path = os.path.splitext(image_path)[0] + ".xml"
            if os.path.exists(xml_path):
                self.image_files.append(image_path)
                self.annotation_files.append(xml_path)

    def __len__(self):
        """Return the number of image/annotation pairs."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, bbox, label)``.

        Raises:
            KeyError: If the annotated class name is not in ``self.classes``.
        """
        # Paths of the image and its annotation
        image_path = self.image_files[idx]
        xml_path = self.annotation_files[idx]

        # Force three channels so the 3-channel Normalize transform always applies.
        image = PIL.Image.open(image_path).convert("RGB")

        # Image size recorded in the annotation
        root = ET.parse(xml_path).getroot()
        size = root.find("size")
        width = float(size.find("width").text)
        height = float(size.find("height").text)
        if width <= 0 or height <= 0:
            # Fall back to the real image size if the annotation has no usable size.
            width, height = image.size

        # First annotated object: class name and corner box
        obj = root.find("object")
        bndbox = obj.find("bndbox")
        corners = [float(bndbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")]
        # Normalize by the original size so the box stays valid after Resize.
        bbox = torch.tensor(corners, dtype=torch.float32) / torch.tensor(
            [width, height, width, height], dtype=torch.float32
        )
        label = torch.tensor(self.classes[obj.find("name").text])

        if self.transform:
            image = self.transform(image)

        return image, bbox, label

In [None]:
# Preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each channel with the mean/std values below.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
dataset = customDataset("/kaggle/input/image-localization-dataset/training_images", transform=transform)
# Fractional lengths: 80% of the samples for training, 20% for validation.
train_dataset, val_dataset = random_split(dataset, [0.8, 0.2])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Validation must iterate the held-out split, not the training split.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Number of samples in each split (displayed as cell output).
len(train_dataset), len(val_dataset)

In [None]:
# Inspect the first training sample: (image, bbox, label).
train_dataset[0]

# 모델링

In [None]:
class detector(nn.Module):
    """Small CNN with two heads: bounding-box regression and classification.

    ``fc1`` expects 64 * 28 * 28 features, so a 224 x 224 input has to be
    downsampled by 8 in the backbone; ``forward`` does that with three 2 x 2
    max-pools (224 -> 112 -> 56 -> 28).

    Args:
        num_classes: Number of output classes of the classification head.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Convolutional layers (3x3, stride 1, padding 1 keep the spatial size)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)

        # Output heads: box coordinates (xmin, ymin, xmax, ymax) and class scores
        self.fc_bbox = nn.Linear(64, 4)
        self.fc_class = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return ``(bbox, class_logits)`` for a batch of 3 x 224 x 224 images.

        ``bbox`` has shape (batch, 4) and is the raw linear output (no
        activation); ``class_logits`` has shape (batch, num_classes) and is
        unnormalized, as ``nn.CrossEntropyLoss`` expects.
        """
        # Backbone: conv -> ReLU -> 2x2 max-pool, three times
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)

        # Flatten everything except the batch dimension
        x = torch.flatten(x, start_dim=1)

        # Fully connected trunk
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # Detection heads
        bbox = self.fc_bbox(x)
        class_logits = self.fc_class(x)

        return bbox, class_logits

In [None]:
# 모델 초기화
num_classes = 3  # cucumber, eggplant, mushroom 3개의 클래스
model = detector(num_classes=num_classes).to(device)
torchsummary.summary(model, (3, 224, 224))

In [None]:
# 손실 함수 설정 (MSE for bbox, CrossEntropy for class)
criterion_bbox = nn.MSELoss()  # 바운딩 박스 좌표 예측을 위한 MSE
criterion_class = nn.CrossEntropyLoss()  # 클래스 분류를 위한 CrossEntropy

# 옵티마이저 설정
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [None]:
def _detection_losses(model, images, bboxes, labels, criterion_bbox, criterion_class, device):
    """Run one batch through the model and return (total, bbox, class) losses."""
    images = images.to(device)
    # Cast targets to the dtypes the losses are used with here:
    # float for the regression target, long for class indices.
    bboxes = bboxes.to(device).float()
    labels = labels.to(device).long()

    pred_bboxes, class_logits = model(images)
    loss_bbox = criterion_bbox(pred_bboxes, bboxes)
    loss_class = criterion_class(class_logits, labels)
    return loss_bbox + loss_class, loss_bbox, loss_class


def train_model(model, train_loader, val_loader, num_epochs, criterion_bbox, criterion_class, optimizer, device):
    """Train the two-headed model and evaluate it after every epoch.

    The total loss is the unweighted sum of the bbox loss and the class loss.
    Batches are assumed to be ``(images, bboxes, labels)`` tensors, as the
    loops below unpack them.

    Args:
        model: Module returning ``(bbox, class_logits)`` for a batch of images.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.
        criterion_bbox: Loss applied to the predicted and target boxes.
        criterion_class: Loss applied to the class logits and target labels.
        optimizer: Optimizer already bound to the model parameters.
        device: Device the model and batches are moved to.

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean total loss for each split.
    """
    model.to(device)

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_bbox_loss = 0.0
        running_class_loss = 0.0

        # Train
        for images, bboxes, labels in tqdm(train_loader):
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            loss_total, loss_bbox, loss_class = _detection_losses(
                model, images, bboxes, labels, criterion_bbox, criterion_class, device
            )
            loss_total.backward()
            optimizer.step()

            # Accumulate the batch losses
            running_loss += loss_total.item()
            running_bbox_loss += loss_bbox.item()
            running_class_loss += loss_class.item()

        # Mean training loss per batch for this epoch
        epoch_loss = running_loss / len(train_loader)
        epoch_bbox_loss = running_bbox_loss / len(train_loader)
        epoch_class_loss = running_class_loss / len(train_loader)

        train_losses.append(epoch_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train bbox Loss: {epoch_bbox_loss:.4f}, Train class Loss: {epoch_class_loss:.4f}\nTrain Total Loss: {epoch_loss:.4f}")

        # Validation
        model.eval()
        val_running_loss = 0.0
        val_running_bbox_loss = 0.0
        val_running_class_loss = 0.0

        with torch.no_grad():
            for images, bboxes, labels in val_loader:
                loss_total, loss_bbox, loss_class = _detection_losses(
                    model, images, bboxes, labels, criterion_bbox, criterion_class, device
                )

                val_running_loss += loss_total.item()
                val_running_bbox_loss += loss_bbox.item()
                val_running_class_loss += loss_class.item()

        # Mean validation loss per batch for this epoch
        val_loss = val_running_loss / len(val_loader)
        val_bbox_loss = val_running_bbox_loss / len(val_loader)
        val_class_loss = val_running_class_loss / len(val_loader)

        val_losses.append(val_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Val bbox Loss: {val_bbox_loss:.4f}, Val class Loss: {val_class_loss:.4f}\nVal Total Loss: {val_loss:.4f}")

    return train_losses, val_losses

In [None]:
# Train for 30 epochs and keep the per-epoch losses for plotting.
epochs = 30
train_losses, val_losses = train_model(model, train_loader, val_loader, epochs, criterion_bbox, criterion_class, optimizer, device)

In [None]:
# Overlay the training and validation loss curves on one set of axes.
plt.plot(train_losses, label="train loss")
plt.plot(val_losses, label="val loss")
# Fix the y-axis to the 0-1 range.
plt.ylim(0, 1)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
def predict(model, image):
    """Run the model on a single image and return its box and class probabilities.

    Args:
        model: Module returning ``(bbox, class_logits)`` for a batch.
        image: One unbatched image tensor; a batch dimension is added here.

    Returns:
        ``(bbox, class_prob)`` as CPU tensors: the first row of the bbox output
        and the softmax of the first row of the class logits.
    """
    model.eval()
    # Send the image to wherever the model's weights live; only a model
    # without parameters falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    image = image.to(first_param.device if first_param is not None else device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(image.unsqueeze(0))
    print(output)

    bbox = output[0][0].cpu()
    class_logits = output[1][0].cpu()
    class_prob = F.softmax(class_logits, dim=-1)
    return bbox, class_prob

In [None]:
# Predict on the image of the first validation sample and display the result.
bbox, class_prob = predict(model, val_dataset[0][0])
bbox, class_prob

In [None]:
# Move axis 0 to the end for plotting (presumably (C, H, W) -> (H, W, C); confirm against the dataset output).
xx = val_dataset[0][0].detach().cpu().numpy().transpose(1, 2, 0)
# Denormalize with the same std/mean values given to transforms.Normalize, then scale to 0-255 uint8.
xx = ((xx * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406]))*255).astype(np.uint8)

plt.imshow(xx)
plt.show()

In [None]:
def draw_prediction(image, bbox, class_prob):
    """Denormalize a model-input image and draw the predicted box and class on it.

    Args:
        image: Normalized image tensor; axis 0 is moved last before plotting.
        bbox: Predicted box, passed through to ``draw_bbox``.
        class_prob: Class probabilities; the arg-max index is used as the label.

    Returns:
        Whatever ``draw_bbox`` returns for the denormalized image.
    """
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    image = image.detach().cpu().numpy().transpose(1, 2, 0)
    # Undo the normalization; clip so values just outside 0-255 cannot wrap
    # around in the uint8 cast.
    image = np.clip((image * std + mean) * 255, 0, 255).astype(np.uint8)

    # Keep the RGB channel order (draw_bbox is also called with an RGB image
    # elsewhere in this notebook). The transposed array need not be
    # C-contiguous, which OpenCV drawing functions require.
    image = np.ascontiguousarray(image)
    class_prob = class_prob.argmax(axis=-1)
    print(bbox, class_prob)
    predicted_img = draw_bbox(image, bbox, class_prob)
    return predicted_img

In [None]:
# Visualize the prediction for the first validation image.
draw_prediction(val_dataset[0][0], bbox, class_prob)