# YOLOv8 Fine-Tuning for Object Detection and Bounding-Box Prediction

In this notebook, we fine-tuned a YOLO model for object detection using a custom training dataset, then used the fine-tuned model to make predictions on the test set and generate a submission file for Kaggle.

In [None]:
## STEP 0: IMPORTING NEEDED LIBRARIES

%pip install -q ultralytics

import os
import shutil
from glob import glob

import cv2
import pandas as pd
import torch
import yaml
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from ultralytics import YOLO

# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
## STEP 1: DATA PREPARATION

# Input locations and the directory layout of the generated YOLO dataset.
train_csv = "./train.csv"
image_root = "./input/data-bounty-6-product-object-detection" # path to the train dataset
output_dir = "./output/yolo_dataset"  # path to save the YOLO dataset
train_images_dir = f"{output_dir}/images/train"
train_labels_dir = f"{output_dir}/labels/train"
for dataset_subdir in (train_images_dir, train_labels_dir):
    os.makedirs(dataset_subdir, exist_ok=True)

# Training annotations: one row per image, boxes packed into `prediction_string`.
df = pd.read_csv(train_csv)

# Each annotation occupies six whitespace-separated tokens and the class name
# is the first of them, so every sixth token (starting at 0) is a class name.
all_classes = [
    class_token
    for annotation in df['prediction_string']
    for class_token in annotation.split()[::6]
]

# LabelEncoder stores the sorted unique names in `classes_`; the position of a
# name in that array is used as its integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(all_classes)
class2id = {name: idx for idx, name in enumerate(label_encoder.classes_)}
id2class = {idx: name for name, idx in class2id.items()}

# Persist the class names, one per line, in class-id order.
class_names_path = f"{output_dir}/class_list.txt"
with open(class_names_path, "w") as class_file:
    class_file.write("\n".join(label_encoder.classes_))

def convert_bbox(x1, y1, x2, y2, img_w, img_h):
    """Convert a corner-format box (x1, y1, x2, y2) to normalized YOLO format.

    Args:
        x1, y1: Coordinates of the first corner, in pixels.
        x2, y2: Coordinates of the opposite corner, in pixels.
        img_w: Image width in pixels, used to normalize horizontal values.
        img_h: Image height in pixels, used to normalize vertical values.

    Returns:
        Tuple ``(x_center, y_center, width, height)`` where each value is
        divided by the corresponding image dimension.
    """
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    box_w = x2 - x1
    box_h = y2 - y1
    return center_x / img_w, center_y / img_h, box_w / img_w, box_h / img_h

# Create the YOLO dataset: copy every training image and write one label file
# per image with a "<class_id> <x_center> <y_center> <width> <height>" line for
# each annotated box (coordinates normalized by the image size).
for _, row in tqdm(df.iterrows(), total=len(df)):
    image_id = row["image_id"]
    image_path = f"{image_root}/Train/JPEGImages/{image_id}.jpg"
    label_path = f"{train_labels_dir}/{image_id}.txt"

    # The image is only read to obtain its dimensions for normalization.
    # cv2.imread returns None (it does not raise) for missing/unreadable files.
    img = cv2.imread(image_path)
    if img is None:
        print(f"Skipping {image_id}: could not read {image_path}")
        continue
    h, w = img.shape[:2]

    # Copy with shutil rather than a shell `cp`: file names containing shell
    # metacharacters (e.g. parentheses) make the shell command fail, and the
    # image would then be silently missing from the training set.
    shutil.copy(image_path, f"{train_images_dir}/{image_id}.jpg")

    with open(label_path, "w") as f:
        # Six tokens per annotation: class, (unused) score, x1, y1, x2, y2.
        items = row["prediction_string"].split()
        for i in range(0, len(items), 6):
            class_name = items[i]
            x1, y1, x2, y2 = map(int, items[i+2:i+6])
            class_id = class2id[class_name]
            x, y, bw, bh = convert_bbox(x1, y1, x2, y2, w, h)
            f.write(f"{class_id} {x:.6f} {y:.6f} {bw:.6f} {bh:.6f}\n")

# Create the YOLOv8 dataset description (data.yaml).
# Ultralytics resolves relative `train`/`val` entries against its own dataset
# root rather than the notebook's working directory, so the paths are written
# as absolute paths to make them unambiguous.
train_images_abs = os.path.abspath(train_images_dir)
data_yaml = {
    "train": train_images_abs,
    # NOTE(review): validation reuses the training images, so the metrics
    # reported during training are not a held-out estimate.
    "val": train_images_abs,
    "nc": len(label_encoder.classes_),
    # Plain `str` values so PyYAML serializes them as ordinary strings.
    "names": [str(name) for name in label_encoder.classes_.tolist()]
}

data_yaml_path = f"{output_dir}/data.yaml"
with open(data_yaml_path, "w") as f:
    yaml.dump(data_yaml, f)

In [None]:

# STEP 2: FINETUNING THE MODEL

# Start from the previously trained checkpoint and continue training on the
# dataset described by data.yaml.
model = YOLO("./best.pt")

# Training settings, collected in one place so they are easy to review and tune.
train_settings = {
    "data": data_yaml_path,
    "epochs": 5,
    "imgsz": 640,
    "batch": 8,
    "device": device,
    "workers": 2,
    # Run artifacts (weights, plots, logs) are written to <project>/<name>.
    "project": "/kaggle/working/",
    "name": "yolo_finetuned",
}
results = model.train(**train_settings)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m984.0/984.0 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

 97%|█████████▋| 3695/3796 [01:13<00:02, 50.27it/s]sh: 1: Syntax error: "(" unexpected
100%|██████████| 3796/3796 [01:16<00:00, 49.93it/s]


Ultralytics 8.3.116 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=/kaggle/input/yolo/other/default/1/best.pt, data=/kaggle/working/yolo_dataset/data.yaml, epochs=5, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=cuda, workers=2, project=/kaggle/working/, name=yolo_finetuned, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, sav

100%|██████████| 755k/755k [00:00<00:00, 24.4MB/s]



                   from  n    params  module                                       arguments                     
  0                  -1  1      2320  ultralytics.nn.modules.conv.Conv             [3, 80, 3, 2]                 
  1                  -1  1    115520  ultralytics.nn.modules.conv.Conv             [80, 160, 3, 2]               
  2                  -1  3    436800  ultralytics.nn.modules.block.C2f             [160, 160, 3, True]           
  3                  -1  1    461440  ultralytics.nn.modules.conv.Conv             [160, 320, 3, 2]              
  4                  -1  6   3281920  ultralytics.nn.modules.block.C2f             [320, 320, 6, True]           
  5                  -1  1   1844480  ultralytics.nn.modules.conv.Conv             [320, 640, 3, 2]              
  6                  -1  6  13117440  ultralytics.nn.modules.block.C2f             [640, 640, 6, True]           
  7                  -1  1   3687680  ultralytics.nn.modules.conv.Conv             [640

100%|██████████| 5.35M/5.35M [00:00<00:00, 108MB/s]


[34m[1mAMP: [0mchecks passed ✅
[34m[1mtrain: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 2539.0±1755.1 MB/s, size: 350.2 KB)


[34m[1mtrain: [0mScanning /kaggle/working/yolo_dataset/labels/train... 3795 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3795/3795 [00:02<00:00, 1512.90it/s]

[34m[1mtrain: [0m/kaggle/working/yolo_dataset/images/train/XYGOC20200805142211123-1.jpg: 2 duplicate labels removed





[34m[1mtrain: [0mNew cache created: /kaggle/working/yolo_dataset/labels/train.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 3490.3±1561.8 MB/s, size: 360.5 KB)


[34m[1mval: [0mScanning /kaggle/working/yolo_dataset/labels/train.cache... 3795 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3795/3795 [00:00<?, ?it/s]

[34m[1mtrain: [0m/kaggle/working/yolo_dataset/images/train/XYGOC20200805142211123-1.jpg: 2 duplicate labels removed





Plotting labels to /kaggle/working/yolo_finetuned/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=8.5e-05, momentum=0.9) with parameter groups 97 weight(decay=0.0), 104 weight(decay=0.0005), 103 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1m/kaggle/working/yolo_finetuned[0m
Starting training for 5 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/5      6.68G     0.3102     0.1987     0.8285         42        640: 100%|██████████| 475/475 [05:44<00:00,  1.38it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:47<00:00,  2.21it/s]


                   all       3795      18992      0.996      0.996      0.995      0.981

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/5      7.25G     0.2803     0.1759     0.8229         57        640: 100%|██████████| 475/475 [05:52<00:00,  1.35it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:46<00:00,  2.23it/s]


                   all       3795      18992      0.994      0.998      0.995      0.982

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/5      7.27G     0.2821     0.1726     0.8241          2        640: 100%|██████████| 475/475 [05:50<00:00,  1.36it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:46<00:00,  2.23it/s]


                   all       3795      18992      0.997      0.997      0.995      0.983

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        4/5      7.26G     0.2732     0.1733     0.8203         55        640: 100%|██████████| 475/475 [05:49<00:00,  1.36it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:46<00:00,  2.23it/s]


                   all       3795      18992      0.997      0.998      0.995      0.983

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        5/5      7.31G     0.2709     0.1659     0.8218         54        640: 100%|██████████| 475/475 [05:51<00:00,  1.35it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:46<00:00,  2.23it/s]


                   all       3795      18992      0.998      0.998      0.995      0.985

5 epochs completed in 0.638 hours.
Optimizer stripped from /kaggle/working/yolo_finetuned/weights/last.pt, 136.9MB
Optimizer stripped from /kaggle/working/yolo_finetuned/weights/best.pt, 136.9MB

Validating /kaggle/working/yolo_finetuned/weights/best.pt...
Ultralytics 8.3.116 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 112 layers, 68,232,387 parameters, 0 gradients, 258.0 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 238/238 [01:46<00:00,  2.23it/s]


                   all       3795      18992      0.998      0.998      0.995      0.985
                 3+2-2         44        126      0.998          1      0.995      0.975
                 3jia2         46        124          1      0.997      0.995       0.98
              aerbeisi         22         38      0.997          1      0.995      0.988
                anmuxi         70        107      0.999      0.991      0.995      0.972
                aoliao         56        613          1      0.995      0.995      0.942
                 asamu         88        246          1          1      0.995      0.991
                baicha         47        107      0.987          1      0.992      0.981
            baishikele         70        160          1      0.994      0.995      0.984
          baishikele-2         44         75      0.995          1      0.995      0.986
            baokuangli         59        179      0.999          1      0.995       0.99
           binghongch

  xa[xa < 0] = -1
  xa[xa < 0] = -1


Speed: 0.1ms preprocess, 24.0ms inference, 0.0ms loss, 0.9ms postprocess per image
Results saved to [1m/kaggle/working/yolo_finetuned[0m


In [None]:
## STEP 3: INFERENCE ON TEST DATASET

test_images = sorted(glob(f"{image_root}/Test/JPEGImages/*.jpg"))
conf_threshold = 0.4  # minimum confidence for a detection to be kept
iou_threshold = 0.4   # IoU threshold used by non-maximum suppression

# Load the best checkpoint of the training run executed above. Ultralytics
# appends a numeric suffix to the run directory when it already exists, so a
# hard-coded path can silently pick up weights from an older run. The original
# fixed path is kept as a fallback when no trainer checkpoint is available.
default_weights = "/kaggle/working/yolo_finetuned/weights/best.pt"
trainer = getattr(model, "trainer", None)
if trainer is not None and os.path.exists(trainer.best):
    best_weights = str(trainer.best)
else:
    best_weights = default_weights
final_model = YOLO(best_weights)

# One record per test image: {"image_id": ..., "prediction_string": ...}
submission_data = []

for img_path in tqdm(test_images):
    # predict() returns a list with one result per source image.
    result = final_model.predict(
        source=img_path,
        conf=conf_threshold,
        iou=iou_threshold,
        device=device,  # GPU if available
        verbose=False
    )[0]

    image_id = os.path.splitext(os.path.basename(img_path))[0]

    # Each detection is serialized as "<class> <score> <x1> <y1> <x2> <y2>";
    # detections of one image are joined with single spaces.
    detections = []
    for box in result.boxes:
        cls_id = int(box.cls.item())
        score = box.conf.item()
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        detections.append(
            f"{id2class[cls_id]} {score:.4f} {int(x1)} {int(y1)} {int(x2)} {int(y2)}"
        )

    submission_data.append({
        "image_id": image_id,
        "prediction_string": " ".join(detections)
    })

100%|██████████| 542/542 [00:34<00:00, 15.52it/s]


✅ Final submission saved to /kaggle/working/submission10.csv


In [None]:
## STEP 4: SUBMISSION FILE

# Build the submission table. The columns are named explicitly so the CSV keeps
# its header even when no test images were found (empty `submission_data`).
submission_df = pd.DataFrame(submission_data, columns=["image_id", "prediction_string"])
submission_df.to_csv("./output/submission.csv", index=False)
submission_df.head()

Unnamed: 0,image_id,prediction_string
0,XYG2020121711403780263182_81,damaicha 0.9782 540 109 631 194 damaicha 0.976...
1,XYG2020121811045959626219_81,youlemei 0.9773 286 151 367 227 hongniu2 0.977...
2,XYG2020121812002586066796_81,baicha 0.9797 299 120 401 225 moliqingcha 0.97...
3,XYG2020122214431356849310_81,yida 0.9795 454 431 513 489 damaicha 0.9790 65...
4,XYG2020122214522128735418_81,yida 0.9765 440 313 504 374 baishikele-2 0.976...
