In [8]:
! pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.202-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

In [1]:
import os
import json
import shutil
from PIL import Image
from tqdm import tqdm

label2id = {
    "abdominal_wall_cavity": 2,
    "cystic_duct": 5,
    "cystic_plate": 0,
    "gallbladder": 1,
    "gut": 6,
    "liver": 4,
    "omentum": 3,
    "bipolar": 7,
    "clipper": 8,
    "grasper": 9,
    "hook": 10,
    "irrigator": 11,
    "scissors": 12,
    "specimenbag": 13,
}
id2label = {v: k for k, v in label2id.items()}


def build_yolo_dataset_from_ssgvqa(json_dir, qa_dir, img_root, split_ids, out_img_dir, out_lbl_dir):
    """
    Tạo dataset YOLO trực tiếp từ SSG-VQA Dataset
    - Resize ảnh về (860, 480)
    - Nhân đôi bbox trước khi convert sang YOLO format
    """

    os.makedirs(out_img_dir, exist_ok=True)
    os.makedirs(out_lbl_dir, exist_ok=True)

    for vid_id in split_ids:
        files = sorted(os.listdir(os.path.join(qa_dir, vid_id)))
        file_ids = [int(f.split(".")[0]) for f in files]

        for file_id in tqdm(file_ids, desc=f"Processing {vid_id}"):
            json_path = os.path.join(json_dir, f"{vid_id}_{file_id}.json")
            with open(json_path, "r") as f:
                data = json.load(f)

            scene = data["scenes"][0]
            img_name = scene["image_filename"]  # VIDxx_yyyy
            vid, frame_id = img_name.split("_")
            img_path = os.path.join(img_root, vid, frame_id.rjust(6, "0") + ".png")

            # resize ảnh về 860x480
            img = Image.open(img_path).convert("RGB").resize((860, 480))
            save_img_name = f"{vid_id}_{file_id}.jpg"
            save_img_path = os.path.join(out_img_dir, save_img_name)
            img.save(save_img_path)

            # tạo YOLO label
            h, w = 480, 860
            yolo_lines = []
            for obj in scene["objects"]:
                x1, y1, x2, y2 = obj["bbox"]
                w_box, h_box = x2 - x1, y2 - y1

                # bbox nhân đôi trước
                x1, y1, w_box, h_box = x1*2, y1*2, w_box*2, h_box*2

                # YOLO format normalized
                x_center = (x1 + w_box/2) / w
                y_center = (y1 + h_box/2) / h
                bw = w_box / w
                bh = h_box / h

                cls_id = label2id[obj["component"]]
                yolo_lines.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {bw:.6f} {bh:.6f}")

            # lưu .txt label
            save_lbl_path = os.path.join(out_lbl_dir, save_img_name.replace(".jpg", ".txt"))
            with open(save_lbl_path, "w") as f:
                f.write("\n".join(yolo_lines))



# -------------------------
# Create YAML file
# -------------------------
def create_yaml(out_path="surgical_ssg.yaml", dataset_path="dataset_yolo"):
    names = [id2label[i] for i in range(len(id2label))]
    yaml_str = f"""
path: {dataset_path}
train: images/train
val: images/val
test: images/test

nc: {len(names)}
names: {names}
"""
    with open(out_path, "w") as f:
        f.write(yaml_str)
    print(f"✅ YAML file saved to {out_path}")

In [15]:
from ultralytics import YOLO

# -------------------------
# Train with pretrained weights (support resume)
# -------------------------
def train_yolo_with_weights(
    data_yaml,
    weight_path="yolov8n.pt",
    epochs=50,
    batch=16,
    imgsz=(480, 860),
    freeze=None,
    resume=False
):
    if resume:
        print("🔄 Resuming training from last checkpoint...")
        model = YOLO(weight_path)  # weight_path = runs/detect/trainX/weights/last.pt
        results = model.train(resume=True)
    else:
        model = YOLO(weight_path)
        results = model.train(
            data=data_yaml,
            epochs=epochs,
            batch=batch,
            imgsz=imgsz,
            freeze=freeze
        )
    return model, results

In [11]:
json_dir = "/kaggle/input/scene-graph-ssg-vqa/scene_graph"
qa_dir   = "/kaggle/input/ssg-vqa/qa_txt/qa_txt"
img_root = "/kaggle/input/cholect45/CholecT45/data"
dataset_root = "/kaggle/working/dataset"
yaml_path = "/kaggle/working/surgical_ssg.yaml"

In [3]:
## Các split theo video ID
train_seq = [
    "VID73","VID40","VID62","VID42","VID29","VID56","VID50","VID78",
    "VID66","VID13","VID52","VID06","VID36","VID05","VID12","VID26",
    "VID68","VID32","VID49","VID65","VID47","VID04","VID23","VID79",
    "VID51","VID10","VID57","VID75","VID25","VID14","VID15","VID08",
    "VID80","VID27","VID70"
]
val_seq = ["VID18","VID48","VID01","VID35","VID31"]
test_seq = ["VID22","VID74","VID60","VID02","VID43"]

# train
build_yolo_dataset_from_ssgvqa(
    json_dir, qa_dir, img_root, train_seq,
    out_img_dir="/kaggle/working/dataset/images/train",
    out_lbl_dir="/kaggle/working/dataset/labels/train"
)

# val
build_yolo_dataset_from_ssgvqa(
    json_dir, qa_dir, img_root, val_seq,
    out_img_dir="/kaggle/working/dataset/images/val",
    out_lbl_dir="/kaggle/working/dataset/labels/val"
)

# test
build_yolo_dataset_from_ssgvqa(
    json_dir, qa_dir, img_root, test_seq,
    out_img_dir="/kaggle/working/dataset/images/test",
    out_lbl_dir="/kaggle/working/dataset/labels/test"
)

Processing VID73: 100%|██████████| 394/394 [00:15<00:00, 25.86it/s]
Processing VID40: 100%|██████████| 625/625 [00:24<00:00, 25.96it/s]
Processing VID62: 100%|██████████| 422/422 [00:16<00:00, 25.96it/s]
Processing VID42: 100%|██████████| 851/851 [00:30<00:00, 27.54it/s]
Processing VID29: 100%|██████████| 663/663 [00:25<00:00, 26.15it/s]
Processing VID56: 100%|██████████| 453/453 [00:16<00:00, 27.38it/s]
Processing VID50: 100%|██████████| 254/254 [00:09<00:00, 27.98it/s]
Processing VID78: 100%|██████████| 247/247 [00:26<00:00,  9.37it/s]
Processing VID66: 100%|██████████| 502/502 [00:18<00:00, 27.83it/s]
Processing VID13: 100%|██████████| 260/260 [00:09<00:00, 28.80it/s]
Processing VID52: 100%|██████████| 689/689 [00:25<00:00, 27.36it/s]
Processing VID06: 100%|██████████| 553/553 [00:19<00:00, 28.45it/s]
Processing VID36: 100%|██████████| 626/626 [00:23<00:00, 26.21it/s]
Processing VID05: 100%|██████████| 943/943 [00:32<00:00, 28.89it/s]
Processing VID12: 100%|██████████| 342/342 [00:1

✅ YAML file saved to /kaggle/working/surgical_ssg.yaml





In [12]:
# tạo file YAML
create_yaml(out_path="/kaggle/working/surgical_ssg.yaml", dataset_path="/kaggle/working/dataset")

✅ YAML file saved to /kaggle/working/surgical_ssg.yaml


In [17]:
model, results = train_yolo_with_weights(
    data_yaml="/kaggle/working/surgical_ssg.yaml",
    weight_path="/kaggle/input/yolov8/other/n/1/best.pt",  # checkpoint trước
    epochs=50,
    batch=16,
    imgsz=(480, 860)
)

Ultralytics 8.3.202 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/kaggle/working/surgical_ssg.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=(480, 860), int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=/kaggle/input/yolov8/other/n/1/best.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train3, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, ov

  xa[xa < 0] = -1
  xa[xa < 0] = -1


                   all       2897      19332      0.762      0.579      0.643      0.417
          cystic_plate       2155       2155      0.724      0.678      0.729      0.385
           gallbladder       2363       2363      0.691      0.373      0.488      0.251
 abdominal_wall_cavity       2122       2122      0.757       0.49      0.588      0.405
               omentum       2538       2538      0.667      0.605      0.666      0.406
                 liver       2831       2831      0.912      0.929      0.959      0.871
           cystic_duct        148        148          1          0    0.00514    0.00166
                   gut       1670       1670      0.547      0.131      0.182     0.0869
               bipolar        148        149      0.835      0.819      0.854      0.532
               clipper        185        188      0.668      0.782      0.785      0.518
               grasper       2320       2675      0.768      0.627      0.727       0.44
                  hoo

In [20]:
!tar -czf working_backup.tar.gz /kaggle/working

tar: Removing leading `/' from member names
tar: /kaggle/working/working_backup.tar.gz: file changed as we read it


In [21]:
from IPython.display import FileLink
FileLink("working_backup.tar.gz")

In [23]:
# -------------------------
# Evaluate
# -------------------------
def evaluate_yolo(model, data_yaml, split="test", imgsz=(480, 860)):
    results = model.val(data=data_yaml, split=split, imgsz=imgsz)
    print("📊 Evaluation results:", results)
    return results


# -------------------------
# Predict
# -------------------------
def predict_yolo(model, source, imgsz=(480, 860), conf=0.25):
    preds = model.predict(source=source, save=True, imgsz=imgsz, conf=conf)
    print(f"🖼 Predictions saved to {preds[0].save_dir}")
    return preds



# evaluate
evaluate_yolo(model, data_yaml=yaml_path, split="test", imgsz=(480,860))

Ultralytics 8.3.202 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
Model summary (fused): 72 layers, 3,008,378 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 689.9±224.0 MB/s, size: 35.5 KB)
[K[34m[1mval: [0mScanning /kaggle/working/dataset/labels/test... 2177 images, 0 backgrounds, 3 corrupt: 100% ━━━━━━━━━━━━ 2177/2177 1.5Kit/s 1.5s0.1s
[34m[1mval: [0m/kaggle/working/dataset/images/test/VID60_176.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0651]
[34m[1mval: [0m/kaggle/working/dataset/images/test/VID60_184.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.2917]
[34m[1mval: [0m/kaggle/working/dataset/images/test/VID60_229.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0917]
[34m[1mval: [0mNew cache created: /kaggle/working/dataset/labels/test.cache
[K                 Class

  xa[xa < 0] = -1
  xa[xa < 0] = -1


                   all       2174      14371      0.722      0.578      0.616      0.375
          cystic_plate       1349       1349      0.647      0.471      0.512       0.23
           gallbladder       1725       1725        0.6      0.424      0.477      0.241
 abdominal_wall_cavity       1911       1911      0.715       0.54      0.615       0.42
               omentum       1886       1886      0.648      0.515      0.582      0.333
                 liver       2150       2150      0.923      0.924      0.959      0.829
           cystic_duct         21         21          1          0    0.00376    0.00128
                   gut       1518       1518      0.478      0.236      0.258      0.127
               bipolar        205        206      0.915      0.835      0.906      0.547
               clipper         93         93       0.67      0.763      0.745      0.459
               grasper       1656       1781        0.7      0.529      0.613      0.338
                  hoo

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7bb75126cd50>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.0

In [24]:
# predict
predict_yolo(model, source=os.path.join(dataset_root,"images/test"), imgsz=(480,860))


inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

image 1/2177 /kaggle/working/dataset/images/test/VID02_102.jpg: 480x864 1 abdominal_wall_cavity, 1 liver, 1 grasper, 1 hook, 5.6ms
image 2/2177 /kaggle/working/dataset/images/test/VID02_1047.jpg: 480x864 1 cystic_plate, 1 gallbladder, 1 liver, 1 hook, 6.0ms
image 3/2177 /kaggle/working/dataset/images/test/VID02_105.jpg: 480x864 1 abdominal_wall_cavity, 2 omentums, 1 liver, 1 grasper, 1 hook, 5.9ms
image 4/2177 /kaggle/working/dataset/images/test/VID02_1053.jpg: 48

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'cystic_plate', 1: 'gallbladder', 2: 'abdominal_wall_cavity', 3: 'omentum', 4: 'liver', 5: 'cystic_duct', 6: 'gut', 7: 'bipolar', 8: 'clipper', 9: 'grasper', 10: 'hook', 11: 'irrigator', 12: 'scissors', 13: 'specimenbag'}
 obb: None
 orig_img: array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
  