## DvXray Full Processing Pipeline  
**Author:** Soon-Hyuck Lee  
**Dataset:** Dual-View X-ray Baggage Detection (DvXray)

This notebook:

1. Loads raw DvXray dataset (Positive + Negative)
2. Creates train/val split
3. Generates YOLO dataset (images + labels)
4. Generates COCO dataset (images + annotations)
5. Generates Classification dataset (images + .npy label vectors)
6. Handles dual-view OL/SD images
7. Handles “difficult” and missing bounding boxes (they are skipped, so no box is written for that view)
8. Supports negative samples

Run each cell in order.


### Imports & Config

In [1]:
import os
import json
import shutil
import random
import numpy as np
from tqdm import tqdm

# Root of the on-disk data tree; every other path below is derived from it.
BASE = "./data"

# Raw inputs: one folder for positive samples, one for negative samples.
RAW_POS = BASE + "/raw/DvXray_Positive_Samples"
RAW_NEG = BASE + "/raw/DvXray_Negative_Samples"

# Processed outputs, one root per target format. The train/val split lists
# are stored inside the classification output tree.
CLS_OUT = BASE + "/processed/classification"
SPLIT_DIR = CLS_OUT + "/split"
YOLO_OUT = BASE + "/processed/yolo"
COCO_OUT = BASE + "/processed/coco"

# Fixed image size (pixels) used by the YOLO and COCO conversion cells.
IMG_W, IMG_H = 800, 600

# Fraction of all sample IDs assigned to the training split.
TRAIN_RATIO = 0.8

# Seed Python's global `random` module, which the split cell samples from.
random.seed(19)

# Class name -> integer class index, numbered in listing order starting at 0.
prohibited_item_classes = {
    name: index
    for index, name in enumerate([
        'Gun', 'Knife', 'Wrench', 'Pliers', 'Scissors',
        'Lighter', 'Battery', 'Bat', 'Razor_blade',
        'Saw_blade', 'Fireworks', 'Hammer',
        'Screwdriver', 'Dart', 'Pressure_vessel',
    ])
}

print("Config loaded.")


Config loaded.


### Utility: Normalize BBox

In [2]:
def normalize_bbox(bb, json_path, view_name):
    """Normalize a raw bbox annotation into ``[x1, y1, x2, y2]`` or return None.

    Parameters
    ----------
    bb : object
        Raw annotation value. Accepted forms are a flat sequence of
        coordinates, a singly nested sequence ``[[x1, y1, x2, y2]]``,
        ``None``, an empty sequence, or the marker string ``"difficult"``.
    json_path : str
        Annotation file the value came from; used only in warning messages.
    view_name : str
        Name of the view or field being parsed; used only in warning messages.

    Returns
    -------
    list | tuple | None
        The first four coordinates, or ``None`` when the value carries no
        usable box (missing, "difficult", empty, wrong length, or not made
        of numbers).
    """
    # Missing boxes and boxes flagged as "difficult" carry no geometry.
    if bb is None or bb == "difficult":
        return None

    # Anything that is not a list/tuple cannot be a box. Without this guard a
    # differently spelled marker string would be sliced to its first four
    # characters and returned as if it were coordinates, and a bare number
    # would raise on len().
    if not isinstance(bb, (list, tuple)):
        print(f"[WARN] {json_path}: {view_name} INVALID ({bb}) → skipping.")
        return None

    if len(bb) == 0:
        return None

    # Unwrap the singly nested form [[x1, y1, x2, y2]].
    if len(bb) == 1 and isinstance(bb[0], (list, tuple)):
        bb = bb[0]

    if len(bb) > 4:
        print(f"[WARN] {json_path}: {view_name} has {len(bb)} values → truncating.")
        bb = bb[:4]
    if len(bb) != 4:
        print(f"[WARN] {json_path}: {view_name} INVALID ({bb}) → skipping.")
        return None

    # Callers do arithmetic on the coordinates, so reject non-numeric entries
    # here instead of letting them fail later. bool is excluded explicitly
    # because it is a subclass of int.
    if not all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in bb):
        print(f"[WARN] {json_path}: {view_name} INVALID ({bb}) → skipping.")
        return None

    return bb


### Utility: Classification Label Vector

In [3]:
def get_label_vector(json_path):
    """Return the multi-hot class vector for one annotation file.

    Parameters
    ----------
    json_path : str
        Path to a sample's JSON annotation. Its ``"objects"`` entry is either
        the string ``"None"`` (no prohibited items) or an iterable of dicts
        that each have a ``"label"`` key.

    Returns
    -------
    numpy.ndarray
        int32 vector with one slot per class index in
        ``prohibited_item_classes`` (15 for the current class map); a slot is
        1 when at least one object of that class is listed.

    Raises
    ------
    KeyError
        If an object's label is not a key of ``prohibited_item_classes``.
    """
    # Context manager so the annotation file is closed as soon as it is parsed
    # instead of whenever the handle happens to be garbage-collected.
    with open(json_path) as fh:
        objs = json.load(fh)["objects"]

    # Size the vector from the class map rather than a hard-coded 15 so the
    # two cannot drift apart.
    num_classes = max(prohibited_item_classes.values()) + 1
    arr = np.zeros(num_classes, dtype=np.int32)

    # Negative samples store the literal string "None": all zeros.
    if objs == "None":
        return arr

    for obj in objs:
        arr[prohibited_item_classes[obj["label"]]] = 1
    return arr


### Step 1: Create Train/Val Split

In [4]:
os.makedirs(SPLIT_DIR, exist_ok=True)

# Collect sample IDs (annotation file names without the ".json" extension)
# from both raw folders.
# - splitext removes only the trailing extension; str.replace would also strip
#   a ".json" that appears in the middle of a file name.
# - The set drops an ID that occurs in both folders, so it cannot be written
#   (and later processed) twice.
# - Sorting makes the sampling input independent of os.listdir() order.
all_ids = sorted({
    os.path.splitext(fname)[0]
    for folder in [RAW_POS, RAW_NEG]
    for fname in os.listdir(folder)
    if fname.endswith(".json")
})

n = len(all_ids)
k = int(n * TRAIN_RATIO)

# NOTE: this draws from the global `random` state seeded in the config cell.
# The split is only reproducible when the notebook is run top to bottom on a
# fresh kernel; re-running this cell alone yields a different split.
train_ids = set(random.sample(all_ids, k))

# Write both split lists in one pass, keeping the sorted ID order in each file.
with open(f"{SPLIT_DIR}/train.txt", "w") as train_file, \
        open(f"{SPLIT_DIR}/val.txt", "w") as val_file:
    for sid in all_ids:
        target = train_file if sid in train_ids else val_file
        target.write(sid + "\n")

print(f"Split complete. Total: {n}, Train: {k}, Val: {n-k}")


Split complete. Total: 16000, Train: 12800, Val: 3200


### Step 2: YOLO Dataset Generation

In [16]:
# Pre-create the YOLO layout: an images/ and a labels/ folder for each split.
for split in ("train", "val"):
    for kind in ("images", "labels"):
        os.makedirs(YOLO_OUT + "/" + kind + "/" + split, exist_ok=True)

def _yolo_line(cls_id, bb):
    """Format one YOLO label line: class id, box center and size scaled by IMG_W/IMG_H."""
    x1, y1, x2, y2 = bb
    cx = (x1 + x2) / 2 / IMG_W
    cy = (y1 + y2) / 2 / IMG_H
    w = (x2 - x1) / IMG_W
    h = (y2 - y1) / IMG_H
    return f"{cls_id} {cx} {cy} {w} {h}\n"


def convert_to_yolo(json_path, ol_label, sd_label):
    """Write the YOLO label files for both views of one sample.

    Parameters
    ----------
    json_path : str
        Path to the sample's JSON annotation. Its ``"objects"`` entry is
        either the string ``"None"`` or an iterable of dicts with the keys
        ``"label"``, ``"ol_bb"`` and ``"sd_bb"``.
    ol_label : str
        Output path of the label file for the OL view.
    sd_label : str
        Output path of the label file for the SD view.

    Both label files are always created (truncated if they exist), so a
    negative sample, or a sample whose boxes are all skipped by
    ``normalize_bbox``, ends up with empty label files.

    Raises
    ------
    KeyError
        If an object's label is not a key of ``prohibited_item_classes`` or a
        bbox key is absent from an object.
    """
    with open(json_path) as fh:
        objs = json.load(fh)["objects"]

    # The context manager closes both outputs on every exit path, including
    # an exception raised part-way through the object list.
    with open(ol_label, "w") as f_ol, open(sd_label, "w") as f_sd:
        if objs == "None":
            return

        for obj in objs:
            cls_id = prohibited_item_classes[obj["label"]]

            # Same conversion for both views; only the source key and the
            # destination file differ. OL is handled before SD.
            for bb_key, out_file in (("ol_bb", f_ol), ("sd_bb", f_sd)):
                bb = normalize_bbox(obj[bb_key], json_path, bb_key)
                if bb is not None:
                    out_file.write(_yolo_line(cls_id, bb))

print("YOLO functions loaded.")


YOLO functions loaded.


### Run YOLO Conversion

In [17]:
print("Creating YOLO dataset...")

for split in ["train", "val"]:
    # Read the split list with a context manager so the handle is closed
    # before the long copy loop starts.
    with open(f"{SPLIT_DIR}/{split}.txt") as split_file:
        ids = split_file.read().split()

    img_dir = f"{YOLO_OUT}/images/{split}"
    lbl_dir = f"{YOLO_OUT}/labels/{split}"

    for sid in tqdm(ids):
        # A sample is positive if its annotation exists in the positive
        # folder; every other ID is looked up in the negative folder.
        folder = RAW_POS if os.path.exists(f"{RAW_POS}/{sid}.json") else RAW_NEG
        json_path = f"{folder}/{sid}.json"

        # Copy both views of the sample next to each other.
        for view in ("OL", "SD"):
            shutil.copy(f"{folder}/{sid}_{view}.png", img_dir)

        convert_to_yolo(
            json_path,
            f"{lbl_dir}/{sid}_OL.txt",
            f"{lbl_dir}/{sid}_SD.txt"
        )

print("YOLO dataset complete.")


Creating YOLO dataset...


100%|██████████| 12800/12800 [00:20<00:00, 612.55it/s]
100%|██████████| 3200/3200 [00:05<00:00, 593.62it/s]

YOLO dataset complete.





### Step 3: COCO Conversion

In [18]:
# Pre-create the COCO layout: the annotations folder plus one image folder per split.
os.makedirs(COCO_OUT + "/annotations", exist_ok=True)
for split in ("train", "val"):
    os.makedirs(COCO_OUT + "/" + split, exist_ok=True)

def coco_generate(split_name):
    """Build the COCO image folder and annotation file for one split.

    Parameters
    ----------
    split_name : str
        Name of the split; ``{SPLIT_DIR}/{split_name}.txt`` is read for the
        sample IDs, images are copied to ``{COCO_OUT}/{split_name}`` and the
        result is written to
        ``{COCO_OUT}/annotations/instances_{split_name}.json``.

    Each sample contributes two image records (OL view, then SD view). Image
    and annotation IDs start at 1 and are local to the split. Boxes that
    ``normalize_bbox`` rejects produce no annotation, and negative samples
    produce image records only.

    Raises
    ------
    KeyError
        If an object's label is not a key of ``prohibited_item_classes``.
    """
    # Context managers close the split list and each annotation file right
    # after reading instead of leaving one handle per sample to the GC.
    with open(f"{SPLIT_DIR}/{split_name}.txt") as split_file:
        ids = split_file.read().split()
    out_img_dir = f"{COCO_OUT}/{split_name}"

    images = []
    annotations = []
    img_id = 1
    ann_id = 1

    for sid in tqdm(ids):
        folder = RAW_POS if os.path.exists(f"{RAW_POS}/{sid}.json") else RAW_NEG
        json_path = f"{folder}/{sid}.json"
        with open(json_path) as fh:
            objs = json.load(fh)["objects"]

        # Negative samples store the literal string "None"; treat that as an
        # empty object list so both views still get an image record.
        if objs == "None":
            objs = []

        for view, bb_key in (("OL", "ol_bb"), ("SD", "sd_bb")):
            fname = f"{sid}_{view}.png"
            shutil.copy(f"{folder}/{fname}", out_img_dir)

            images.append({
                "id": img_id,
                "file_name": fname,
                "width": IMG_W,
                "height": IMG_H
            })

            for obj in objs:
                cls = prohibited_item_classes[obj["label"]]
                bb = normalize_bbox(obj[bb_key], json_path, view)
                if bb is None:
                    continue

                # Source boxes are corner-format; COCO stores [x, y, w, h].
                x1, y1, x2, y2 = bb
                w = x2 - x1
                h = y2 - y1

                annotations.append({
                    "id": ann_id,
                    "image_id": img_id,
                    "category_id": cls,
                    "bbox": [x1, y1, w, h],
                    "area": w * h,
                    "iscrowd": 0
                })
                ann_id += 1

            img_id += 1

    # NOTE(review): category ids are the 0-based indices of the class map.
    # Some COCO consumers expect ids starting at 1 — confirm against the
    # training framework before relying on this file.
    coco_dict = {
        "images": images,
        "annotations": annotations,
        "categories": [
            {"id": cid, "name": name}
            for name, cid in prohibited_item_classes.items()
        ]
    }

    with open(f"{COCO_OUT}/annotations/instances_{split_name}.json", "w") as f:
        json.dump(coco_dict, f, indent=4)

    print(f"COCO {split_name} done: {len(images)} images, {len(annotations)} annotations.")


### Run COCO

In [19]:
# Generate the COCO images and annotation file for each split, train first.
for split_name in ("train", "val"):
    coco_generate(split_name)

print("COCO dataset complete.")


100%|██████████| 12800/12800 [00:53<00:00, 237.32it/s]


COCO train done: 25600 images, 8406 annotations.


100%|██████████| 3200/3200 [00:21<00:00, 145.73it/s]

COCO val done: 6400 images, 2177 annotations.
COCO dataset complete.





### Step 4: Classification Dataset

In [5]:
for split in ["train", "val"]:
    out_dir = f"{CLS_OUT}/{split}"
    os.makedirs(out_dir, exist_ok=True)

    # Read the split list with a context manager so the handle is closed
    # before the long copy loop starts.
    with open(f"{SPLIT_DIR}/{split}.txt") as split_file:
        ids = split_file.read().split()

    for sid in tqdm(ids):
        # A sample is positive if its annotation exists in the positive
        # folder; every other ID is looked up in the negative folder.
        folder = RAW_POS if os.path.exists(f"{RAW_POS}/{sid}.json") else RAW_NEG
        json_path = f"{folder}/{sid}.json"

        # One label vector per sample, shared by its two views.
        lbl = get_label_vector(json_path)
        np.save(f"{out_dir}/{sid}.npy", lbl)

        for view in ("OL", "SD"):
            shutil.copy(f"{folder}/{sid}_{view}.png", out_dir)

print("Classification dataset complete.")


100%|██████████| 12800/12800 [00:17<00:00, 736.91it/s]
100%|██████████| 3200/3200 [00:05<00:00, 602.63it/s]

Classification dataset complete.





## Pipeline Completed Successfully!

### ✔ YOLO dataset  
- data/processed/yolo/images/train  
- data/processed/yolo/labels/train  

### ✔ COCO dataset  
- data/processed/coco/train  
- data/processed/coco/annotations/instances_train.json  

### ✔ Classification dataset  
- data/processed/classification/train  
- One `.npy` multi-hot label vector per sample (`<sample_id>.npy`), stored next to its OL/SD images  