In [4]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# Workspace root; every other path in this notebook is derived from it.
PROJECT_ROOT = r"C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA"

# Inputs: the parsed annotation table and the original BDD image tree.
ANNOT_CSV = os.path.join(PROJECT_ROOT, "eda_outputs", "tables", "parsed_annotations.csv")
IMG_ROOT = os.path.join(PROJECT_ROOT, "data", "images")

# Output: root of the YOLO-formatted dataset.
YOLO_ROOT = os.path.join(PROJECT_ROOT, "yolo_dataset")

# <YOLO_ROOT>/{images,labels}/{train,val}; the nested iteration order matches
# the order of the names on the left-hand side.
TRAIN_IMG_DIR, VAL_IMG_DIR, TRAIN_LBL_DIR, VAL_LBL_DIR = (
    os.path.join(YOLO_ROOT, kind, split)
    for kind in ("images", "labels")
    for split in ("train", "val")
)

# makedirs also creates missing parents, so YOLO_ROOT itself is created here.
for d in (TRAIN_IMG_DIR, VAL_IMG_DIR, TRAIN_LBL_DIR, VAL_LBL_DIR):
    os.makedirs(d, exist_ok=True)

In [3]:
# Load the parsed annotation table: one row per bounding box, with the image
# file name, split, category and pixel-space corners (x1, y1, x2, y2).
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably the index
# saved when the CSV was written; it is not used by the cells below.
df = pd.read_csv(ANNOT_CSV)
# Preview the first rows to confirm the expected columns were read.
df.head()

Unnamed: 0,image,split,category,x1,y1,x2,y2,width,height,aspect_ratio,occluded,truncated,weather,scene,timeofday
0,0000f77c-6257be58.jpg,train,traffic light,1125.902264,133.184488,1156.978645,210.875445,31.076381,77.690957,0.4,False,False,clear,city street,daytime
1,0000f77c-6257be58.jpg,train,traffic light,1156.978645,136.637417,1191.50796,210.875443,34.529315,74.238026,0.465116,False,False,clear,city street,daytime
2,0000f77c-6257be58.jpg,train,traffic sign,1101.731743,211.122087,1170.79037,233.566141,69.058627,22.444054,3.076923,False,False,clear,city street,daytime
3,0000f77c-6257be58.jpg,train,traffic sign,0.0,0.246631,100.381647,122.825696,100.381647,122.579065,0.818913,False,True,clear,city street,daytime
4,0000f77c-6257be58.jpg,train,car,45.240919,254.530367,357.805838,487.906215,312.564919,233.375848,1.33932,False,False,clear,city street,daytime


In [4]:
# Detection categories; a name's position in this list is its YOLO class id.
CLASSES = [
    "bike",
    "bus",
    "car",
    "motor",
    "person",
    "rider",
    "traffic light",
    "traffic sign",
    "train",
    "truck",
]

# Category name -> integer class id, derived from the list order above.
CLASS2ID = dict(zip(CLASSES, range(len(CLASSES))))
CLASS2ID

{'bike': 0,
 'bus': 1,
 'car': 2,
 'motor': 3,
 'person': 4,
 'rider': 5,
 'traffic light': 6,
 'traffic sign': 7,
 'train': 8,
 'truck': 9}

In [5]:
def convert_bbox_to_yolo(x1, y1, x2, y2, img_w=1280, img_h=720):
    """
    Convert a pixel-space corner box into a normalized YOLO box.

    BDD100K images are 1280x720 by default.

    The corners are put in ascending order and clamped to the image frame
    before normalizing, so a box that overhangs the frame or is given with
    swapped corners still yields values inside [0, 1] and non-negative sizes.
    Boxes that already lie inside the frame are converted exactly as before.

    Args:
        x1, y1: One corner of the box, in pixels.
        x2, y2: The opposite corner of the box, in pixels.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        Tuple ``(cx, cy, w, h)``: box centre and size, each divided by the
        corresponding image dimension.

    Raises:
        ValueError: If ``img_w`` or ``img_h`` is not positive.
    """
    if img_w <= 0 or img_h <= 0:
        raise ValueError(f"Image size must be positive, got {img_w}x{img_h}")

    # Order each axis so width/height can never come out negative.
    left, right = sorted((x1, x2))
    top, bottom = sorted((y1, y2))

    # Clamp to the frame so the normalized values stay within [0, 1].
    left = min(max(left, 0), img_w)
    right = min(max(right, 0), img_w)
    top = min(max(top, 0), img_h)
    bottom = min(max(bottom, 0), img_h)

    bbox_w = right - left
    bbox_h = bottom - top
    cx = left + bbox_w / 2
    cy = top + bbox_h / 2

    # Normalize
    return (
        cx / img_w,
        cy / img_h,
        bbox_w / img_w,
        bbox_h / img_h
    )

In [6]:
def write_yolo_labels(df_split, label_dir):
    """
    Write one YOLO label file per image in ``df_split``.

    Rows are grouped by the "image" column. Each group becomes
    ``<label_dir>/<image stem>.txt`` with one line per box in the form
    ``class_id cx cy w h`` (six decimal places).

    Args:
        df_split: DataFrame with columns "image", "category", "x1", "y1",
            "x2", "y2".
        label_dir: Existing directory that receives the ``.txt`` files.

    Raises:
        KeyError: If a row's category is not a key of ``CLASS2ID``.
    """
    grouped = df_split.groupby("image")

    for img_name, g in tqdm(grouped, desc=f"Writing YOLO labels -> {label_dir}"):
        # splitext swaps only the real extension, so names such as "a.jpg.b.jpg"
        # or "a.png" still map to a proper ".txt" label file.
        stem, _ = os.path.splitext(img_name)
        txt_path = os.path.join(label_dir, stem + ".txt")

        # Zipping the needed columns avoids building a Series per row, which
        # iterrows() would do for every box.
        box_columns = zip(g["category"], g["x1"], g["y1"], g["x2"], g["y2"])

        lines = []
        for category, x1, y1, x2, y2 in box_columns:
            cls_id = CLASS2ID[category]
            cx, cy, w, h = convert_bbox_to_yolo(x1, y1, x2, y2)
            lines.append(f"{cls_id} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}")

        with open(txt_path, "w") as f:
            f.write("\n".join(lines))

In [7]:
# Partition the annotation rows by their "split" value.
df_train, df_val = (
    df[df["split"] == split_name] for split_name in ("train", "val")
)

# Row (bounding-box) counts per split.
len(df_train), len(df_val)

(1286871, 185526)

In [8]:
def copy_split_images(df_split, src_img_dir, dst_dir):
    """
    Copy every distinct image named in ``df_split`` from ``src_img_dir``
    into ``dst_dir``.

    A file that already exists at the destination is left untouched, so the
    function can be re-run without copying everything again.
    """
    unique_names = df_split["image"].unique()

    for fname in tqdm(unique_names, desc=f"Copy images → {dst_dir}"):
        target = os.path.join(dst_dir, fname)

        # Already copied on an earlier run: nothing to do.
        if os.path.exists(target):
            continue

        shutil.copy(os.path.join(src_img_dir, fname), target)

In [9]:
# Stage 1: copy the images of each split (train first, then val).
for tag, frame, subdir, img_dst in (
    ("TRAIN", df_train, "train", TRAIN_IMG_DIR),
    ("VAL", df_val, "val", VAL_IMG_DIR),
):
    print(f"➡️ Copying {tag} images...")
    copy_split_images(frame, os.path.join(IMG_ROOT, subdir), img_dst)

# Stage 2: write the label files of each split, in the same order.
for tag, frame, lbl_dst in (
    ("TRAIN", df_train, TRAIN_LBL_DIR),
    ("VAL", df_val, VAL_LBL_DIR),
):
    print(f"➡️ Writing {tag} labels...")
    write_yolo_labels(frame, lbl_dst)

print("✅ YOLO dataset preparation complete!")

➡️ Copying TRAIN images...


Copy images → C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA\yolo_dataset\images\train: 100%|██████████| 69863/69863 [05:39<00:00, 206.06it/s] 


➡️ Copying VAL images...


Copy images → C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA\yolo_dataset\images\val: 100%|██████████| 10000/10000 [01:24<00:00, 118.26it/s]


➡️ Writing TRAIN labels...


Writing YOLO labels -> C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA\yolo_dataset\labels\train: 100%|██████████| 69863/69863 [01:40<00:00, 698.03it/s]


➡️ Writing VAL labels...


Writing YOLO labels -> C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA\yolo_dataset\labels\val: 100%|██████████| 10000/10000 [00:18<00:00, 538.53it/s]

✅ YOLO dataset preparation complete!





In [6]:
# Dataset description file consumed by the YOLO trainer.
yaml_path = os.path.join(YOLO_ROOT, "bdd100k_yolo.yaml")

# Forward slashes keep the Windows path free of backslash sequences in YAML.
yr = YOLO_ROOT.replace("\\", "/")

# Generate the id -> name mapping from CLASSES, the same list that defines the
# class ids written to the label files, so the two can never drift apart.
names_block = "\n".join(f"  {idx}: {name}" for idx, name in enumerate(CLASSES))

yaml_text = f"""
path: {yr}

train: images/train
val: images/val

names:
{names_block}
"""

# Explicit UTF-8 so a non-ASCII character in the path is written the same way
# regardless of the platform's default encoding.
with open(yaml_path, "w", encoding="utf-8") as f:
    f.write(yaml_text.strip())

yaml_path

'C:\\Users\\niran\\OneDrive\\Desktop\\Bosch\\Assignment\\EDA\\yolo_dataset\\bdd100k_yolo.yaml'

In [7]:
# Sanity check: report how many entries each dataset folder contains.
for caption, folder in (
    ("TRAIN Images:", TRAIN_IMG_DIR),
    ("VAL Images:", VAL_IMG_DIR),
    ("TRAIN Labels:", TRAIN_LBL_DIR),
    ("VAL Labels:", VAL_LBL_DIR),
):
    print(caption, len(os.listdir(folder)))

# Blank separator line, then the location of the dataset YAML.
print()
print("YAML file created at:", yaml_path)

TRAIN Images: 69863
VAL Images: 10000
TRAIN Labels: 69863
VAL Labels: 10000

YAML file created at: C:\Users\niran\OneDrive\Desktop\Bosch\Assignment\EDA\yolo_dataset\bdd100k_yolo.yaml
