# Generate VidOR annotations for YOLOv5 training

In [None]:
import json
from pathlib import Path
from tqdm.notebook import tqdm_notebook as tqdm
from typing import List

# Locate the VidOR annotation JSON files; each split directory is searched one
# sub-directory level deep.
dataset_path = Path("/mnt/DATA/datasets/VidOR/")
training_annotation_path = dataset_path.joinpath("annotation", "training")
validation_annotation_path = dataset_path.joinpath("annotation", "validation")
training_annotation_files = [*training_annotation_path.glob("*/*.json")]
validation_annotation_files = [*validation_annotation_path.glob("*/*.json")]
annotation_files = [*training_annotation_files, *validation_annotation_files]
for split_files in (training_annotation_files, validation_annotation_files):
    print(len(split_files))

# Load a single training annotation and print an overview of its contents.
anno_file = training_annotation_files[1551]
print(anno_file)
with anno_file.open("r") as f:
    anno_json = json.loads(f.read())

print(anno_json.keys())
for list_key in ("subject/objects", "trajectories"):
    print(len(anno_json[list_key]))
print(anno_json["frame_count"])
# One category name per annotated subject/object.
for item in anno_json["subject/objects"]:
    print(item["category"])


In [None]:
# Human categories that are all folded into the single "person" class (index 0).
human_classes = ["adult", "child", "baby"]

# Detector classes; a class's position in this list is its label index.
object_classes = [
    "person",
    "car",
    "guitar",
    "chair",
    "handbag",
    "toy",
    "baby_seat",
    "cat",
    "bottle",
    "backpack",
    "motorcycle",
    "ball/sports_ball",
    "laptop",
    "table",
    "surfboard",
    "camera",
    "sofa",
    "screen/monitor",
    "bicycle",
    "vegetables",
    "dog",
    "fruits",
    "cake",
    "cellphone",
    "cup",
    "bench",
    "snowboard",
    "skateboard",
    "bread",
    "bus/truck",
    "ski",
    "suitcase",
    "stool",
    "bat",
    "elephant",
    "fish",
    "baby_walker",
    "dish",
    "watercraft",
    "scooter",
    "pig",
    "refrigerator",
    "horse",
    "crab",
    "bird",
    "piano",
    "cattle/cow",
    "lion",
    "chicken",
    "camel",
    "electric_fan",
    "toilet",
    "sheep/goat",
    "rabbit",
    "train",
    "penguin",
    "hamster/rat",
    "snake",
    "frisbee",
    "aircraft",
    "oven",
    "racket",
    "faucet",
    "antelope",
    "duck",
    "stop_sign",
    "sink",
    "kangaroo",
    "stingray",
    "turtle",
    "tiger",
    "crocodile",
    "bear",
    "microwave",
    "traffic_light",
    "panda",
    "leopard",
    "squirrel",
]

# Category name -> label index: every detector class maps to its list position,
# then each human category is added as an alias of index 0.
name_to_idx = dict(zip(object_classes, range(len(object_classes))))
name_to_idx.update(dict.fromkeys(human_classes, 0))
print(name_to_idx)


In [None]:
# Dataset-wide tallies, filled in by the scan over every annotation file below.
object_count = dict.fromkeys(object_classes, 0)
unknown_count = {}
generated_trace_count = 0
total_trace_count = 0
empty_frame_count = 0
manually_frame_count = 0
total_frame_count = 0

for anno_file in tqdm(annotation_files):
    with anno_file.open("r") as f:
        anno_json = json.load(f)

    # Per-category object counts; human categories are counted as "person" and
    # categories outside the class list are tallied separately.
    for item in anno_json["subject/objects"]:
        subject = item["category"]
        if subject in human_classes:
            object_count["person"] += 1
        elif subject in object_count:
            object_count[subject] += 1
        else:
            unknown_count[subject] = unknown_count.get(subject, 0) + 1

    # Per-frame statistics over the bounding-box traces.
    for frame_item in anno_json["trajectories"]:
        total_frame_count += 1
        if not frame_item:
            empty_frame_count += 1
            continue
        generated_in_frame = sum(
            1 for trajectories in frame_item if trajectories["generated"] == 1
        )
        generated_trace_count += generated_in_frame
        total_trace_count += len(frame_item)
        # The frame counts as manually annotated when at least one of its
        # boxes is not flagged as generated.
        manually_frame_count += int(generated_in_frame < len(frame_item))


print(object_count)
print(unknown_count)
print(f"Totally {total_frame_count} frames")
print(f"Totally {total_trace_count} traces")
print(f"Empty rate: {empty_frame_count / total_frame_count * 100:.2f}%")
print(
    f"Generated rate trace rate: {generated_trace_count / total_trace_count * 100:.2f}%"
)
print(
    f"Frames with at least one manually annotated bbox: {manually_frame_count / total_frame_count * 100:.2f}%"
)


In [None]:
def _yolo_label_line(label, bbox, width, height) -> str:
    """Format one bounding box as a YOLO label line.

    ``bbox`` holds corner coordinates (``xmin``/``ymin``/``xmax``/``ymax``);
    the line contains the box centre and size divided by ``width``/``height``.
    """
    box_width = (bbox["xmax"] - bbox["xmin"]) / width
    box_height = (bbox["ymax"] - bbox["ymin"]) / height
    x_center = ((bbox["xmax"] + bbox["xmin"]) / 2) / width
    y_center = ((bbox["ymax"] + bbox["ymin"]) / 2) / height
    return f"{label} {x_center} {y_center} {box_width} {box_height}\n"


def _image_path_for_label(label_file: Path) -> str:
    """Return the POSIX image path that corresponds to ``label_file``.

    Only the file suffix and the last path component named exactly ``labels``
    are rewritten. A plain substring replace would also corrupt any other part
    of the path that happens to contain "labels" or "txt". YOLOv5 maps an image
    back to its label by swapping the last ``/images/`` for ``/labels/``, so
    the last component is the one to change.
    """
    parts = list(label_file.with_suffix(".jpg").parts)
    for pos in range(len(parts) - 1, -1, -1):
        if parts[pos] == "labels":
            parts[pos] = "images"
            break
    return Path(*parts).as_posix()


def generate_yolov5_annotation_interval(
    annotation_files: List[Path], output_path: Path, trace_interval: int
) -> List[str]:
    """Write YOLO label files for every ``trace_interval``-th frame of each video.

    A label file is written for each sampled frame, including frames without
    any box (those files are empty). Files are placed under
    ``output_path / <parent of video_path> / <video_id>`` and named
    ``<video_id>_<frame number, 1-based, 6 digits>.txt``.

    Args:
        annotation_files: Annotation JSON files to convert.
        output_path: Root directory for the generated label files.
        trace_interval: Sampling step over the per-frame trajectory list.

    Returns:
        One newline-terminated image path per written label file.

    Raises:
        ValueError: If ``trace_interval`` is smaller than 1.
        KeyError: If an object category is missing from ``name_to_idx``.
    """
    if trace_interval < 1:
        raise ValueError(
            f"trace_interval must be a positive integer, got {trace_interval!r}"
        )

    image_filename_list = []
    for anno_file in tqdm(annotation_files):
        with anno_file.open("r") as f:
            anno_json = json.load(f)
        width = anno_json["width"]
        height = anno_json["height"]
        # Indexed by trace["tid"] below; assumes a tid equals the object's
        # position in "subject/objects" — TODO confirm against the dataset.
        id_to_label = [
            name_to_idx[obj["category"]] for obj in anno_json["subject/objects"]
        ]

        video_id = anno_json["video_id"]
        yolo_anno_path = output_path / Path(anno_json["video_path"]).parent / video_id
        yolo_anno_path.mkdir(parents=True, exist_ok=True)

        sampled_frames = anno_json["trajectories"][::trace_interval]
        for sample_idx, traces in enumerate(sampled_frames):
            # Recover the 1-based number of the frame within the full video.
            frame_number = sample_idx * trace_interval + 1
            yolo_anno_file = yolo_anno_path / f"{video_id}_{frame_number:06d}.txt"
            yolo_anno_file.write_text(
                "".join(
                    _yolo_label_line(
                        id_to_label[trace["tid"]], trace["bbox"], width, height
                    )
                    for trace in traces
                )
            )
            image_filename_list.append(_image_path_for_label(yolo_anno_file) + "\n")
    return image_filename_list


In [None]:
def _yolo_label_line(label, bbox, width, height) -> str:
    """Format one bounding box as a YOLO label line.

    ``bbox`` holds corner coordinates (``xmin``/``ymin``/``xmax``/``ymax``);
    the line contains the box centre and size divided by ``width``/``height``.
    """
    box_width = (bbox["xmax"] - bbox["xmin"]) / width
    box_height = (bbox["ymax"] - bbox["ymin"]) / height
    x_center = ((bbox["xmax"] + bbox["xmin"]) / 2) / width
    y_center = ((bbox["ymax"] + bbox["ymin"]) / 2) / height
    return f"{label} {x_center} {y_center} {box_width} {box_height}\n"


def _image_path_for_label(label_file: Path) -> str:
    """Return the POSIX image path that corresponds to ``label_file``.

    Only the file suffix and the last path component named exactly ``labels``
    are rewritten. A plain substring replace would also corrupt any other part
    of the path that happens to contain "labels" or "txt". YOLOv5 maps an image
    back to its label by swapping the last ``/images/`` for ``/labels/``, so
    the last component is the one to change.
    """
    parts = list(label_file.with_suffix(".jpg").parts)
    for pos in range(len(parts) - 1, -1, -1):
        if parts[pos] == "labels":
            parts[pos] = "images"
            break
    return Path(*parts).as_posix()


def generate_yolov5_annotation_manually_only(
    annotation_files: List[Path], output_path: Path
) -> List[str]:
    """Write YOLO label files only for fully manually annotated frames.

    A frame is kept when it has at least one box and none of its boxes is
    flagged ``generated == 1``. Files are placed under
    ``output_path / <parent of video_path> / <video_id>`` and named
    ``<video_id>_<frame number, 1-based, 6 digits>.txt``.

    Args:
        annotation_files: Annotation JSON files to convert.
        output_path: Root directory for the generated label files.

    Returns:
        One newline-terminated image path per written label file.

    Raises:
        KeyError: If an object category is missing from ``name_to_idx``.
    """
    image_filename_list = []
    for anno_file in tqdm(annotation_files):
        with anno_file.open("r") as f:
            anno_json = json.load(f)
        width = anno_json["width"]
        height = anno_json["height"]
        # Indexed by trace["tid"] below; assumes a tid equals the object's
        # position in "subject/objects" — TODO confirm against the dataset.
        id_to_label = [
            name_to_idx[obj["category"]] for obj in anno_json["subject/objects"]
        ]

        video_id = anno_json["video_id"]
        yolo_anno_path = output_path / Path(anno_json["video_path"]).parent / video_id
        yolo_anno_path.mkdir(parents=True, exist_ok=True)

        for frame_num, traces in enumerate(anno_json["trajectories"]):
            # NOTE: empty frames are deliberately not emitted; an earlier
            # variant of this condition also sampled some of them.
            if not traces or any(trace["generated"] == 1 for trace in traces):
                continue

            yolo_anno_file = yolo_anno_path / f"{video_id}_{frame_num + 1:06d}.txt"
            yolo_anno_file.write_text(
                "".join(
                    _yolo_label_line(
                        id_to_label[trace["tid"]], trace["bbox"], width, height
                    )
                    for trace in traces
                )
            )
            image_filename_list.append(_image_path_for_label(yolo_anno_file) + "\n")

    return image_filename_list


In [None]:
def _yolo_label_line(label, bbox, width, height) -> str:
    """Format one bounding box as a YOLO label line.

    ``bbox`` holds corner coordinates (``xmin``/``ymin``/``xmax``/``ymax``);
    the line contains the box centre and size divided by ``width``/``height``.
    """
    box_width = (bbox["xmax"] - bbox["xmin"]) / width
    box_height = (bbox["ymax"] - bbox["ymin"]) / height
    x_center = ((bbox["xmax"] + bbox["xmin"]) / 2) / width
    y_center = ((bbox["ymax"] + bbox["ymin"]) / 2) / height
    return f"{label} {x_center} {y_center} {box_width} {box_height}\n"


def _image_path_for_label(label_file: Path) -> str:
    """Return the POSIX image path that corresponds to ``label_file``.

    Only the file suffix and the last path component named exactly ``labels``
    are rewritten. A plain substring replace would also corrupt any other part
    of the path that happens to contain "labels" or "txt". YOLOv5 maps an image
    back to its label by swapping the last ``/images/`` for ``/labels/``, so
    the last component is the one to change.
    """
    parts = list(label_file.with_suffix(".jpg").parts)
    for pos in range(len(parts) - 1, -1, -1):
        if parts[pos] == "labels":
            parts[pos] = "images"
            break
    return Path(*parts).as_posix()


def generate_yolov5_annotation_vidhoi_only(
    dataset_path: Path, output_path: Path, train=True,
) -> List[str]:
    """Write YOLO label files for the frames listed in the VidHOI frame index.

    The frame index is read from
    ``dataset_path / "VidHOI_annotation" / {train,val}_frame_annots.json`` and
    the boxes from ``dataset_path / "annotation" / {training,validation}``.
    Each distinct (folder, video, frame) is processed once; frames without any
    box produce neither a label file nor a list entry.

    Args:
        dataset_path: Dataset root containing both annotation directories.
        output_path: Root directory for the generated label files.
        train: Select the training split when true, otherwise validation.

    Returns:
        One newline-terminated image path per written label file.

    Raises:
        KeyError: If an object category is missing from ``name_to_idx``.
    """
    if train:
        split_dir, vidhoi_file = "training", "train_frame_annots.json"
    else:
        split_dir, vidhoi_file = "validation", "val_frame_annots.json"
    annotation_path = dataset_path / "annotation" / split_dir
    vidhoi_annotation_path = dataset_path / "VidHOI_annotation" / vidhoi_file

    with vidhoi_annotation_path.open() as f:
        vidhoi_annotation_json = json.load(f)

    image_filename_list = []
    # Frames already handled, so a repeated entry is skipped even when the
    # repeats are not adjacent in the index.
    seen_frames = set()
    # (folder, video id) whose annotation is currently loaded; lets a run of
    # entries from the same video reuse one parsed JSON file.
    loaded_video = None
    for entry in tqdm(vidhoi_annotation_json):
        video_path = entry["video_folder"]
        video_id = entry["video_id"]
        frame_id = entry["frame_id"]

        frame_key = (video_path, video_id, frame_id)
        if frame_key in seen_frames:
            continue
        seen_frames.add(frame_key)

        if loaded_video != (video_path, video_id):
            yolo_anno_path = output_path / video_path / video_id
            yolo_anno_path.mkdir(parents=True, exist_ok=True)

            anno_file = annotation_path / video_path / f"{video_id}.json"
            with anno_file.open("r") as f:
                anno_json = json.load(f)
            width = anno_json["width"]
            height = anno_json["height"]
            # Indexed by trace["tid"] below; assumes a tid equals the object's
            # position in "subject/objects" — TODO confirm against the dataset.
            id_to_label = [
                name_to_idx[obj["category"]] for obj in anno_json["subject/objects"]
            ]
            trajectories = anno_json["trajectories"]
            loaded_video = (video_path, video_id)

        # frame_id is 1-based while the trajectory list is 0-based.
        frame_number = int(frame_id)
        traces = trajectories[frame_number - 1]
        if not traces:
            continue

        yolo_anno_file = yolo_anno_path / f"{video_id}_{frame_number:06d}.txt"
        yolo_anno_file.write_text(
            "".join(
                _yolo_label_line(
                    id_to_label[trace["tid"]], trace["bbox"], width, height
                )
                for trace in traces
            )
        )
        image_filename_list.append(_image_path_for_label(yolo_anno_file) + "\n")

    return image_filename_list


In [None]:
# print("Generating labels with interval 30...")
# output_path = dataset_path / "labels"
# # annotation_files_sorted = sorted(annotation_files, key=lambda x: str(x))
# image_filename_list = generate_yolov5_annotation_interval(
#     training_annotation_files, output_path, 30
# )
# print(f"Training: {len(image_filename_list)} frames")
# training_image_txt = dataset_path / "yolov5_train.txt"
# with training_image_txt.open("w") as f:
#     f.writelines(image_filename_list)

# image_filename_list = generate_yolov5_annotation_interval(
#     validation_annotation_files, output_path, 30
# )
# print(f"Validation: {len(image_filename_list)} frames")
# val_image_txt = dataset_path / "yolov5_val.txt"
# with val_image_txt.open("w") as f:
#     f.writelines(image_filename_list)


In [None]:
# Generate labels for manually annotated frames and store, per split, the list
# of matching image paths.
print("Generating labels with only manually labeled frames...")
output_path = dataset_path / "labels"
# annotation_files_sorted = sorted(annotation_files, key=lambda x: str(x))

# Training split.
training_image_txt = dataset_path / "yolov5_train_manually.txt"
image_filename_list = generate_yolov5_annotation_manually_only(
    training_annotation_files, output_path
)
print(f"Training: {len(image_filename_list)} frames")
training_image_txt.write_text("".join(image_filename_list))

# Validation split; labels go under the same output root.
val_image_txt = dataset_path / "yolov5_val_manually.txt"
image_filename_list = generate_yolov5_annotation_manually_only(
    validation_annotation_files, output_path
)
print(f"Validation: {len(image_filename_list)} frames")
val_image_txt.write_text("".join(image_filename_list))


In [None]:
# print("Generating labels with only vidhoi frames...")
# output_path = dataset_path / "labels"
# # annotation_files_sorted = sorted(annotation_files, key=lambda x: str(x))
# image_filename_list = generate_yolov5_annotation_vidhoi_only(
#     dataset_path, output_path, True
# )
# print(f"Training: {len(image_filename_list)} frames")
# training_image_txt = dataset_path / "yolov5_train_vidhoi.txt"
# with training_image_txt.open("w") as f:
#     f.writelines(image_filename_list)

# image_filename_list = generate_yolov5_annotation_vidhoi_only(
#     dataset_path, output_path, False
# )
# print(f"Validation: {len(image_filename_list)} frames")
# val_image_txt = dataset_path / "yolov5_val_vidhoi.txt"
# with val_image_txt.open("w") as f:
#     f.writelines(image_filename_list)