# Convert VidHOI object detection results to our format

In [None]:
import sys

sys.path.insert(0, "../modules/object_tracking/yolov5")
sys.path.insert(0, "..")

import numpy as np
import shelve
import json
from pathlib import Path

from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

from modules.object_tracking import HeadDetection
from modules.gaze_following import GazeFollowing
from modules.gaze_following.head_association import assign_human_head_video
from common.vidhoi_dataset import VidHOIDataset
from common.data_io import FrameDatasetLoader
from common.transforms import YOLOv5Transform
from common.image_processing import convert_annotation_frame_to_video

# Run on the first CUDA device when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Root folder of the VidOR dataset; converted results go to a sub-folder of it.
dataset_folder = Path("/mnt/DATA/datasets/VidOR")
output_folder = dataset_folder.joinpath("VidHOI_detection")



In [None]:
# Detection results and frame annotations of the VidHOI validation split,
# both stored as JSON below the dataset folder.
vidhoi_det_path = dataset_folder / "VidHOI_annotation" / "det_val_frame_annots.json"
vidhoi_val_annotation_path = dataset_folder / "VidHOI_annotation" / "val_frame_annots.json"

# Parse both files completely into memory.
all_detections_detectron2 = json.loads(vidhoi_det_path.read_text())
annotations = json.loads(vidhoi_val_annotation_path.read_text())

# Frame ids in the form "<video_folder>/<video_id>_<frame_id>".
# The conversion loop that consumes this list does not print a frame listed
# here when the frame has no entry in the detection file; every other frame
# without detections is printed.
# NOTE(review): presumably these are the frames already known to lack
# detections -- confirm where the list was generated.
frames_to_remove = ["1000/4925211209_000156", "1124/13569831214_000015", "1201/5658916668_000465", "1110/3455926688_000228", "1110/3455926688_000492", "1002/2932897373_000165", "1110/3455926688_000036", "0074/7453733046_000945", "1110/3455926688_000252", "0017/2810112808_002235", "0050/11587211476_001185", "0074/7453733046_000675", "1015/5919180502_001185", "1001/6713120511_000015", "1150/6104044648_000060", "1201/5658916668_000435", "1203/3345608051_001284", "1001/6713120511_000045", "1000/4925211209_000420", "1103/3441428429_000645", "0050/11587211476_001365", "1201/5658916668_000525", "1051/5979720550_000105", "1008/5783819683_000045", "1001/6713120511_000105", "1203/3345608051_001308", "1000/4925211209_000252", "1110/3455926688_000012", "1203/3345608051_001260", "0074/7453733046_000915", "1011/8627074061_001785", "1103/3441428429_000585", "1002/4103088549_000225", "0017/2810112808_000675", "0017/2810112808_001995", "1008/8797589693_000915", "1000/2716277960_000135", "1015/5919180502_001155", "0017/2810112808_000705", "1124/13569831214_000225", "0017/2810112808_002295", "0016/4006608424_000315", "1005/5991060898_000615", "1103/3441428429_002145", "1001/4889681401_001305", "1110/3455926688_000540", "0017/2810112808_000765", "1000/6253433310_000585", "0080/5522880149_000705", "1000/6253433310_000555", "0050/11587211476_001215", "1000/4925211209_000180", "1021/5352022985_000195", "1103/2510696559_001185", "0017/2810112808_000825", "1051/5979720550_000135", "1008/8797589693_001485", "1000/6253433310_000525", "1000/4925211209_000276", "1103/3441428429_000915", "0017/2810112808_000525", "1011/8627074061_000045", "1000/4925211209_000132", "1008/8797589693_000165", "1110/3455926688_000516", "1002/4103088549_000135", "1000/4925211209_000396", "1025/8787109801_000675", "0017/2810112808_002055", "1000/4925211209_000204", "1110/3455926688_000276", "1021/5352022985_000735", "0081/6139126979_000084", "1103/3441428429_000765", "0017/2810112808_002025", 
"1201/5658916668_000495", "1000/2716277960_000315", "0017/2810112808_000795", "1000/4925211209_000300", "1015/5919180502_001215", "1103/3441428429_001485", "1000/6253433310_000615", "0017/2810112808_002115", "1008/8797589693_000885", "1103/3441428429_000675", "1110/3455926688_000348", "0017/2810112808_000975", "1001/4889681401_001275", "1000/4925211209_000372", "1001/4889681401_001245", "1000/4925211209_000324", "1051/5979720550_000165", "0028/5840177726_000285", "1001/4889681401_001215", "1124/13569831214_000075", "1001/6713120511_000075", "1103/3441428429_001545", "1005/5991060898_000585", "1103/3441428429_001395", "1021/5352022985_000165", "1008/8797589693_000945", "1000/4925211209_000108", "0017/2810112808_000945", "1124/13569831214_000945", "0017/2810112808_002145", "1000/4925211209_000348", "0082/11503803033_000495", "1015/5919180502_001245", "1000/4925211209_000228", "1001/4889681401_001365", "1027/3113970118_000045", "1103/3441428429_000885", "1110/3455926688_000060", "1021/5352022985_000075", "0082/11503803033_000435", "1008/8797589693_002355", "1110/3455926688_000468", "1011/8627074061_001755", "1103/5521781780_001035", "1110/3455926688_000444", "1103/3441428429_000735", "1110/3455926688_000372", "1150/6104044648_000204", "0017/2810112808_000855", "1021/5352022985_000225", "0017/2810112808_000885", "1005/5991060898_000645", "1150/6104044648_000012", "1015/5919180502_001095", "1103/3441428429_000945", "1051/5979720550_000195", "1015/5919180502_001125", "1001/4889681401_001395", "0080/5522880149_000765", "1103/3441428429_000705", "1124/13569831214_000045", "1103/3441428429_001305", "0017/2810112808_002085", "1000/6253433310_000645", "1021/5352022985_000675", "1103/3441428429_001515", "0017/2810112808_000555", "1110/3455926688_000204", "1103/3441428429_001425", "0019/4759861822_000315", "0017/2810112808_002265", "1104/2821968703_000825", "1008/5783819683_000015", "1002/4103088549_000165", "1021/5352022985_000615", "1002/4103088549_000195", 
"1021/5352022985_000045", "1201/5658916668_000615", "1021/5352022985_000705", "1103/3441428429_000615", "0050/11587211476_001335", "0080/5522880149_000735", "1001/4889681401_001335", "0017/2810112808_002175", "1011/8627074061_000075", "1103/3441428429_001455", "1103/3441428429_002115", "0080/5522880149_000675", "1103/3441428429_001035", "0017/2810112808_000585", "1110/3455926688_000300", "1011/8627074061_000015", "0017/2810112808_000495"]


In [None]:
# Regroup the per-frame detection results into per-video traces. Each video
# maps to parallel lists with one entry per converted frame: boxes, track ids,
# category labels, confidence scores and the frame id the entry belongs to.
all_detections = {}
processed = set()
# Build the exclusion set once: testing membership against the original list
# rescans it for every frame that has no detection entry.
frames_without_detection = set(frames_to_remove)
for anno in annotations:
    video_name = f"{anno['video_folder']}/{anno['video_id']}"
    image_id = f"{video_name}_{anno['frame_id']}"

    # convert every frame only once, even if it shows up in several entries
    if image_id in processed:
        continue
    processed.add(image_id)

    # the detection file is keyed by the middle-frame timestamp shifted by
    # one and zero-padded to six digits
    middle_frame_timestamp = anno['middle_frame_timestamp'] + 1
    image_id_middle = f"{video_name}_{middle_frame_timestamp:06d}"

    video_detections = all_detections.setdefault(
        video_name,
        {
            "bboxes": [],
            "ids": [],
            "labels": [],
            "confidences": [],
            "frame_ids": [],
        },
    )

    if image_id_middle in all_detections_detectron2:
        frame_detection = all_detections_detectron2[image_id_middle]
    else:
        # No entry for this frame in the detection file: report it unless it
        # is listed in frames_to_remove, and store empty placeholders so the
        # per-video lists stay aligned with frame_ids.
        if image_id not in frames_without_detection:
            print(image_id)
        frame_detection = []

    video_detections["bboxes"].append([det["bbox"] for det in frame_detection])
    video_detections["ids"].append([det["tid"] for det in frame_detection])
    video_detections["labels"].append([det["category_id"] for det in frame_detection])
    video_detections["confidences"].append([det["score"] for det in frame_detection])
    video_detections["frame_ids"].append(anno['frame_id'])

# Put every video's parallel lists into ascending frame_id order.
for video_detections in all_detections.values():
    unsorted_frame_ids = video_detections["frame_ids"]
    if not unsorted_frame_ids:
        continue
    # positions of the frames in sorted order; the sort is stable, so equal
    # frame ids keep their original relative order
    order = sorted(range(len(unsorted_frame_ids)), key=unsorted_frame_ids.__getitem__)
    # apply the same permutation to all five lists so they stay aligned
    for field in ("frame_ids", "bboxes", "ids", "labels", "confidences"):
        values = video_detections[field]
        video_detections[field] = [values[position] for position in order]


    

In [None]:
# Persist the per-video detection traces as a single JSON file.
# Nothing visible in this notebook creates the output folder, and opening the
# file for writing raises FileNotFoundError when its parent is missing, so
# create the folder (and any missing parents) first.
output_folder.mkdir(parents=True, exist_ok=True)
det_out = output_folder / "val_trace_detectron2.json"
with det_out.open("w") as out:
    json.dump(all_detections, out)

# Based on VidHOI Detectron2 results, apply Gaze Following

In [None]:
# Settings for preprocessing and for matching detected heads to humans
img_size = 640
# NOTE adjust this tolerance and method
head_matching_iou_thres = 0.7
head_matching_method = "hungarian"

# Build the head detection and gaze following modules on the selected device
head_detection_module = HeadDetection(
    device=device,
    config_path="../configs/object_tracking.yaml",
    crowd_human_weight_path="../weights/yolov5/crowdhuman_yolov5m.pt",
)
gaze_following_module = GazeFollowing(
    device=device,
    config_path="../configs/gaze_following.yaml",
    weight_path="../weights/detecting_attended/model_videoatttarget.pt",
)

# Stride reported by the head detection module; passed to YOLOv5Transform
# together with img_size when the dataset is created.
yolov5_stride = head_detection_module.yolov5_stride

In [None]:
# VidHOI validation dataset with YOLOv5 preprocessing.
# The annotation file and frame directory are derived from dataset_folder
# instead of repeating the absolute path, so moving the dataset only requires
# changing dataset_folder. They are passed as str, as before.
vidhoi_val_dataset = VidHOIDataset(
    annotations_file=str(dataset_folder / "VidHOI_annotation/val_frame_annots.json"),
    frames_dir=str(dataset_folder / "images"),
    transform=YOLOv5Transform(img_size, yolov5_stride),
    min_length=1,
    max_length=999999,
    max_human_num=999999,
    annotation_mode="clip",
)
# batch_size=None disables automatic batching: every item produced by the
# dataset is handed to the loop as-is, in dataset order.
vidhoi_val_dataloader = DataLoader(vidhoi_val_dataset, batch_size=None, shuffle=False)

# Base filenames of the shelve databases that receive the results
output_val_head_filename = str(output_folder / "val_frame_heads_detectron2")
output_val_gaze_filename = str(output_folder / "val_frame_gazes_detectron2")
output_val_inout_filename = str(output_folder / "val_frame_inout_detectron2")

# Reload the converted detection traces from disk
det_out = output_folder / "val_trace_detectron2.json"
with det_out.open("r") as det_file:
    all_detections = json.load(det_file)

In [None]:
# For every validation clip: detect heads, assign them to the detected humans,
# run gaze following on each head, and store the results per video in three
# shelve databases (head boxes, gaze heatmaps, in/out probabilities).
# The shelves are opened as context managers so that they are closed even
# when processing a clip raises; without that an aborted run leaves them open.
with shelve.open(output_val_head_filename) as output_val_head_dict, \
        shelve.open(output_val_gaze_filename) as output_val_gaze_dict, \
        shelve.open(output_val_inout_filename) as output_val_inout_dict:
    t = tqdm(vidhoi_val_dataloader)
    # NOTE: the loop variable `annotations` is not used in the body, and it
    # rebinds the module-level `annotations` loaded from the annotation file.
    for frames, annotations, meta_info in t:
        video_name = meta_info["video_name"]
        t.set_description(f"{video_name}")
        t.refresh()
        original_frames = meta_info["original_frames"]
        # bboxes from detection
        clip_detections = all_detections[video_name]
        # convert to [im_idx, x1, y1, x2, y2] format
        bboxes, ids, labels, _ = convert_annotation_frame_to_video(
            clip_detections["bboxes"],
            clip_detections["ids"],
            clip_detections["labels"],
            clip_detections["confidences"],
        )
        # detect head and assign to human
        with torch.no_grad():
            video_head_bbox_list = assign_human_head_video(
                frames,
                original_frames,
                bboxes,
                ids,
                labels,
                head_detection_module,
                head_matching_iou_thres,
                device,
                method=head_matching_method,
            )
        # store the head boxes of this video and flush them to disk right away
        output_val_head_dict[video_name] = video_head_bbox_list
        output_val_head_dict.sync()

        # for each head bbox, detect gaze
        video_gaze_list = []
        video_inout_list = []
        # hidden state of the gaze model per human id, carried from frame to
        # frame within this video and reset for the next one
        hx_memory = {}
        for i, (head_bboxes, frame0) in enumerate(
            zip(video_head_bbox_list, original_frames)
        ):
            t.set_description(
                f"{video_name}/{meta_info['frame_ids'][i]}, {i}/{len(video_head_bbox_list) - 1}: "
            )
            t.refresh()
            frame_gaze_dict = {}
            frame_inout_dict = {}
            for human_id, head_bbox in head_bboxes.items():
                t.set_postfix_str(f"{head_bbox}")
                t.refresh()
                # no head found for this human_id: store empty placeholders
                if len(head_bbox) == 0:
                    frame_gaze_dict[human_id] = []
                    frame_inout_dict[human_id] = []
                    continue
                # previous hidden state of this human, or None the first time
                hidden_state = hx_memory.get(human_id)
                with torch.no_grad():
                    (heatmap, inout, hx, _, _, _,) = gaze_following_module.detect_one(
                        frame0.numpy(),
                        head_bbox,
                        hidden_state,
                        draw=False,
                    )
                hx_memory[human_id] = (hx[0].detach(), hx[1].detach())
                # process heatmap 64x64 (not include inout), store inout info separately
                # sigmoid of inout, value = probability of gaze inside the scene
                inout_modulated = 1 / (1 + np.exp(-inout))
                # assign heatmap and in_out to human_id
                frame_gaze_dict[human_id] = heatmap
                frame_inout_dict[human_id] = inout_modulated
            # append frame heatmap and inout dict to video heatmap list
            video_gaze_list.append(frame_gaze_dict)
            video_inout_list.append(frame_inout_dict)
        # store the gaze results of this video and flush them to disk
        output_val_gaze_dict[video_name] = video_gaze_list
        output_val_gaze_dict.sync()
        output_val_inout_dict[video_name] = video_inout_list
        output_val_inout_dict.sync()

print(f"Head bboxes dumped to {output_val_head_filename}")
print(f"Gaze heatmaps dumped to {output_val_gaze_filename}")
print(f"Gaze inout dumped to {output_val_inout_filename}")
