# Object Detection in VidHOI Validation Set
Detect and track objects in the VidHOI validation set and store the results in a buffer. Then apply the gaze-following method to extract gaze heatmaps.

In [None]:
import sys

sys.path.insert(0, "../modules/object_tracking/yolov5")
sys.path.insert(0, "..")

import numpy as np
import shelve
import json
from pathlib import Path

from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

from modules.object_tracking import HeadDetection, ObjectTracking
from modules.gaze_following import GazeFollowing
from modules.gaze_following.head_association import assign_human_head_video
from common.vidhoi_dataset import VidHOIDataset
from common.data_io import FrameDatasetLoader
from common.transforms import YOLOv5Transform
from common.image_processing import convert_annotation_frame_to_video

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Root folder of the VidOR dataset (alternative location kept for reference).
# dataset_folder = Path("G:/datasets/VidOR")
dataset_folder = Path("/mnt/DATA/datasets/VidOR")

# Folder used for the detection, head and gaze output files.
output_folder = dataset_folder.joinpath("VidHOI_detection")

# YOLOv5 variant; it selects the weight file and is part of the output file names.
yolov5_model_size = "yolov5l"

# Tracking mode, one of:
#   "key": only track objects in key frames
#   "all": track objects in all frames, then only keep the key frames
# tracking_mode = "key"
tracking_mode = "all"


### Object Tracking Module init

In [None]:
# Build the tracking module from the YOLOv5 weight file of the selected model
# size, the deep_sort model directory and the object tracking config.
object_tracking_module = ObjectTracking(
    yolo_weights_path=f"../weights/yolov5/vidor_{yolov5_model_size}.pt",
    deep_sort_model_dir="../weights/deep_sort/",
    config_path="../configs/object_tracking.yaml",
    device=device,
)

# Parameters passed to YOLOv5Transform when frames are loaded:
# the stride reported by the tracking module and the image size.
yolov5_stride = object_tracking_module.yolov5_stride
img_size = 640


### Dataset init, only validation

In [None]:
# Frame-level annotations of the validation split.
val_annotations_file = dataset_folder.joinpath("VidHOI_annotation", "val_frame_annots.json")

# Clip-mode dataset over the validation annotations; frames go through the
# YOLOv5 transform. The length / human-count limits are set to very large values
# (presumably so that no clip is filtered out -- TODO confirm in VidHOIDataset).
vidhoi_val_dataset = VidHOIDataset(
    annotations_file=val_annotations_file,
    frames_dir=dataset_folder.joinpath("images"),
    transform=YOLOv5Transform(img_size, yolov5_stride),
    min_length=1,
    max_length=999999,
    max_human_num=999999,
    annotation_mode="clip",
    train_ratio=0,
)
vidhoi_val_dataset.eval()

# batch_size=None turns off automatic batching: one dataset item per iteration.
vidhoi_val_dataloader = DataLoader(dataset=vidhoi_val_dataset, batch_size=None, shuffle=False)

## Object Tracking


### Only key frames

In [None]:
# Key-frame mode: run detection + tracking only on the frames served by the
# validation dataloader and collect the results per video.
if tracking_mode != "key":
    print("Skip, not key frame mode")
else:
    # video name -> per-frame detections of that video
    all_detections = {}
    progress = tqdm(vidhoi_val_dataloader)
    # one dataloader item per video; the annotations are not needed here
    for frames, _, meta_info in progress:
        video_name = meta_info["video_name"]
        progress.set_description(f"{video_name}")
        progress.refresh()
        original_frames = meta_info["original_frames"]
        frame_ids = meta_info["frame_ids"]
        last_im_idx = len(frames) - 1
        # reset the tracker for the new video and warm it up on the first frame
        object_tracking_module.clear()
        object_tracking_module.warmup(frames[0].to(device), original_frames[0])
        # entry for this video: one list element per frame under every key
        clip_detections = {
            key: [] for key in ("bboxes", "ids", "labels", "confidences", "frame_ids")
        }
        # detect and track frame by frame
        per_frame_inputs = zip(frames, original_frames, frame_ids)
        for im_idx, (frame, original_frame, frame_id) in enumerate(per_frame_inputs):
            progress.set_postfix_str(f"{im_idx}/{last_im_idx}: {frame_id}")
            progress.refresh()
            tracked = object_tracking_module.track_one(frame.to(device), original_frame, draw=False)
            bboxes, ids, labels, _, confidences, _ = tracked
            # frame-based format, NOTE need to convert to [im_idx, x1, y1, x2, y2] later
            bboxes = [bbox.tolist() for bbox in bboxes]
            for key, value in (
                ("bboxes", bboxes),
                ("ids", ids),
                ("labels", labels),
                ("confidences", confidences),
                ("frame_ids", frame_id),
            ):
                clip_detections[key].append(value)
        all_detections[video_name] = clip_detections
    

### All frames mode

In [None]:
# All-frames mode: run the tracker over every frame of each video so that it
# sees the full sequence, but only store the detections of the key frames
# listed by the validation dataset.
if tracking_mode == "all":
    total_frame_num = 0
    # dict for all videos: video name -> per-key-frame detections
    all_detections = {}
    # for each video, load all frames, only keep the detections in key frames
    # don't need annotation here
    t = tqdm(range(len(vidhoi_val_dataset)))
    for video_idx in t:
        video_name = vidhoi_val_dataset.video_name_list[video_idx]
        frame_ids = vidhoi_val_dataset.frame_ids_list[video_idx]
        # Build the lookup once per video: the membership test below runs for
        # every frame, and a set avoids rescanning the key-frame list each time.
        key_frame_ids = set(frame_ids)
        t.set_description(f"{video_name}")
        t.refresh()
        # entry for one video
        clip_detections = {
            "bboxes": [],
            "ids": [],
            "labels": [],
            "confidences": [],
            "frame_ids": [],
        }
        # load all frames
        video_frame_path = dataset_folder / "images" / video_name
        video_loader = FrameDatasetLoader(video_frame_path, YOLOv5Transform(img_size, yolov5_stride))
        for frame_idx, (frame, frame0, _, _, meta_info) in enumerate(video_loader):
            total_frame_num += 1
            if frame_idx == 0:
                # object tracking init: reset the tracker state for the new video
                object_tracking_module.clear()
                object_tracking_module.warmup(frame.to(device), frame0)

            # Characters [-10:-4] of the frame path are taken as the frame id
            # (assumes a 4-character extension such as ".jpg" preceded by a
            # 6-character id -- TODO confirm against the frame naming scheme).
            frame_id = str(meta_info["frame_path"])[-10:-4]
            clip_len = meta_info["frame_num"] - 1
            t.set_postfix_str(f"{frame_idx}/{clip_len}: {frame_id}")
            t.refresh()
            # track on every frame, even those whose result is discarded below
            bboxes, ids, labels, _, confidences, _ = object_tracking_module.track_one(frame.to(device), frame0, draw=False)
            # only store the detections in key frame set
            if frame_id in key_frame_ids:
                # frame-based format, NOTE need to convert to [im_idx, x1, y1, x2, y2] later
                bboxes = [bbox.tolist() for bbox in bboxes]
                clip_detections["bboxes"].append(bboxes)
                clip_detections["ids"].append(ids)
                clip_detections["labels"].append(labels)
                clip_detections["confidences"].append(confidences)
                clip_detections["frame_ids"].append(frame_id)
        all_detections[video_name] = clip_detections
    print(f"\nTotally {total_frame_num} frames")
else:
    print("Skip, not all frame mode")
    

In [None]:
# Save the detections of all videos to a single JSON file.
filename = output_folder / ("val_trace_" + yolov5_model_size + "_deepsort.json")
# Serialize before touching the file system, so a serialization error cannot
# leave a truncated file behind.
out_str = json.dumps(all_detections)
# Create the output folder if needed; otherwise opening the file fails on a
# fresh dataset folder after the whole tracking run has already finished.
output_folder.mkdir(parents=True, exist_ok=True)
with filename.open("w") as out_file:
    out_file.write(out_str)


## Gaze Following

In [None]:
# Instantiate the head detection and gaze following modules.
head_detection_module = HeadDetection(
    crowd_human_weight_path="../weights/yolov5/crowdhuman_yolov5m.pt",
    config_path="../configs/object_tracking.yaml",
    device=device,
)
gaze_following_module = GazeFollowing(
    weight_path="../weights/detecting_attended/model_videoatttarget.pt",
    config_path="../configs/gaze_following.yaml",
    device=device,
)

# Image size, and the stride now taken from the head detection module.
yolov5_stride = head_detection_module.yolov5_stride
img_size = 640

# Human/head matching settings.
# NOTE adjust this tolerance and method
head_matching_method = "hungarian"
# head_matching_method = "greedy"
head_matching_iou_thres = 0.7

# Base names (no extension) of the shelve files for heads, gaze heatmaps and in/out values.
output_val_head_filename = str(output_folder / f"val_frame_heads_{yolov5_model_size}_deepsort")
output_val_gaze_filename = str(output_folder / f"val_frame_gazes_{yolov5_model_size}_deepsort")
output_val_inout_filename = str(output_folder / f"val_frame_inout_{yolov5_model_size}_deepsort")

# Read back the detections saved as JSON in the output folder.
filename = output_folder / f"val_trace_{yolov5_model_size}_deepsort.json"
with filename.open() as detection_file:
    all_detections = json.load(detection_file)

In [None]:
# For every validation video: detect heads, assign them to the tracked humans,
# run gaze following per head, and persist the results in three shelve files
# keyed by video name (head bboxes, gaze heatmaps, in/out values).
#
# The shelves are opened in one `with` statement so that all of them are
# closed even if processing a video raises part-way through the run.
with shelve.open(output_val_head_filename) as output_val_head_dict, \
        shelve.open(output_val_gaze_filename) as output_val_gaze_dict, \
        shelve.open(output_val_inout_filename) as output_val_inout_dict:
    # For each video, first detect heads (annotations are not needed here)
    t = tqdm(vidhoi_val_dataloader)
    for frames, _, meta_info in t:
        video_name = meta_info['video_name']
        t.set_description(f"{video_name}")
        t.refresh()
        original_frames = meta_info["original_frames"]
        # bboxes from detection
        clip_detections = all_detections[video_name]
        # convert to [im_idx, x1, y1, x2, y2] format
        bboxes, ids, labels, _ = convert_annotation_frame_to_video(
            clip_detections["bboxes"],
            clip_detections["ids"],
            clip_detections["labels"],
            clip_detections["confidences"],
        )
        # detect head and assign to human
        with torch.no_grad():
            video_head_bbox_list = assign_human_head_video(
                frames,
                original_frames,
                bboxes,
                ids,
                labels,
                head_detection_module,
                head_matching_iou_thres,
                device,
                method=head_matching_method,
            )
        # assign video head bbox list to its name; sync so finished videos are
        # written out even if a later video fails
        output_val_head_dict[video_name] = video_head_bbox_list
        output_val_head_dict.sync()

        # for each head bbox, detect gaze
        video_gaze_list = []
        video_inout_list = []
        # per-human hidden state of the gaze model, carried from frame to frame
        hx_memory = {}
        for i, (head_bboxes, frame0) in enumerate(
            zip(video_head_bbox_list, original_frames)
        ):
            t.set_description(f"{video_name}/{meta_info['frame_ids'][i]}, {i}/{len(video_head_bbox_list) - 1}: ")
            t.refresh()
            frame_gaze_dict = {}
            frame_inout_dict = {}
            for human_id, head_bbox in head_bboxes.items():
                t.set_postfix_str(f"{head_bbox}")
                t.refresh()
                # no head found for this human_id: store empty placeholders
                if len(head_bbox) == 0:
                    frame_gaze_dict[human_id] = []
                    frame_inout_dict[human_id] = []
                    continue
                # hidden state from the previous frame, None on first appearance
                hidden_state = hx_memory.get(human_id)
                with torch.no_grad():
                    (heatmap, inout, hx, _, _, _,) = gaze_following_module.detect_one(
                        frame0.numpy(),
                        head_bbox,
                        hidden_state,
                        draw=False,
                    )
                # detach so the stored state does not keep a graph alive
                hx_memory[human_id] = (hx[0].detach(), hx[1].detach())
                # process heatmap 64x64 (not include inout), store inout info separately
                # sigmoid of inout, value = probability of gaze inside the scene
                inout_modulated = 1 / (1 + np.exp(-inout))
                # assign heatmap and in_out to human_id
                frame_gaze_dict[human_id] = heatmap
                frame_inout_dict[human_id] = inout_modulated
            # append frame heatmap and inout dict to video heatmap list
            video_gaze_list.append(frame_gaze_dict)
            video_inout_list.append(frame_inout_dict)
        # assign video heatmap list to its name
        output_val_gaze_dict[video_name] = video_gaze_list
        output_val_gaze_dict.sync()
        output_val_inout_dict[video_name] = video_inout_list
        output_val_inout_dict.sync()

print(f"Head bboxes dumped to {output_val_head_filename}")
print(f"Gaze heatmaps dumped to {output_val_gaze_filename}")
print(f"Gaze inout dumped to {output_val_inout_filename}")