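# Extract gaze features in Action Genome dataset
Use head detection and gaze following methods to extract gaze feature maps for all key frames in the Action Genome dataset. Each detected head is checked against the person bounding boxes so that it is assigned to the person it belongs to. The head boxes, gaze heatmaps, and in/out scores are then stored in separate `shelve` files.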

In [None]:
import sys

sys.path.insert(0, "../modules/object_tracking/yolov5")
sys.path.insert(0, "..")

import numpy as np
import shelve

from tqdm import tqdm
import torch
from torch.utils.data import DataLoader

from modules.object_tracking import HeadDetection
from modules.gaze_following import GazeFollowing
from modules.gaze_following.head_association import assign_human_head_video
from common.action_genome_dataset import AGDataset
from common.transforms import YOLOv5Transform

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


In [None]:
# Settings used when pairing detected heads with annotated person boxes.
# NOTE adjust this tolerance and method
head_matching_method = "hungarian"
head_matching_iou_thres = 0.7
# Image size handed to the YOLOv5 input transform of the datasets below.
img_size = 640

# Head detector (YOLOv5 weights trained on CrowdHuman, per the weight path).
head_detection_module = HeadDetection(
    device=device,
    config_path="../configs/object_tracking.yaml",
    crowd_human_weight_path="../weights/yolov5/crowdhuman_yolov5m.pt",
)
# The input transform needs the stride the detector was built with.
yolov5_stride = head_detection_module.yolov5_stride

# Gaze-following model; consumes a frame plus one head box at a time.
gaze_following_module = GazeFollowing(
    device=device,
    config_path="../configs/gaze_following.yaml",
    weight_path="../weights/detecting_attended/model_videoatttarget.pt",
)

## Validation Dataset

In [None]:
# Shelve files that receive the validation results (in/out scores, gaze
# heatmaps and head boxes respectively).
output_val_inout_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/val_frame_inout_gt_bbox"
output_val_gaze_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/val_frame_gazes_gt_bbox"
output_val_head_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/val_frame_heads_gt_bbox"

# The "test" split of the dataset is what this notebook calls validation.
# The length limits are set wide (1 .. 999999) so no clip is filtered by length
# -- presumably; confirm against AGDataset.
ag_val_dataset = AGDataset(
    dataset_dir="/mnt/DATA/datasets/action_genome",
    mode="test",
    annotation_mode="clip",
    min_length=1,
    max_length=999999,
    transform=YOLOv5Transform(img_size, yolov5_stride),
)
# batch_size=None turns off automatic batching: each dataset item (one clip)
# is yielded on its own, in dataset order.
ag_val_dataloader = DataLoader(
    ag_val_dataset,
    shuffle=False,
    batch_size=None,
)


In [None]:
# Extract head boxes, gaze heatmaps and in/out scores for every validation
# video and persist them, keyed by video name, in three shelve files:
#   head shelf : list (one entry per frame) of {human_id: head_bbox}
#   gaze shelf : list (one entry per frame) of {human_id: heatmap}
#   inout shelf: list (one entry per frame) of {human_id: in/out probability}
# A human without a detected head gets an empty list in all three.
#
# The shelves are opened in one `with` statement so that all of them are
# closed (and flushed) even when detection raises part-way through the
# dataset; previously an exception left them open.
with shelve.open(output_val_head_filename) as output_val_head_dict, \
        shelve.open(output_val_gaze_filename) as output_val_gaze_dict, \
        shelve.open(output_val_inout_filename) as output_val_inout_dict:
    # For each video, first detect heads
    t = tqdm(ag_val_dataloader)
    for frames, annotations, meta_info in t:
        video_name = meta_info["video_name"]
        t.set_description(f"{video_name}")
        t.refresh()
        original_frames = meta_info["original_frames"]
        bboxes = np.array(annotations["bboxes"])
        ids = np.array(annotations["ids"])
        labels = np.array(annotations["labels"])
        with torch.no_grad():
            # Pair detected heads with the annotated person boxes
            # (label 1 marks humans).
            video_head_bbox_list = assign_human_head_video(
                frames,
                original_frames,
                bboxes,
                ids,
                labels,
                head_detection_module,
                head_matching_iou_thres,
                device,
                method=head_matching_method,
                human_label=1,
            )
        # assign video head bbox list to its name; sync right away so the
        # result for this video is on disk before the next one starts
        output_val_head_dict[video_name] = video_head_bbox_list
        output_val_head_dict.sync()

        # for each head bbox, detect gaze
        video_gaze_list = []
        video_inout_list = []
        # Per-person hidden state of the gaze model, carried from frame to
        # frame and reset for every video.
        hx_memory = {}
        for i, (head_bboxes, frame0) in enumerate(
            zip(video_head_bbox_list, original_frames)
        ):
            t.set_description(f"{video_name}/{meta_info['frame_ids'][i]}, {i}/{len(video_head_bbox_list) - 1}: ")
            t.refresh()
            frame_gaze_dict = {}
            frame_inout_dict = {}
            for human_id, head_bbox in head_bboxes.items():
                t.set_postfix_str(f"{head_bbox}")
                # no head found for this human_id; len() is used rather than
                # truthiness because head_bbox may be an array
                if len(head_bbox) == 0:
                    frame_gaze_dict[human_id] = []
                    frame_inout_dict[human_id] = []
                    continue
                with torch.no_grad():
                    # A person seen for the first time has no stored state,
                    # so .get() hands None to the model.
                    (heatmap, inout, hx, _, _, _) = gaze_following_module.detect_one(
                        frame0.numpy(),
                        head_bbox,
                        hx_memory.get(human_id),
                        draw=False,
                    )
                # Detach the two state tensors before storing them for the
                # next frame.
                hx_memory[human_id] = (hx[0].detach(), hx[1].detach())
                # The heatmap is stored as returned (original note: 64x64, not
                # including in/out); the in/out score is stored separately.
                # Logistic sigmoid of the raw in/out score:
                # value = probability of gaze inside the scene
                inout_modulated = 1 / (1 + np.exp(-inout))
                # assign heatmap and in_out to human_id
                frame_gaze_dict[human_id] = heatmap
                frame_inout_dict[human_id] = inout_modulated
            # append frame heatmap and inout dict to video heatmap list
            video_gaze_list.append(frame_gaze_dict)
            video_inout_list.append(frame_inout_dict)
        # assign video heatmap list to its name
        output_val_gaze_dict[video_name] = video_gaze_list
        output_val_gaze_dict.sync()
        output_val_inout_dict[video_name] = video_inout_list
        output_val_inout_dict.sync()

print(f"Head bboxes dumped to {output_val_head_filename}")
print(f"Gaze heatmaps dumped to {output_val_gaze_filename}")
print(f"Gaze inout dumped to {output_val_inout_filename}")


## Training Dataset

In [None]:
# Shelve files that receive the training results (in/out scores, gaze
# heatmaps and head boxes respectively).
output_train_inout_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/train_frame_inout_gt_bbox"
output_train_gaze_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/train_frame_gazes_gt_bbox"
output_train_head_filename = "/mnt/DATA/datasets/action_genome/action_genome_gaze/train_frame_heads_gt_bbox"

# Training split, clip-level annotations. The length limits are set wide
# (1 .. 999999) so no clip is filtered by length -- presumably; confirm
# against AGDataset.
ag_train_dataset = AGDataset(
    dataset_dir="/mnt/DATA/datasets/action_genome",
    mode="train",
    annotation_mode="clip",
    min_length=1,
    max_length=999999,
    transform=YOLOv5Transform(img_size, yolov5_stride),
)
# batch_size=None turns off automatic batching: each dataset item (one clip)
# is yielded on its own, in dataset order.
ag_train_dataloader = DataLoader(
    ag_train_dataset,
    shuffle=False,
    batch_size=None,
)


In [None]:
# Extract head boxes, gaze heatmaps and in/out scores for every training
# video and persist them, keyed by video name, in three shelve files:
#   head shelf : list (one entry per frame) of {human_id: head_bbox}
#   gaze shelf : list (one entry per frame) of {human_id: heatmap}
#   inout shelf: list (one entry per frame) of {human_id: in/out probability}
# A human without a detected head gets an empty list in all three.
#
# The shelves are opened in one `with` statement so that all of them are
# closed (and flushed) even when detection raises part-way through the
# dataset; previously an exception left them open.
with shelve.open(output_train_head_filename) as output_train_head_dict, \
        shelve.open(output_train_gaze_filename) as output_train_gaze_dict, \
        shelve.open(output_train_inout_filename) as output_train_inout_dict:
    # For each video, first detect heads
    t = tqdm(ag_train_dataloader)
    for frames, annotations, meta_info in t:
        video_name = meta_info["video_name"]
        t.set_description(f"{video_name}")
        t.refresh()
        original_frames = meta_info["original_frames"]
        bboxes = np.array(annotations["bboxes"])
        ids = np.array(annotations["ids"])
        labels = np.array(annotations["labels"])
        with torch.no_grad():
            # Pair detected heads with the annotated person boxes
            # (label 1 marks humans).
            video_head_bbox_list = assign_human_head_video(
                frames,
                original_frames,
                bboxes,
                ids,
                labels,
                head_detection_module,
                head_matching_iou_thres,
                device,
                method=head_matching_method,
                human_label=1,
            )
        # assign video head bbox list to its name; sync right away so the
        # result for this video is on disk before the next one starts
        output_train_head_dict[video_name] = video_head_bbox_list
        output_train_head_dict.sync()

        # for each head bbox, detect gaze
        video_gaze_list = []
        video_inout_list = []
        # Per-person hidden state of the gaze model, carried from frame to
        # frame and reset for every video.
        hx_memory = {}
        for i, (head_bboxes, frame0) in enumerate(
            zip(video_head_bbox_list, original_frames)
        ):
            t.set_description(f"{video_name}/{meta_info['frame_ids'][i]}, {i}/{len(video_head_bbox_list) - 1}: ")
            t.refresh()
            frame_gaze_dict = {}
            frame_inout_dict = {}
            for human_id, head_bbox in head_bboxes.items():
                t.set_postfix_str(f"{head_bbox}")
                # no head found for this human_id; len() is used rather than
                # truthiness because head_bbox may be an array
                if len(head_bbox) == 0:
                    frame_gaze_dict[human_id] = []
                    frame_inout_dict[human_id] = []
                    continue
                with torch.no_grad():
                    # A person seen for the first time has no stored state,
                    # so .get() hands None to the model.
                    (heatmap, inout, hx, _, _, _) = gaze_following_module.detect_one(
                        frame0.numpy(),
                        head_bbox,
                        hx_memory.get(human_id),
                        draw=False,
                    )
                # Detach the two state tensors before storing them for the
                # next frame.
                hx_memory[human_id] = (hx[0].detach(), hx[1].detach())
                # The heatmap is stored as returned (original note: 64x64, not
                # including in/out); the in/out score is stored separately.
                # Logistic sigmoid of the raw in/out score:
                # value = probability of gaze inside the scene
                inout_modulated = 1 / (1 + np.exp(-inout))
                # assign heatmap and in_out to human_id
                frame_gaze_dict[human_id] = heatmap
                frame_inout_dict[human_id] = inout_modulated
            # append frame heatmap and inout dict to video heatmap list
            video_gaze_list.append(frame_gaze_dict)
            video_inout_list.append(frame_inout_dict)
        # assign video heatmap list to its name
        output_train_gaze_dict[video_name] = video_gaze_list
        output_train_gaze_dict.sync()
        output_train_inout_dict[video_name] = video_inout_list
        output_train_inout_dict.sync()

print(f"Head bboxes dumped to {output_train_head_filename}")
print(f"Gaze heatmaps dumped to {output_train_gaze_filename}")
print(f"Gaze inout dumped to {output_train_inout_filename}")
