In [1]:
import os
import cv2
import numpy as np
import json
from tqdm import trange

In [2]:
repo_path = "/home/sjauhri/IAS_WS/EgoVis/2handedafforder_repo/"
dataset_folder = repo_path + "BENCHMARK/ego4d_bench/bench_data/"
# original_frame_folder = repo_path + "BENCHMARK/ego4d_bench/bench_data/"
# full_inpainted_frame_folder = repo_path + "BENCHMARK/ego4d_bench/bench_data/"
out_folder = dataset_folder

In [3]:
# Create viz images for annotation (affordance class + inpainted image + original image)
for i in trange(len(os.listdir(dataset_folder)), desc="Processing videos"):
    video_dir = os.listdir(dataset_folder)[i]
    print(f"Generating viz images for annotation for video id {video_dir}")

    for f_i in trange(len(os.listdir(os.path.join(dataset_folder, video_dir))), desc="Processing frames"):
        frame_id = os.listdir(os.path.join(dataset_folder, video_dir))[f_i]
        
        # fetch the original video frame corresponding to this datapoint
        # print("Frame ID: ", frame_id)
        original_frame_file_path = os.path.join(dataset_folder, video_dir, frame_id, 'frame.png')
        # inpainted frame
        inpainted_frame_file_path = os.path.join(dataset_folder, video_dir, frame_id, 'inpainted_frame.png')
        # get the affordance class from the json file
        affordance_annotation_file_path = os.path.join(dataset_folder, video_dir, frame_id, "annotation.json")

        # create new image by concatenating the inpainted image, original image and affordance class
        inpainted_frame = cv2.imread(inpainted_frame_file_path)
        # inpainted_frame = cv2.cvtColor(inpainted_frame, cv2.COLOR_BGR2RGB)
        # Scale up inpainted image (to 512,512)
        inpainted_frame = cv2.resize(inpainted_frame, (inpainted_frame.shape[1] * 2, inpainted_frame.shape[0] * 2))
        original_frame = cv2.imread(original_frame_file_path)
        # Scale down the original frame to 2/5th the size of the inpainted frame height
        scaling_factor = (2.0/5.0) * inpainted_frame.shape[0] / original_frame.shape[0]
        original_frame = cv2.resize(original_frame, (int(original_frame.shape[1] * scaling_factor), int(original_frame.shape[0] * scaling_factor)))
        with open(affordance_annotation_file_path, 'r') as f:
            annotation_data = json.load(f)
            aff_class = annotation_data.get('narration', 'Unknown')
            if '#unsure' in aff_class:
                # replace with blank space
                aff_class = aff_class.replace('#unsure', '')
            # replace first four characters with 'person'
            aff_class = 'person' + aff_class[4:]
        if original_frame is not None and inpainted_frame is not None:
            # Resize original frame to match the height of the inpainted frame
            height_diff = inpainted_frame.shape[0] - original_frame.shape[0]
            black_space = np.zeros((height_diff, original_frame.shape[1], 3), dtype=np.uint8)

            # Add white text to the black space
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 1
            font_color = (255, 255, 255)
            thickness = 2
            text = f"Task:"
            text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
            text_x = (black_space.shape[1] - text_size[0]) // 2
            text_y = (black_space.shape[0] + text_size[1]) // 2  - 90
            cv2.putText(black_space, text, (text_x, text_y), font, font_scale, font_color, thickness)
            text = f"{aff_class}"
            # Split the text into multiple lines if it doesn't fit in the width
            max_width = black_space.shape[1]
            words = text.split()
            lines = []
            current_line = words[0]

            for word in words[1:]:
                if cv2.getTextSize(current_line + ' ' + word, font, font_scale, thickness)[0][0] < max_width:
                    current_line += ' ' + word
                else:
                    lines.append(current_line)
                    current_line = word
            lines.append(current_line)

            y_offset = text_y + 40
            for line in lines:
                text_size = cv2.getTextSize(line, font, font_scale, thickness)[0]
                text_x = (black_space.shape[1] - text_size[0]) // 2
                cv2.putText(black_space, line, (text_x, y_offset), font, font_scale, font_color, thickness)
                y_offset += text_size[1] + 10

            # Add "Example" text just above the original frame
            example_text = "Example:"
            example_text_size = cv2.getTextSize(example_text, font, font_scale, thickness)[0]
            example_text_x = (black_space.shape[1] - example_text_size[0]) // 2
            example_text_y = black_space.shape[0] - 20
            cv2.putText(black_space, example_text, (example_text_x, example_text_y), font, font_scale, font_color, thickness)
            original_frame = np.vstack((black_space, original_frame))

            benchmark_frame = np.concatenate((inpainted_frame, original_frame), axis=1)
            benchmark_frame_path = os.path.join(dataset_folder, video_dir, frame_id, "benchmark_frame.png")
            cv2.imwrite(benchmark_frame_path, benchmark_frame)
            # print(f"Saved benchmark frame to {benchmark_frame_path}")
        else:
            raise Exception("Original or inpainted frame is not found")

Processing videos:   0%|          | 0/8 [00:00<?, ?it/s]

Generating viz images for annotation for video id 8f91bc0d-9ce7-4b31-aba7-dd59791917df




Processing frames: 100%|██████████| 260/260 [00:06<00:00, 42.90it/s]
Processing videos:  12%|█▎        | 1/8 [00:06<00:42,  6.06s/it]

Generating viz images for annotation for video id 793a9c9d-327e-4457-9c40-e626b2208aae


Processing frames: 100%|██████████| 92/92 [00:01<00:00, 59.47it/s]
Processing videos:  25%|██▌       | 2/8 [00:07<00:20,  3.41s/it]

Generating viz images for annotation for video id 114d86a7-2849-46de-8bb7-8fe1e1a48be8


Processing frames: 100%|██████████| 426/426 [00:10<00:00, 41.70it/s]
Processing videos:  38%|███▊      | 3/8 [00:17<00:32,  6.52s/it]

Generating viz images for annotation for video id 60b0ccb6-49f7-4a44-a70d-bf319217af50


Processing frames: 100%|██████████| 200/200 [00:04<00:00, 42.15it/s]
Processing videos:  50%|█████     | 4/8 [00:22<00:23,  5.82s/it]

Generating viz images for annotation for video id 1244d83b-fb99-469c-a943-354ac4d95361


Processing frames: 100%|██████████| 409/409 [00:09<00:00, 42.24it/s]
Processing videos:  62%|██████▎   | 5/8 [00:32<00:21,  7.21s/it]

Generating viz images for annotation for video id 126fe8f1-226a-4161-ad6e-84f9e5baeb3a


Processing frames: 100%|██████████| 773/773 [00:18<00:00, 40.86it/s]
Processing videos:  75%|███████▌  | 6/8 [00:51<00:22, 11.19s/it]

Generating viz images for annotation for video id 1134205f-6f03-47ac-bf8e-ae1453dc7fc9


Processing frames: 100%|██████████| 319/319 [00:07<00:00, 41.94it/s]
Processing videos:  88%|████████▊ | 7/8 [00:58<00:10, 10.02s/it]

Generating viz images for annotation for video id 11286c45-7869-4b84-81a5-7fcb9a247e9d


Processing frames: 100%|██████████| 714/714 [00:16<00:00, 42.50it/s]
Processing videos: 100%|██████████| 8/8 [01:15<00:00,  9.45s/it]
