In [None]:
import sys

sys.path.append('.')
sys.path.append('..')

import cv2
import numpy as np
from PIL import Image
import json
from ultralytics import YOLO
from tqdm import tqdm
import os
import cv2
from facenet_pytorch import MTCNN
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader
from torchvision.utils import save_image

from datasets.create_datasets import (
    create_frame_dataset,
    create_patch_dataset,
)

from detection_utils import (
    accept_patch,
    add_and_pop,
)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create Models

In [None]:
# Pick the accelerator for the YOLO models: CUDA first, then Apple's MPS backend,
# and finally the CPU. Previously CUDA was never considered here, so a CUDA
# machine would run MTCNN on the GPU but every YOLO model on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# MTCNN only supports CUDA or CPU, so it gets its own device and never uses MPS.
mtcnn_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f'using device: {device}')

In [None]:
# Detection, segmentation, and pose models
PRETRAINED_DIR = './pretrained-models'
os.makedirs(PRETRAINED_DIR, exist_ok=True)

# NOTE: despite its name, `classification_model` is the YOLO detector whose
# bounding boxes are used to crop human patches out of the frames.
classification_model = YOLO(f'{PRETRAINED_DIR}/yolov8m.pt')
segmentation_model = YOLO(f'{PRETRAINED_DIR}/yolov8l-seg.pt')
pose_model = YOLO(f'{PRETRAINED_DIR}/yolov8l-pose.pt')

# Face detector; it runs on `mtcnn_device` rather than `device`.
mtcnn = MTCNN(
    keep_all=True,
    device=mtcnn_device,
    image_size=256,
    margin=8,
    thresholds=[0.5, 0.5, 0.5],
)

# Human Detection

In [None]:
def extract_human_patches(batch, lookback, filter=True):
    """Crop every confidently detected person out of a batch of frames.

    Args:
        batch: Frame tensor indexed as ``[frame, channel, row, col]``; it is
            passed unchanged to the global ``classification_model``.
        lookback: Mutable history handed to ``accept_patch`` / ``add_and_pop``
            when ``filter`` is true (presumably the recently accepted patches
            used to reject near-duplicates -- confirm in ``detection_utils``).
            Ignored when ``filter`` is false.
        filter: When true, use the strict confidence threshold (0.8) and keep
            only patches approved by ``accept_patch``. When false, use the
            permissive threshold (0.35) and keep every detection together with
            its coordinates.

    Returns:
        ``filter=True``: a list of tensors, each the crop with a leading
        singleton dimension added. ``filter=False``: a list of
        ``{'patch': tensor, 'coords': (x1, y1, x2, y2)}`` dicts.
    """
    # Class index treated as "human"; index 0 is `person` in the COCO label
    # set (assumes the stock YOLOv8 weights -- confirm if custom weights are used).
    HUMAN_CLASS = 0
    CONFIDENCE_THRESHOLD = 0.8 if filter else 0.35

    patches = []
    with torch.no_grad():
        results = classification_model(batch, verbose=False)
        assert len(results) == batch.shape[0]

        for frame, r in enumerate(results):
            for box in r.boxes:
                is_confident_human = box.cls == HUMAN_CLASS and box.conf > CONFIDENCE_THRESHOLD
                if not is_confident_human:
                    continue

                x1, y1, x2, y2 = box.xyxy.int().tolist()[0]

                # Truncating the coordinates to integers can collapse a very thin
                # box to zero width/height; such a crop is an empty tensor that
                # cannot be saved as an image, so it is dropped.
                if x2 <= x1 or y2 <= y1:
                    continue

                patch = batch[frame, :, y1:y2, x1:x2]

                if filter:
                    if accept_patch(patch, lookback):
                        patches.append(patch.unsqueeze(0))
                        add_and_pop(lookback, patch)
                else:
                    patches.append({
                        'patch': patch,
                        'coords': (x1, y1, x2, y2),
                    })
    return patches

In [None]:
videos = ['/game/MafiaVideogame','/movie/TheGodfather', '/movie/TheSopranos', '/movie/TheIrishman']

for video in videos:
    video_path = f'Data/Train/{video}.mp4'
    # Patches are grouped per video, named after the last path component.
    save_dir = f'Dataset/patches/{video.split("/")[-1]}'

    os.makedirs(save_dir, exist_ok=True)

    # Cheap cache: a non-empty output folder means this video was already processed.
    if len(os.listdir(save_dir)) > 0:
        continue

    dataset = create_frame_dataset(video_path)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

    patch_count = 0
    # History shared across batches so the patch filter sees earlier frames too.
    lookback = []

    for batch in tqdm(dataloader):
        batch = batch.to(device)

        with torch.no_grad():
            # Detect people in the frames and keep only the accepted patches
            human_patches = extract_human_patches(batch, lookback)

            for patch in human_patches:
                save_image(patch, os.path.join(save_dir, f'patch_{patch_count}.png'))
                patch_count += 1

# Plot Example Patches

In [None]:
def collate_dict(batch):
    """Collate a list of ``{'patch', 'patch_path'}`` samples into one batch dict.

    Args:
        batch: Sequence of dicts, each with a ``'patch'`` tensor and a
            ``'patch_path'`` entry. All patches must share one shape, because
            they are combined with ``torch.stack``.

    Returns:
        A dict with ``'patch'`` (the patches stacked along a new leading
        dimension) and ``'patch_path'`` (a NumPy array of the paths, in the
        same order).
    """
    stacked_patches = torch.stack([sample['patch'] for sample in batch])
    paths = np.array([sample['patch_path'] for sample in batch])
    return {'patch': stacked_patches, 'patch_path': paths}

train_dataset = create_patch_dataset('Dataset/patches')
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=collate_dict)

# Draw both batches from ONE iterator. Calling iter() twice would start two
# independent shuffles, and the same patch could then appear in both batches.
batch_iter = iter(train_dataloader)
batch = next(batch_iter)
batch_2 = next(batch_iter, None)  # None when the dataset fits in a single batch

if batch_2 is None:
    patches = batch['patch']
else:
    patches = torch.cat([batch['patch'], batch_2['patch']])

# Plot up to 50 random patches
rows, columns = 5, 10
plt.figure(figsize=(20, 10))

# Slicing keeps the loop safe when fewer than rows * columns patches exist.
for i, example in enumerate(patches[:rows * columns], start=1):
    plt.subplot(rows, columns, i)
    # imshow expects channels last, so move the channel axis to the end.
    plt.imshow(example.permute(1, 2, 0))
    plt.axis('off')

plt.suptitle('1.1 Human Patch Extraction - 50 Randomly Sampled Patches', fontsize=20)

plt.tight_layout()
plt.show()

# Segmentation

In [None]:
# Use YOLO to segment objects
def detect_and_save_segmentation(batch, save_dir):
    """Segment every patch in ``batch`` and save its person masks to ``save_dir``.

    Args:
        batch: Dict with ``'patch'`` (tensor whose first dimension indexes the
            patches) and ``'patch_path'`` (one source path per patch).
        save_dir: Output directory; created if missing.

    Side effects:
        For each patch with at least one predicted mask, writes
        ``<save_dir>/<patch basename>.pt`` containing
        ``{'masks': ..., 'conf': ...}`` restricted to class 0. Patches for
        which the model returns no masks at all are reported and skipped, so
        no file is written for them.
    """
    # Class index kept from the predictions; index 0 is `person` in the COCO
    # label set (assumes the stock YOLOv8 weights -- confirm if custom weights are used).
    PERSON_CLASS = 0

    os.makedirs(save_dir, exist_ok=True)

    patches, patch_paths = batch['patch'], batch['patch_path']
    patches = patches.to(device)

    with torch.no_grad():
        # Predict segmentation masks and save them to disk
        results = segmentation_model(patches, verbose=False)
        assert len(results) == patches.shape[0]

        for result, patch_path in zip(results, patch_paths):
            masks = result.masks

            # Identity check: `== None` would defer to whatever __eq__ the
            # masks object defines instead of testing for "no masks".
            if masks is None:
                print(f"No masks detected for {patch_path}")
                continue

            is_person = result.boxes.cls == PERSON_CLASS

            data = {
                'masks': masks[is_person],
                'conf': result.boxes.conf[is_person],
            }

            save_path = os.path.join(save_dir, os.path.basename(patch_path).replace('.png', '.pt'))
            torch.save(data, save_path)

In [None]:
videos = ['TheGodfather', 'TheSopranos', 'TheIrishman', 'MafiaVideogame']

for video in videos:
    patches_dir = f'Dataset/patches/{video}'
    save_dir = f'Dataset/segmentations/{video}'

    # The directory must exist before it can be listed; on a fresh run
    # os.listdir() would otherwise raise FileNotFoundError.
    os.makedirs(save_dir, exist_ok=True)

    # Cheap cache: a non-empty output folder means this video was already segmented.
    if len(os.listdir(save_dir)) > 0:
        continue

    dataset = create_patch_dataset(patches_dir)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

    for batch in tqdm(dataloader):
        detect_and_save_segmentation(batch, save_dir)

# Final Dataset Curation

- We currently have a selection of patches with positive human detections, as well as segmentation masks
- We use the segmentation masks to split each patch into one masked image per detected person (keeping only that person's pixels), and save these separately
- Finally, we will apply face & pose detection to each segmented patch

In [None]:
patches_dir = 'Dataset/patches'

os.makedirs('DatasetProcessed/', exist_ok=True)
os.makedirs('DatasetProcessed/train_A', exist_ok=True)
os.makedirs('DatasetProcessed/train_B', exist_ok=True)
os.makedirs('DatasetProcessed/test_A', exist_ok=True)
os.makedirs('DatasetProcessed/test_B', exist_ok=True)

videos = ['TheGodfather', 'TheSopranos', 'TheIrishman', 'MafiaVideogame']

# Person masks with a lower detection confidence than this are discarded.
MIN_MASK_CONFIDENCE = 0.75
# Fixed seed so re-running this cell reproduces the same train/test split.
# Without it, a re-run could write the same patch index into a different
# split folder while the stale file from the previous run is still present.
SPLIT_SEED = 42

final_patches = []
# NOTE: the two lists below are never filled in this cell; they are kept so
# that any code expecting these names keeps working.
conf_rejected_patches = []
rejected_patches = []
for video in videos:
    # Domain A = the movies/series, domain B = the video game.
    suffix = 'A' if video in ['TheGodfather', 'TheSopranos', 'TheIrishman'] else 'B'

    dataset = create_patch_dataset(f'Dataset/patches/{video}')
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    for batch in tqdm(dataloader):
        patch = batch['patch'][0]
        patch_path = batch['patch_path'][0]
        segmentation_path = patch_path.replace('patches', 'segmentations').replace('.png', '.pt')

        # No segmentation file means the model found no masks for this patch.
        if not os.path.exists(segmentation_path):
            continue

        # === Open Segmentation ===
        # torch.load unpickles the file: only load files written by this notebook.
        segmentation = torch.load(segmentation_path)
        masks, conf = segmentation['masks'], segmentation['conf']
        masks = masks.data
        num_segmentations = masks.shape[0]

        # If there are multiple segmentations, we split the patch into multiple patches.
        # A dedicated index name avoids clobbering any outer loop counter.
        for mask_idx in range(num_segmentations):
            mask = masks[mask_idx].unsqueeze(0)
            mask_conf = conf[mask_idx]

            if mask_conf < MIN_MASK_CONFIDENCE:
                continue

            # Replicate the single-channel mask over the 3 colour channels
            # (assumes the mask has the same spatial size as the patch -- TODO confirm).
            mask = mask.repeat(3, 1, 1)
            masked_patch = patch * mask

            final_patches.append((masked_patch, suffix))

print(f'[INFO] Accepted total of {len(final_patches)} patches')

# Split into train and test
train_patches, test_patches = train_test_split(final_patches, test_size=0.2, random_state=SPLIT_SEED)

# One running index across both splits keeps every file name unique.
idx = 0
for (patch, suffix) in train_patches:
    save_image(patch, f'DatasetProcessed/train_{suffix}/patch_{idx}.png')
    idx += 1

for (patch, suffix) in test_patches:
    save_image(patch, f'DatasetProcessed/test_{suffix}/patch_{idx}.png')
    idx += 1

# Face and Pose Detection

In [None]:
# The pose model returns keypoints by position only, so the index -> body-part
# names are maintained by hand here.

# The names are later used to classify each patch into
# 1) Full-body front/back view
# 2) Head-and-shoulder front/back view
# 3) Others

# Keypoint index -> body-part label; the list position IS the keypoint index.
# (The order presumably follows the pose model's 17-keypoint output -- confirm
# against the model documentation.)
POSE_MAP = dict(enumerate([
    # Face
    'Nose',
    'Left Eye',
    'Right Eye',
    'Left Ear',
    'Right Ear',
    # Upper
    'Left Shoulder',
    'Right Shoulder',
    'Left Elbow',
    'Right Elbow',
    'Left Hand',
    'Right Hand',
    # Lower
    'Left Hip',
    'Right Hip',
    'Left Leg',
    'Right Leg',
    'Left Foot',
    'Right Foot',
]))

In [None]:
# Use MTCNN to detect faces and landmarks
def detect_and_save_faces(batch, save_dir):
    """Run MTCNN on every patch in ``batch`` and save the detections as ``.npy``.

    Args:
        batch: Dict with ``'patch'`` (tensor indexed as ``[B, C, H, W]``, with
            values assumed to lie in ``[0, 1]`` -- they are scaled by 255) and
            ``'patch_path'`` (one source path per patch).
        save_dir: Output directory; created if missing.

    Side effects:
        Writes ``<save_dir>/<patch basename>.npy`` for each patch, holding a
        pickled dict ``{'boxes', 'probs', 'landmarks'}``. Read it back with
        ``np.load(path, allow_pickle=True).item()``. ``boxes`` is ``None``
        when no face was found.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Patch is [B, C, H, W], we have to process each individually as [H, W, C]
    patches, patch_path = batch['patch'], batch['patch_path']
    batch_size = patches.shape[0]

    for i in range(batch_size):
        patch_i = patches[i]
        patch_path_i = patch_path[i]

        # Convert to Numpy array of shape (H, W, C). No squeeze(): patches[i] is
        # already 3-D, and squeezing would drop a size-1 height or width.
        patch_i = patch_i.detach().cpu().numpy().transpose(1, 2, 0)

        # Convert to PIL
        patch_i = (patch_i * 255).astype(np.uint8)
        patch_i = Image.fromarray(patch_i)

        with torch.no_grad():
            # Without landmarks=True, detect() returns (boxes, probs), so the
            # second value is the detection probabilities, not landmarks.
            boxes, probs, landmarks = mtcnn.detect(patch_i, landmarks=True)

        save_object = {
            'boxes': boxes,
            'probs': probs,
            'landmarks': landmarks,
        }

        save_path = os.path.join(save_dir, os.path.basename(patch_path_i).replace('.png', '.npy'))
        np.save(save_path, save_object)

# Use YOLO to detect poses
def detect_and_save_poses(batch, save_dir):
    """Run the pose model on every patch in ``batch`` and save the keypoints.

    Args:
        batch: Dict with ``'patch'`` (tensor whose first dimension indexes the
            patches) and ``'patch_path'`` (one source path per patch).
        save_dir: Output directory; created if missing, matching
            ``detect_and_save_segmentation``.

    Side effects:
        Writes ``<save_dir>/<patch basename>.pt`` for each patch, containing
        the squeezed keypoint ``xy`` tensor moved to the CPU.
    """
    os.makedirs(save_dir, exist_ok=True)

    patches, patch_paths = batch['patch'], batch['patch_path']
    patches = patches.to(device)

    with torch.no_grad():
        # Predict poses and save to disk
        results = pose_model(patches, verbose=False)
        assert len(results) == patches.shape[0]

        for result, patch_path in zip(results, patch_paths):
            # Expected shape [1, 17, 2] for a single detected person; squeeze()
            # then drops the leading dimension (assumption -- the shape differs
            # when zero or several people are detected).
            keypoints_xy = result.keypoints.xy
            keypoints_xy = keypoints_xy.squeeze().cpu()

            # Save as .pt
            save_path = os.path.join(save_dir, os.path.basename(patch_path).replace('.png', '.pt'))
            torch.save(keypoints_xy, save_path)

In [None]:
patches_dir = './DatasetProcessed'
dirs = ['/train_A', '/train_B', '/test_A', '/test_B']

faces_dir = 'DatasetProcessed/faces'
poses_dir = 'DatasetProcessed/poses'

# Run detection unless BOTH output folders already exist. Skipping when only
# one of them exists would leave the other set of files missing, and the
# filtering step deletes every patch that lacks a face or pose file.
if not (os.path.isdir(faces_dir) and os.path.isdir(poses_dir)):
    # Create the folders up front so the detection helpers can write into them.
    os.makedirs(faces_dir, exist_ok=True)
    os.makedirs(poses_dir, exist_ok=True)

    # `split_dir` rather than `dir`, to avoid shadowing the builtin.
    for split_dir in dirs:
        # Path string intentionally unchanged (entries in `dirs` start with '/').
        dataset = create_patch_dataset(f'{patches_dir}/{split_dir}')
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

        for batch in tqdm(dataloader):
            detect_and_save_faces(batch, faces_dir)
            detect_and_save_poses(batch, poses_dir)

# Pose classification and final filtering

In [None]:
# Define enums for classification
FRONT_HEAD_AND_SHOULDERS = 0
BACK_HEAD_AND_SHOULDERS = 1

FRONT_FULL_BODY = 2
BACK_FULL_BODY = 3

OTHER = 4


def _face_detected(face):
    """Return True when ``face`` actually describes at least one detected face."""
    if face is None:
        return False
    if isinstance(face, dict):
        # Saved detection dict: 'boxes' is None (or empty) when nothing was found.
        boxes = face.get('boxes')
        return boxes is not None and len(boxes) > 0
    # Any other non-None value keeps the legacy meaning "a face was detected".
    return True


def classify_patch(face, pose):
    """Classify a patch by viewing direction and framing.

    Args:
        face: Saved face-detection result. Either ``None`` or a dict whose
            ``'boxes'`` entry is ``None`` when no face was found.
        pose: Keypoint array/tensor: ``[17, 2]`` for one person, ``[N, 17, 2]``
            for several (the first is used), or with a zero-length first
            dimension when nobody was detected.

    Returns:
        One of ``FRONT_HEAD_AND_SHOULDERS``, ``BACK_HEAD_AND_SHOULDERS``,
        ``FRONT_FULL_BODY``, ``BACK_FULL_BODY``, ``OTHER``; or ``None`` when
        there is no pose or too few body markers to decide.
    """
    if pose.shape[0] == 0:
        return None

    # If we detected multiple poses, we take the first one
    if pose.ndim == 3:
        pose = pose[0]

    # Map pose indices to body parts; a keypoint whose coordinates do not sum
    # to a positive value is treated as "not detected" and left out.
    pose_mapped = {
        v: pose[k] for k, v in POSE_MAP.items() if pose[k].sum() > 0
    }

    face_markers = [body_part in pose_mapped for body_part in ['Nose', 'Left Eye', 'Right Eye', 'Left Ear', 'Right Ear']]
    head_and_shoulders_markers = [body_part in pose_mapped for body_part in ['Left Shoulder', 'Right Shoulder', 'Left Ear', 'Right Ear']]
    full_body_markers = [body_part in pose_mapped for body_part in ['Left Hip', 'Right Hip', 'Left Leg', 'Right Leg', 'Left Foot', 'Right Foot']]

    # Front view needs both a real MTCNN detection and at least 4 of the 5 face
    # markers. `face` is normally the loaded dict, which is never None, so
    # testing `face is not None` alone would ignore the MTCNN result.
    front_view = _face_detected(face) and (sum(face_markers) >= 4)

    # Side view will have at least 3 face markers
    side_profile = sum(face_markers) >= 3

    # H&S should have torso markers (at least half of the shoulder/ear markers)
    head_and_shoulders = sum(head_and_shoulders_markers) >= len(head_and_shoulders_markers) // 2

    # Full body should have leg markers (at least a third of the lower-body markers)
    full_body = sum(full_body_markers) >= len(full_body_markers) // 3

    if front_view:
        if full_body:
            return FRONT_FULL_BODY
        elif head_and_shoulders:
            return FRONT_HEAD_AND_SHOULDERS
        else:
            return None
    elif side_profile:
        return OTHER
    else:
        if full_body:
            return BACK_FULL_BODY
        elif head_and_shoulders:
            return BACK_HEAD_AND_SHOULDERS
        else:
            return None

In [None]:
patches_dir = './DatasetProcessed'
dirs = ['/train_A', '/train_B', '/test_A', '/test_B']

# Folders holding the per-patch face / pose files.
faces_root = os.path.join(patches_dir, 'faces')
poses_root = os.path.join(patches_dir, 'poses')

# Patches with a smaller fraction of non-zero values than this are dropped
# as mostly background.
MIN_ACTIVE_FRACTION = 0.15

classifications = {}
for dir in dirs:
    dataset = create_patch_dataset(f'{patches_dir}/{dir}')
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    for batch in tqdm(dataloader):
        patch = batch['patch'][0]
        patch_path = batch['patch_path'][0]

        # Rebuild the face/pose paths the same way the detection helpers named
        # them (output dir + basename with a new extension). Replacing the
        # split folder inside patch_path only works while the path still
        # contains the doubled '/' from the '/train_A'-style entries.
        patch_name = os.path.basename(patch_path)
        face_path = os.path.join(faces_root, patch_name.replace('.png', '.npy'))
        pose_path = os.path.join(poses_root, patch_name.replace('.png', '.pt'))

        percent_active = (patch > 0).sum().item() / patch.numel()
        if percent_active < MIN_ACTIVE_FRACTION:
            # Only delete what exists: the face/pose file may be missing,
            # and an unconditional os.remove would raise FileNotFoundError.
            for stale_path in (patch_path, face_path, pose_path):
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            continue

        if not os.path.exists(face_path) or not os.path.exists(pose_path):
            os.remove(patch_path)
            continue

        # Both loads unpickle data: only load files written by this notebook.
        face = np.load(face_path, allow_pickle=True).item()
        pose = torch.load(pose_path)

        classification = classify_patch(face, pose)

        if classification is not None:
            classifications[patch_path] = classification

# Save to JSON
classifications_json_path = 'DatasetProcessed/classifications.json'

with open(classifications_json_path, 'w') as f:
    json.dump(classifications, f)

# Plot poses

In [None]:
# shuffle classifications
classification_english = {
    0: 'Front Head-and-Shoulders',
    1: 'Back Head-and-Shoulders',
    2: 'Front Full Body',
    3: 'Back Full Body',
    4: 'Other'
}

# Sorting by a fresh random key per item yields a random ordering.
classifications = dict(sorted(classifications.items(), key=lambda x: np.random.rand()))

# A 5x5 grid shows all 25 images collected per class.
rows, columns = 5, 5
IMAGES_PER_CLASS = rows * columns

plot_images = {k: [] for k in classification_english}

for patch_path, classification in classifications.items():
    if len(plot_images[classification]) >= IMAGES_PER_CLASS:
        continue

    plot_images[classification].append(Image.open(patch_path))

    # Stop early once every class has a full grid.
    if all(len(images) >= IMAGES_PER_CLASS for images in plot_images.values()):
        break

# For each pose, make a plot of up to 25 images with the title of the pose
for classification, images in plot_images.items():
    title = classification_english[classification]

    # A class may have few or no patches; plot what exists instead of indexing
    # a fixed number of images, which would raise IndexError.
    if not images:
        print(f'No patches classified as {title}; skipping plot')
        continue

    plt.figure(figsize=(10, 10))

    for j, patch in enumerate(images, start=1):
        plt.subplot(rows, columns, j)
        plt.imshow(patch)
        plt.axis('off')

    # Title first, so tight_layout() can take it into account.
    plt.suptitle(title)
    plt.tight_layout()
    plt.show()

# Break down pose distribution

In [None]:
# One counter per known class. Building it from `classification_english`
# includes class 4 ('Other'); a hard-coded range(4) left it out, so the loop
# below raised KeyError for any patch classified as 'Other'.
patches_by_class = {k: 0 for k in classification_english}

for classification in classifications.values():
    patches_by_class[classification] += 1

print('=== Patches by class ===')
for k, v in patches_by_class.items():
    print(f'{classification_english[k]}: {v}')

# Final Dataset Visualisation

In [None]:
# Randomly plot up to 50 patches
# Sorting by a fresh random key per item yields a random ordering.
classifications = dict(sorted(classifications.items(), key=lambda x: np.random.rand()))

rows, columns = 5, 10

plot_images = []
for patch_path in classifications:
    if len(plot_images) >= rows * columns:
        break

    plot_images.append(Image.open(patch_path))

plt.figure(figsize=(20, 10))

# Iterate over the images that exist: with fewer than rows * columns classified
# patches, indexing a fixed 50 entries would raise IndexError.
for i, image in enumerate(plot_images, start=1):
    plt.subplot(rows, columns, i)
    plt.imshow(image)
    plt.axis('off')

plt.suptitle('1.3 - Training Data Selection', fontsize=20)
plt.tight_layout()
plt.show()

# Training Models

While the two commands below could, in theory, be replaced with direct calls to the Python training functions,
in practice we call the bash scripts, which request a CUDA GPU from Slurm. We did not train or run inference from inside the Jupyter notebook, as that would only add redundant complexity.

In [None]:
# Train unconditional model
# IPython shell escape: runs ./train.sh with bash, relative to the notebook's
# working directory. The script itself is not shown in this notebook.
!bash ./train.sh

In [None]:
# Train conditional model
# IPython shell escape: runs ./train_conditional.sh with bash, relative to the
# notebook's working directory. The script itself is not shown in this notebook.
!bash ./train_conditional.sh

# Processing Video-Game Evaluation Patches

In [None]:
# Use larger model
# This rebinds the global that extract_human_patches() reads, so every call made
# after this cell uses the yolov8l weights. Re-running earlier cells without
# restarting the kernel will also pick up this larger model.
classification_model = YOLO('./pretrained-models/yolov8l.pt')

In [None]:
test_video_path = 'Data/Test/Test.mp4'
patches_save_dir = 'TestDataset/patches'

os.makedirs(patches_save_dir, exist_ok=True)

dataset = create_frame_dataset(test_video_path, dim=(640, 640))
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

# metadata[frame] is the list of {'coords': ...} records for that frame, in
# the same order as the saved frame{frame}_patch{k}.png files.
metadata = []

# For the test video every detected human is kept (no filtering), because each
# patch has to be pasted back into its frame later.
for frame_idx, frame_batch in enumerate(tqdm(dataloader)):
    frame_batch = frame_batch.to(device)

    with torch.no_grad():
        detections = extract_human_patches(frame_batch, lookback=[], filter=False)

        for patch_idx, detection in enumerate(detections):
            patch_file = os.path.join(patches_save_dir, f'frame{frame_idx}_patch{patch_idx}.png')
            save_image(detection['patch'], patch_file)

    # Frames without detections still get an (empty) entry, so list positions
    # stay aligned with frame indices.
    metadata.append([{'coords': detection['coords']} for detection in detections])

metadata_path = 'TestDataset/metadata.json'
with open(metadata_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file)

In [None]:
# Segment the extracted test patches
save_dir = 'TestDataset/segmentations'
os.makedirs(save_dir, exist_ok=True)

dataloader = DataLoader(
    dataset := create_patch_dataset(patches_save_dir),
    batch_size=32,
    shuffle=False,
    num_workers=0,
)

# One .pt file of person masks is written per patch that has any mask.
for patch_batch in tqdm(dataloader):
    detect_and_save_segmentation(patch_batch, save_dir)

In [None]:
patches_dir = 'TestDataset/patches'
dataset = create_patch_dataset(patches_dir)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

# Overwrite each test patch with its masked version (largest person mask only);
# patches without a usable segmentation are deleted.
for batch in tqdm(dataloader):
    patch = batch['patch'][0]
    patch_path = batch['patch_path'][0]
    # Note: str.replace swaps every occurrence of 'patches' in the path.
    segmentation_path = patch_path.replace('patches', 'segmentations').replace('.png', '.pt')

    # No segmentation file was written for this patch -> drop it.
    if not os.path.exists(segmentation_path):
        os.remove(patch_path)
        continue

    # === Open Segmentation ===
    segmentation = torch.load(segmentation_path)
    masks = segmentation['masks'].data
    conf = segmentation['conf']
    num_segmentations = masks.shape[0]

    # A file with zero person masks -> drop the patch as well.
    if num_segmentations == 0:
        os.remove(patch_path)
        continue

    if num_segmentations == 1:
        mask_idx = 0
    else:
        # Take biggest mask
        mask_idx = np.argmax([candidate.sum().item() for candidate in masks])

    mask = masks[mask_idx].unsqueeze(0)
    conf = conf[mask_idx]

    # Repeat the mask over the 3 colour channels and blank everything outside it.
    masked_patch = patch * mask.repeat(3, 1, 1)

    # Save masked patch
    save_image(masked_patch, patch_path)

In [None]:
# Faces and poses for the test patches
test_patches_dir = 'TestDataset/patches'
test_faces_dir = 'TestDataset/faces'
test_poses_dir = 'TestDataset/poses'

# Both output folders must exist before the detectors write into them.
for output_dir in (test_faces_dir, test_poses_dir):
    os.makedirs(output_dir, exist_ok=True)

dataset = create_patch_dataset(test_patches_dir)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

# One patch per batch: faces first, then poses, for the same patch.
for patch_batch in tqdm(dataloader):
    detect_and_save_faces(patch_batch, test_faces_dir)
    detect_and_save_poses(patch_batch, test_poses_dir)

# Run Model Inference

In [None]:
# IPython shell escape: runs ./inference.sh with bash, relative to the notebook's
# working directory (presumably inference for the unconditional model -- confirm
# against the script, which is not shown in this notebook).
!bash ./inference.sh

In [None]:
# IPython shell escape: runs ./inference_conditional.sh with bash, relative to
# the notebook's working directory (presumably inference for the conditional
# model -- confirm against the script, which is not shown in this notebook).
!bash ./inference_conditional.sh

In [None]:
def unsquare_pad(img, target_size):
    """Crop an image down to ``target_size`` using a roughly centred window.

    Args:
        img: Array indexed as ``[row, col, ...]``; only its first two
            dimensions are inspected.
        target_size: ``(target_h, target_w)`` of the window to keep.

    Returns:
        ``img[top:top + target_h, left:left + target_w]``, where each offset
        is ``abs((target - current) // 2)``.
    """
    current_h, current_w = img.shape[:2]
    target_h, target_w = target_size

    # Floor division of the (usually negative) difference rounds away from
    # zero, so an odd surplus puts the extra pixel before the window
    # (a surplus of 3 gives an offset of 2).
    top = abs((target_h - current_h) // 2)
    left = abs((target_w - current_w) // 2)

    row_window = slice(top, top + target_h)
    col_window = slice(left, left + target_w)
    return img[row_window, col_window]

# Stitch Game Together

In [None]:
# Paste the translated patches back into their frames and save each stitched frame.
test_patches_dir = 'TestDataset/patches'
test_video_path = 'Data/Test/Test.mp4'
# Folder the translated patches are read from (presumably written by the
# inference scripts -- confirm), and parent of the stitched output folder.
processed_frames_dir = 'TranslatedMovieOutput'
segmentations_dir = 'TestDataset/segmentations'
metadata_path = 'TestDataset/metadata.json'
save_dir = processed_frames_dir + '/stitched_frames'

os.makedirs(save_dir, exist_ok=True)

# metadata[i] is the list of {'coords': [x1, y1, x2, y2]} records for frame i.
with open(metadata_path, 'r') as f:
    metadata = json.load(f)

# Same dim=(640, 640) as at extraction time, so the stored coordinates refer to
# these frames.
dataset = create_frame_dataset(test_video_path, dim=(640, 640))
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

for i, batch in enumerate(tqdm(dataloader)):
    # [C, H, W] tensor -> [H, W, C] uint8 array (values assumed to lie in [0, 1]).
    frame = batch[0].permute(1, 2, 0).numpy()
    frame = (frame * 255).astype(np.uint8)
    frame_data = metadata[i]

    for j, patch_data in enumerate(frame_data):
        patch_path = f'{processed_frames_dir}/frame{i}_patch{j}.png'
        mask_path = f'{segmentations_dir}/frame{i}_patch{j}.pt'

        # Patches without a translated image or without masks keep their original pixels.
        if not os.path.exists(patch_path) or not os.path.exists(mask_path):
            continue

        converted_patch = Image.open(patch_path)
        converted_patch = np.array(converted_patch)

        # Pick the largest mask when several were saved for this patch.
        masks = torch.load(mask_path)['masks'].data
        if masks.shape[0] != 1:
            mask_areas = [mask.sum().item() for mask in masks]
            mask_idx = np.argmax(mask_areas)
            mask = masks[mask_idx]
        else:
            mask = masks[0]

        # Repeat the mask over 3 channels and convert to an [H, W, 3] NumPy array.
        # NOTE(review): .numpy() requires a CPU tensor -- confirm the masks are saved on CPU.
        mask = mask.unsqueeze(0).repeat(3, 1, 1).numpy().transpose(1, 2, 0)

        # Resize mask and patch to original size
        patch_coords = patch_data['coords']
        x1, y1, x2, y2 = patch_coords

        # First thing we need to do is undo the resizing, but for this we need to know what we square padded it to
        # thats easy, its just the max of the width and height
        dim = max(x2 - x1, y2 - y1)

        # resize to (dim, dim)
        converted_patch = cv2.resize(converted_patch, (dim, dim))
        mask = cv2.resize(mask, (dim, dim))

        # undo the square padding by cropping to (x2 - x1, y2 - y1)
        converted_patch = unsquare_pad(converted_patch, (y2 - y1, x2 - x1))
        mask = unsquare_pad(mask, (y2 - y1, x2 - x1))

        # Blend: translated pixels where the mask is 1, original frame pixels where it is 0.
        # The float result is cast back to uint8 on assignment into `frame`.
        frame[y1:y2, x1:x2] = converted_patch * mask + frame[y1:y2, x1:x2] * (1 - mask)

    # upscale
    # getbbox() is the bounding box of the non-zero pixels, so this crop removes
    # any all-black border (presumably padding added when the frame was made square -- confirm).
    frame = Image.fromarray(frame)
    frame = frame.crop(frame.getbbox())
    frame = np.array(frame)
    # cv2.resize takes (width, height): the output is 1080 wide by 720 high.
    frame = cv2.resize(frame, (1080, 720))

    # save processed frame
    # The frame is RGB here; cv2.imwrite expects BGR, hence the conversion.
    save_path = os.path.join(save_dir, f'frame{i}.png')
    cv2.imwrite(save_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

In [None]:
# Compile frames into video
output_video_path = 'TranslatedMovieOutput/translated_movie.mp4'


def _frame_number(path):
    """Return N for a path whose file name is ``frameN.png``, else ``None``."""
    stem, extension = os.path.splitext(os.path.basename(path))
    number = stem[len('frame'):]
    if extension == '.png' and stem.startswith('frame') and number.isdigit():
        return int(number)
    return None


# Keep only real frame files: stray entries such as '.DS_Store' would otherwise
# crash the numeric sort. os.path helpers are used instead of splitting on '/'
# so the parsing does not depend on the path separator.
frame_paths = [os.path.join(save_dir, f) for f in os.listdir(save_dir)]
frame_paths = [path for path in frame_paths if _frame_number(path) is not None]
frame_paths = sorted(frame_paths, key=_frame_number)

if not frame_paths:
    raise FileNotFoundError(f'No frameN.png files found in {save_dir}')

# Every frame is assumed to share the size of the first one.
frame = cv2.imread(frame_paths[0])
height, width, _ = frame.shape

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, 30, (width, height))

try:
    for frame_path in frame_paths:
        frame = cv2.imread(frame_path)
        out.write(frame)
finally:
    # Always release the writer so the container is closed even if a frame fails.
    out.release()

# Generate Analytics

In [None]:
# Frame numbers (as used in the stitched file names) shown in the comparison figure.
frames = ['54', '160', '317', '523', '588', '728', '787', '830', '1723', '1932']

rows, columns = 2, 10
fig, axs = plt.subplots(rows, columns, figsize=(20, 5))

# Row 0 shows the unconditional model's stitched frames, row 1 the conditional
# model's; each column is one frame number.
for i, frame in enumerate(frames):
    frame_unconditional_path = f'./TranslatedMovieOutput_Unconditional/stitched_frames/frame{frame}.png'
    frame_conditional_path = f'./TranslatedMovieOutput/stitched_frames/frame{frame}.png'

    frame_unconditional = cv2.imread(frame_unconditional_path)
    frame_conditional = cv2.imread(frame_conditional_path)

    labelled_images = (
        ('Unconditional', frame_unconditional),
        ('Conditional', frame_conditional),
    )
    for row, (label, image) in enumerate(labelled_images):
        axis = axs[row, i]
        # cv2.imread returns BGR; matplotlib expects RGB.
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        axis.axis('off')
        axis.set_title(f'{label} {frame}')

plt.suptitle('Comparison of Unconditional and Conditional Models', fontsize=20)
plt.tight_layout()
plt.show()