In [26]:
import torch
from torch import nn
from torch.utils.data import Dataset
import os
import cv2
import json
import numpy as np
import pandas as pd
from transformers import BertTokenizer

In [34]:
# Lookup tables for encoding logged input events as numbers.
# NOTE(review): the 'Key.*' / 'Button.*' spellings look like pynput names —
# confirm against whatever recorder produced frame_logs.json.

# Keyboard key name -> index (position in the tuple below).
ACTION_MAP = {
    key_name: column
    for column, key_name in enumerate(
        ('w', 'a', 's', 'd', 'Key.space', 'Key.shift', 'q', 'e')
    )
}

# Mouse event type -> integer code; 'idle' is code 0.
EVENT_TYPE = dict(zip(('idle', 'click', 'release', 'move'), range(4)))

# Mouse button name -> integer code; 'idle' is code 0.
MOUSE_BUTTON = dict(zip(('idle', 'Button.left', 'Button.right'), range(3)))

In [68]:
class GameplayActionPairVideoDataset(Dataset):
    """Dataset of (instruction, video frames, per-step actions) triples.

    Every immediate subdirectory of ``root_dir`` that contains both a
    ``frame_logs.json`` and a ``gameplay.mp4`` becomes one sample. The JSON
    is expected to be an object with at least an ``'instruction'`` string and
    an ``'actions'`` list (see ``frame_logs_to_actions_tensor`` for the shape
    of each entry).

    Relies on the module-level lookup tables ``ACTION_MAP``, ``EVENT_TYPE``
    and ``MOUSE_BUTTON``.
    """

    # Slots appended after the keyboard slots in every action vector:
    # mouse event type, x position, y position, mouse button.
    NUM_MOUSE_FEATURES = 4

    def __init__(self, root_dir, tokenizer, transform=None):
        """
        Args:
            root_dir (string): Root directory containing the subdirectories with JSON and MP4 files.
            tokenizer (callable): Tokenizer called as
                ``tokenizer(text=..., return_tensors='pt', ...)``; its
                ``'input_ids'`` entry is returned as the instruction.
            transform (callable, optional): Optional transform applied to the
                frames tensor of a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.data = self._load_data()

    def _load_data(self):
        """Collect the annotation of every complete recording under ``root_dir``.

        Returns:
            list: One parsed ``frame_logs.json`` object per recording, each
            extended with a ``'video_path'`` key pointing at its MP4 file.
        """
        data = []
        # os.listdir returns entries in arbitrary order; sort so that a given
        # index always maps to the same recording across runs and machines.
        for subdir in sorted(os.listdir(self.root_dir)):
            subdir_path = os.path.join(self.root_dir, subdir)
            if not os.path.isdir(subdir_path):
                continue
            frame_logs = os.path.join(subdir_path, 'frame_logs.json')
            gameplay = os.path.join(subdir_path, 'gameplay.mp4')
            # Skip recordings that are missing either half of the pair.
            if not (os.path.isfile(frame_logs) and os.path.isfile(gameplay)):
                continue
            with open(frame_logs, 'r', encoding='utf-8') as file:
                annotation = json.load(file)
            annotation['video_path'] = gameplay
            data.append(annotation)
        return data

    def frame_logs_to_actions_tensor(self, actions):
        """Encode logged input events as a 2-D float tensor.

        Args:
            actions (list): One dict per logged step, each with
                ``'key_events'`` (list of dicts with a ``'key'`` entry) and
                ``'mouse_events'`` (list of dicts with ``'event_type'``,
                ``'position'`` as ``[x, y]`` and optionally ``'button'``).

        Returns:
            torch.Tensor: Shape ``(len(actions), len(ACTION_MAP) + 4)``.
            The first ``len(ACTION_MAP)`` columns are 1 where that key
            appears in the step's key events; the last four columns hold the
            mouse event-type code, x, y and button code. When a step has
            several mouse events, the last one wins. An empty ``actions``
            list yields a ``(0, len(ACTION_MAP) + 4)`` tensor.
        """
        num_keys = len(ACTION_MAP)
        num_actions = num_keys + self.NUM_MOUSE_FEATURES
        # Mouse slots follow directly after the keyboard slots.
        event_slot, x_slot, y_slot, button_slot = range(num_keys, num_actions)

        if not actions:
            # torch.stack rejects an empty list; keep the column count stable.
            return torch.zeros((0, num_actions))

        rows = []
        for action in actions:
            row = torch.zeros(num_actions)

            for key_event in action['key_events']:
                key_slot = ACTION_MAP.get(key_event['key'])
                # Keys that are not tracked are ignored.
                if key_slot is not None:
                    row[key_slot] = 1

            for mouse_event in action['mouse_events']:
                x_pos, y_pos = mouse_event['position'][0], mouse_event['position'][1]
                # Unknown event types / buttons fall back to code 0.
                row[event_slot] = EVENT_TYPE.get(mouse_event['event_type'], 0)
                row[x_slot] = x_pos
                row[y_slot] = y_pos
                row[button_slot] = MOUSE_BUTTON.get(mouse_event.get('button', 'idle'), 0)

            rows.append(row)
        return torch.stack(rows)

    def preprocess_frame(self, frame, target_size=(224, 224)):
        """Resize a frame to ``target_size`` and scale its values by 1/255.

        Args:
            frame: Image array as returned by ``cv2.VideoCapture.read``.
            target_size (tuple): Size passed to ``cv2.resize``.

        Returns:
            The resized frame divided by 255.0.
        """
        # Resize the frame
        resized_frame = cv2.resize(frame, target_size)

        # Normalize the frame (assuming model expects input in range [0, 1])
        normalized_frame = resized_frame / 255.0

        return normalized_frame

    def __len__(self):
        """Return the number of recordings found under ``root_dir``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one recording.

        Returns:
            tuple: ``(instruction, frames, actions)`` where ``instruction`` is
            the tokenizer's ``input_ids`` tensor, ``frames`` is a float32
            tensor of shape (T, C, H, W) and ``actions`` is the output of
            ``frame_logs_to_actions_tensor``.

        Raises:
            RuntimeError: If no frame could be read from the video file.
        """
        video_info = self.data[idx]
        video_path = video_info['video_path']

        # Read the video using OpenCV
        # NOTE: OpenCV decodes frames in BGR channel order and no colour
        # conversion is applied here.
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(self.preprocess_frame(frame))
        finally:
            # Always free the capture handle, even if preprocessing raised.
            cap.release()

        if not frames:
            raise RuntimeError(f"No frames could be read from {video_path!r}")

        # Convert list of frame into NumPy array
        frames = np.array(frames)

        # Convert to tensor and permute dimensions to T, C, H, W
        frames = torch.tensor(frames, dtype=torch.float32).permute(0, 3, 1, 2)
        instruction = self.tokenizer(text=video_info['instruction'],
                                     return_tensors='pt',
                                     padding=True,
                                     truncation=True,
                                     max_length=128
                                    )['input_ids']
        actions = self.frame_logs_to_actions_tensor(video_info['actions'])

        if self.transform:
            frames = self.transform(frames)

        return instruction, frames, actions

In [108]:
# Directory scanned for recordings; relative to the notebook's working directory.
root_dir = "output_logs"

In [109]:
# Build the tokenizer from the "bert-base-uncased" checkpoint, then index the
# recordings found under root_dir.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = GameplayActionPairVideoDataset(root_dir=root_dir, tokenizer=tokenizer)



In [110]:
# Fetch a single sample for inspection.
# NOTE(review): index 8 looks like an arbitrary pick — it requires at least
# nine recordings under root_dir; confirm or replace with a named constant.
instruction, frames, actions = dataset[8]

In [111]:
# Each line reports the tensor's shape (not its contents, despite the labels).
print(f"Instructions text: {instruction.shape}")
print(f"Video tensor shape: {frames.shape}")
print(f"Action example: {actions.shape}")

Instructions text: torch.Size([1, 3])
Video tensor shape: torch.Size([124, 3, 224, 224])
Action example: torch.Size([124, 12])


## Development code to check the unique values of each action

In [112]:
# Token ids the tokenizer produced for this sample's instruction.
instruction

tensor([[ 101, 5376,  102]])

In [113]:
# Displays the whole frames tensor; the repr is long, so consider slicing
# (e.g. a single frame) when sharing the notebook.
frames

tensor([[[[0.5137, 0.5137, 0.5216,  ..., 0.4588, 0.4392, 0.4745],
          [0.5137, 0.5137, 0.5216,  ..., 0.4588, 0.4667, 0.4745],
          [0.5373, 0.5569, 0.5216,  ..., 0.4667, 0.4745, 0.4745],
          ...,
          [0.4588, 0.4510, 0.4824,  ..., 0.4980, 0.4863, 0.5059],
          [0.4902, 0.5137, 0.4588,  ..., 0.4902, 0.5059, 0.5059],
          [0.4745, 0.4510, 0.4588,  ..., 0.4902, 0.5059, 0.5059]],

         [[0.4039, 0.4039, 0.4118,  ..., 0.4000, 0.3804, 0.4157],
          [0.4039, 0.4039, 0.4118,  ..., 0.4000, 0.4078, 0.4157],
          [0.4275, 0.4471, 0.4118,  ..., 0.4078, 0.4157, 0.4157],
          ...,
          [0.3529, 0.3451, 0.3765,  ..., 0.4000, 0.3882, 0.4078],
          [0.3843, 0.4078, 0.3529,  ..., 0.3922, 0.3961, 0.3961],
          [0.3686, 0.3451, 0.3529,  ..., 0.3922, 0.3961, 0.3961]],

         [[0.3725, 0.3725, 0.3804,  ..., 0.3843, 0.3647, 0.4000],
          [0.3725, 0.3725, 0.3804,  ..., 0.3843, 0.3922, 0.4000],
          [0.3961, 0.4157, 0.3804,  ..., 0

In [114]:
# Encoded action rows for this sample (one row per logged step).
actions

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])