#### Prepare HMDB51 dataset

In [7]:
import glob
from collections import Counter
import os

# The HMDB51 action classes used in this notebook; every other class found in
# the split directory is skipped.
keep_hmdb51 = ["clap", "climb", "drink", "jump", "pour", "ride_bike", "ride_horse",
        "run", "shoot_bow", "smoke", "throw", "wave"]

# Second column of each annotation line: 1 marks a training video, 2 a test
# video. Lines carrying any other tag are ignored by the loop below.
TRAIN_TAG, TEST_TAG = 1, 2
full_train_files, test_files = [], []
full_train_labels, test_labels = [], []
split_pattern_name = "*test_split1.txt"
# Join the directory components separately instead of embedding a backslash,
# so the pattern resolves on POSIX systems as well as on Windows.
split_pattern_path = os.path.join('HMDB51', 'test_train_splits', split_pattern_name)
# glob returns matches in arbitrary order; sorting keeps the file/label lists
# in a stable order from run to run.
annotation_paths = sorted(glob.glob(split_pattern_path))

for filepath in annotation_paths:
    # "<class>_test_split1.txt" -> "<class>". os.path.basename copes with
    # either path separator, unlike splitting the path on '/'.
    base_name = os.path.basename(filepath)
    class_name = '_'.join(base_name.split('_')[:-2])

    if class_name not in keep_hmdb51:
        continue  # skipping the classes that we won't use.
    with open(filepath) as fid:
        for line in fid:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            video_filename, tag_string = fields
            tag = int(tag_string)
            if tag == TRAIN_TAG:
                full_train_files.append(video_filename)
                full_train_labels.append(class_name)
            elif tag == TEST_TAG:
                test_files.append(video_filename)
                test_labels.append(class_name)

print(f'Train files ({len(full_train_files)}):\n\t{full_train_files}')
print(f'Train labels ({len(full_train_labels)}):\n\t{full_train_labels}\n'\
      f'Train Distribution:{list(Counter(sorted(full_train_labels)).items())}\n')
print(f'Test files ({len(test_files)}):\n\t{test_files}')
print(f'Test labels ({len(test_labels)}):\n\t{test_labels}\n'\
      f'Test Distribution:{list(Counter(sorted(test_labels)).items())}\n')
# Sorted, de-duplicated class names; a class's position in this list is used
# elsewhere in the notebook as its integer label.
action_categories = sorted(set(full_train_labels))
print(f'Action categories ({len(action_categories)}):\n{action_categories}')


# Split the training data into training 90% and validation 10%

import random

# Fixed seed for the shuffle below so the same videos land in validation on
# every run. With an unseeded shuffle the split changes per run, which means a
# checkpoint trained in one session may have seen videos that a later session
# treats as validation data.
SPLIT_SEED = 42

# Keep the files and labels pairing
combined = list(zip(full_train_files, full_train_labels))
# A private Random instance leaves the global random state untouched.
random.Random(SPLIT_SEED).shuffle(combined)
if combined:
    shuffled_train_files, shuffled_train_labels = zip(*combined)
else:
    # zip(*[]) yields nothing to unpack; fall back to empty tuples so an
    # empty training list produces empty splits instead of a ValueError.
    shuffled_train_files, shuffled_train_labels = (), ()

# Validation is 10%
val_size = int(0.1 * len(shuffled_train_files))

val_files = shuffled_train_files[:val_size]
val_labels = shuffled_train_labels[:val_size]
train_files = shuffled_train_files[val_size:]
train_labels = shuffled_train_labels[val_size:]

print(f'Train files ({len(train_files)}):\n\t{train_files}')
print(f'Train labels ({len(train_labels)}):\n\t{train_labels}\n'\
      f'Train Distribution:{list(Counter(sorted(train_labels)).items())}\n')

print(f'Validation files ({len(val_files)}):\n\t{val_files}')
print(f'Validation labels ({len(val_labels)}):\n\t{val_labels}\n'\
      f'Validation Distribution:{list(Counter(sorted(val_labels)).items())}\n')

Train files (840):
	['#20_Rhythm_clap_u_nm_np1_fr_goo_0.avi', '#20_Rhythm_clap_u_nm_np1_fr_goo_1.avi', '#20_Rhythm_clap_u_nm_np1_fr_goo_2.avi', '#20_Rhythm_clap_u_nm_np1_le_goo_3.avi', '#20_Rhythm_clap_u_nm_np1_le_goo_4.avi', 'A_Round_of_Applause_clap_u_cm_np1_fr_med_0.avi', 'A_Round_of_Applause_clap_u_cm_np1_fr_med_1.avi', 'Applauding_Abby_clap_u_nm_np1_fr_med_0.avi', 'Applauding_Abby_clap_u_nm_np1_fr_med_1.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_0.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_1.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_2.avi', 'Boom_Snap_Clap_clap_u_nm_np1_fr_med_0.avi', 'Boom_Snap_Clap_clap_u_nm_np1_fr_med_1.avi', 'Boom__Snap__Clap!_(Challenge)_clap_u_nm_np1_fr_med_1.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_0.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_1.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_2.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_0.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_1.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_2.

#### Defining the 2 CNN Streams (models) - No fusion model yet

In [13]:
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import cv2
import os
import torch.nn as nn

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Folder that holds the raw videos, one sub-folder per class
video_folder = 'video_data'

# Per-channel mean / std handed to Normalize
# NOTE(review): these look like the usual ImageNet statistics - confirm.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize to 224x224, convert to a tensor, normalize
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
]
data_transforms = transforms.Compose(_preprocessing_steps)

# Create custom dataset for HMDB51
class HMDB51Dataset(Dataset):
    """Dataset that yields the middle frame of each video with its class index.

    Args:
        file_list: Sequence of video file names (not full paths).
        labels: Sequence of class-name strings, parallel to ``file_list``.
            Each name is converted to its position in the module-level
            ``action_categories`` list.
        video_folder: Root folder; a video is read from
            ``<video_folder>/<class name>/<file name>``.
        transform: Optional callable applied to the PIL image of the frame.
    """

    def __init__(self, file_list, labels, video_folder, transform=None):
        self.file_list = file_list
        self.labels = [action_categories.index(label) for label in labels]  # Convert labels to integers
        self.video_folder = video_folder
        self.transform = transform

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.file_list)

    def __getitem__(self, index):
        """Return ``(frame, label)`` for the video at ``index``.

        ``frame`` is the middle frame converted from BGR to RGB, wrapped in a
        PIL image and passed through ``transform`` when one was given.
        ``label`` is the integer class index.

        Raises:
            ValueError: If the middle frame cannot be read.
        """
        video_name = self.file_list[index]
        label = self.labels[index]
        class_name = action_categories[label]
        video_path = os.path.join(self.video_folder, class_name, video_name)
        video = cv2.VideoCapture(video_path)
        try:
            frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
            middle_frame_idx = frame_count // 2
            video.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
            ret, frame = video.read()
        finally:
            # Release the capture on every path; otherwise each sample leaves
            # an open video handle behind.
            video.release()
        if not ret:
            raise ValueError(f"Failed to read middle frame from video: {video_path}")
        # OpenCV decodes to BGR; PIL and the transforms expect RGB.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        if self.transform:
            frame = self.transform(frame)
        return frame, label

# Frame-stream datasets: one per split, all reading from the same video
# folder and sharing the same preprocessing
train_dataset_f = HMDB51Dataset(
    file_list=train_files,
    labels=train_labels,
    video_folder=video_folder,
    transform=data_transforms,
)
val_dataset_f = HMDB51Dataset(
    file_list=val_files,
    labels=val_labels,
    video_folder=video_folder,
    transform=data_transforms,
)
test_dataset_f = HMDB51Dataset(
    file_list=test_files,
    labels=test_labels,
    video_folder=video_folder,
    transform=data_transforms,
)

# Batches of 32; only the training loader shuffles its samples
train_loader_f = DataLoader(dataset=train_dataset_f, batch_size=32, shuffle=True)
val_loader_f = DataLoader(dataset=val_dataset_f, batch_size=32, shuffle=False)
test_loader_f = DataLoader(dataset=test_dataset_f, batch_size=32, shuffle=False)

# Frame-stream network: two conv/ReLU/max-pool stages and a two-layer classifier
class CustomCNN(nn.Module):
    """Small CNN mapping a 3-channel image batch to ``num_classes`` scores.

    The first linear layer takes 32 * 56 * 56 inputs, which matches 224x224
    images after the two stride-2 pooling stages. Layer attribute names are
    part of the saved state-dict keys and must not be renamed.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Stage 1: 3 -> 16 channels, spatial size halved by the pool
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Stage 2: 16 -> 32 channels, spatial size halved again
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head
        self.fc1 = nn.Linear(32 * 56 * 56, 128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the raw class scores for the image batch ``x``."""
        stage1 = self.maxpool1(self.relu1(self.conv1(x)))
        stage2 = self.maxpool2(self.relu2(self.conv2(stage1)))
        # Collapse everything except the batch dimension
        flat = stage2.view(stage2.size(0), -1)
        hidden = self.relu3(self.fc1(flat))
        return self.fc2(hidden)
    
# Load the pretrained model
pretrained_model_f = CustomCNN(num_classes=len(action_categories))
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU machine still loads when only the CPU is available.
frame_state_dict = torch.load('HMDB51-Frames.pth', map_location=device)
pretrained_model_f.load_state_dict(frame_state_dict)
pretrained_model_f = pretrained_model_f.to(device)


In [14]:
# Class name -> integer class id; ids follow the order of the names below (0..11)
_flow_class_names = (
    "clap",
    "climb",
    "drink",
    "jump",
    "pour",
    "ride_bike",
    "ride_horse",
    "run",
    "shoot_bow",
    "smoke",
    "throw",
    "wave",
)
label_to_index = {name: position for position, name in enumerate(_flow_class_names)}

class OpticalFlowDataset(Dataset):
    """Dataset yielding the 16 optical-flow images of a video as one tensor.

    Flow images are looked up at
    ``<data_directory>/optical_flow/<label>/<video stem>/<video stem>_flow_NNNN.png``
    for ``NNNN`` in ``0000``..``0015``.

    Args:
        data_directory: Root directory that contains the ``optical_flow`` folder.
        file_names: Sequence of video file names; the extension is stripped
            to obtain the per-video folder name.
        labels: Sequence of class-name strings, parallel to ``file_names``.
            Each must be a key of the module-level ``label_to_index``.
        transforms: Callable applied to every flow image. It has to return
            tensors, because the per-frame results are concatenated with
            ``torch.cat``.
    """

    # Number of flow images expected per video.
    NUM_FLOW_FRAMES = 16

    def __init__(self, data_directory, file_names, labels, transforms=None):
        # directory of the classes folders
        self.data_directory = data_directory
        # train files or validation files
        self.file_names = file_names
        self.labels = labels
        self.transforms = transforms
        self.samples = self._prepare_dataset()

    def _prepare_dataset(self):
        """Build, per video, the list of ``(flow_image_path, label)`` pairs."""
        samples = []
        for video_name, label in zip(self.file_names, self.labels):
            # remove .avi from video name to use for directory
            video_name = os.path.splitext(video_name)[0]

            # label is the class folder name
            flow_images_path = os.path.join(self.data_directory, 'optical_flow', label, video_name)
            samples.append([
                (os.path.join(flow_images_path, f'{video_name}_flow_{i:04d}.png'), label)
                for i in range(self.NUM_FLOW_FRAMES)
            ])
        return samples

    def __len__(self):
        """Return the number of videos."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(flow_tensor, label_index)`` for the video at ``idx``.

        ``flow_tensor`` is the concatenation along dim 0 of the transformed
        flow images; ``label_index`` is the integer id from ``label_to_index``.
        """
        sample = self.samples[idx]

        flow_images = []
        for file_path, _ in sample:
            # Close the file as soon as the pixels are decoded. convert('RGB')
            # returns an in-memory copy and guarantees 3 channels per frame,
            # which is what the 16 x 3 = 48 input channels downstream rely on.
            with Image.open(file_path) as opened_image:
                flow_images.append(opened_image.convert('RGB'))
        if self.transforms:
            flow_images = [self.transforms(flow_image) for flow_image in flow_images]
        # Concatenate (not stack) so the frames share the channel dimension
        # instead of adding an extra one.
        flow_images_concatenated = torch.cat(flow_images, dim=0)
        # All frames of one video share the same label
        label = sample[0][1]
        # The loss function needs a numeric class id rather than the class name
        label_index = label_to_index[label]

        return flow_images_concatenated, label_index
    
# Settings shared by every optical-flow dataset split
_flow_dataset_options = {
    'data_directory': 'HMDB51',
    'transforms': data_transforms,
}

# Optical-flow datasets for the training and validation splits
train_dataset_of = OpticalFlowDataset(
    file_names=train_files, labels=train_labels, **_flow_dataset_options
)
val_dataset_of = OpticalFlowDataset(
    file_names=val_files, labels=val_labels, **_flow_dataset_options
)

# Batches of 32; only the training loader shuffles its samples
train_loader_of = DataLoader(dataset=train_dataset_of, batch_size=32, shuffle=True)
val_loader_of = DataLoader(dataset=val_dataset_of, batch_size=32, shuffle=False)

class OpticalFlowCNN(nn.Module):
    """CNN mapping a 48-channel stack of flow images to ``num_classes`` scores.

    The first linear layer takes 256 * 14 * 14 inputs, which matches 224x224
    inputs after the stride-2 first convolution and the three stride-2 pools.
    Layer attribute names are part of the saved state-dict keys and must not
    be renamed.
    """

    def __init__(self, num_classes=12):
        super().__init__()

        # Block 1 - 48 input channels: 16 frames with 3 channels each
        self.conv1 = nn.Conv2d(48, 64, kernel_size=3, stride=2, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Block 2
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Block 3 - two convolutions before a single pool
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.relu3 = nn.ReLU()
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.relu4 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.flatten = nn.Flatten()

        # Classifier head with dropout between the two linear layers
        self.fc1 = nn.Linear(256 * 14 * 14, 1024)
        self.relu_fc1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the raw class scores for the flow batch ``x``."""
        # Block 1
        out = self.conv1(x)
        out = self.relu1(out)
        out = self.pool1(out)
        # Block 2
        out = self.conv2(out)
        out = self.relu2(out)
        out = self.pool2(out)
        # Block 3
        out = self.conv3(out)
        out = self.relu3(out)
        out = self.conv4(out)
        out = self.relu4(out)
        out = self.pool3(out)
        # Classifier head
        out = self.flatten(out)
        out = self.fc1(out)
        out = self.relu_fc1(out)
        out = self.dropout1(out)
        return self.fc2(out)

# Load the pretrained model
pretrained_model_of = OpticalFlowCNN(num_classes=len(action_categories))
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU machine still loads when only the CPU is available.
flow_state_dict = torch.load('HMDB51-OpticalFlow.pth', map_location=device)
pretrained_model_of.load_state_dict(flow_state_dict)
pretrained_model_of = pretrained_model_of.to(device)

#### Fusion of the models

In [None]:
class TwoStreamModel(nn.Module):
    """Run a frame model and a flow model side by side and concatenate outputs.

    Args:
        frame_model: Module applied to the frame input.
        flow_model: Module applied to the optical-flow input.
    """

    # The constructor must be spelled __init__ (double underscores); with
    # single underscores Python never calls it and the sub-models are never
    # registered on the module.
    def __init__(self, frame_model, flow_model):
        super().__init__()
        self.frame_model = frame_model
        self.flow_model = flow_model

        # Need to add more layers after fusion

    def forward(self, x_frame, x_flow):
        """Return both stream outputs concatenated along dim 1.

        No layers follow the concatenation yet, so the result is simply the
        two sub-model outputs placed side by side.
        """
        x_frame = self.frame_model(x_frame)
        x_flow = self.flow_model(x_flow)

        # Fusion here, unsure yet
        # Concatenate the two outputs
        x = torch.cat((x_frame, x_flow), dim=1)

        # Add more layers here

        return x
    
# Wrap the two pretrained streams in the fusion model and move it to the device
two_stream_model = TwoStreamModel(
    pretrained_model_f,
    pretrained_model_of,
).to(device)

class TwoStreamDataset(torch.utils.data.Dataset):
    """Pair a frame dataset with a flow dataset, index by index.

    Both datasets must be indexable, yield ``(data, label)`` pairs, and list
    the same videos in the same order. The label returned is the one from the
    frame dataset.

    Args:
        frame_dataset: Dataset yielding ``(frame, label)``.
        flow_dataset: Dataset yielding ``(flow, label)``.

    Raises:
        ValueError: If the two datasets differ in length.
    """

    # Special methods need double underscores; spelled _init_ / _len_ /
    # _getitem_ they are ordinary methods that construction, len() and
    # indexing never call.
    def __init__(self, frame_dataset, flow_dataset):
        if len(frame_dataset) != len(flow_dataset):
            raise ValueError(
                "frame_dataset and flow_dataset must have the same length, got "
                f"{len(frame_dataset)} and {len(flow_dataset)}"
            )
        self.frame_dataset = frame_dataset
        self.flow_dataset = flow_dataset

    def __len__(self):
        """Return the number of paired samples."""
        return len(self.frame_dataset)

    def __getitem__(self, index):
        """Return ``(frame, flow, label)`` for the sample at ``index``."""
        frame, label = self.frame_dataset[index]
        flow, _ = self.flow_dataset[index]
        return frame, flow, label
    
# No optical-flow dataset exists for the test split yet, so build one with the
# same settings as the training and validation flow datasets.
test_dataset_of = OpticalFlowDataset(
    data_directory='HMDB51',
    file_names=test_files,
    labels=test_labels,
    transforms=data_transforms
)

# Pair each frame dataset with the optical-flow dataset built from the same
# file list, so index i refers to the same video in both streams. The flow
# argument has to be a dataset yielding (flow, label) pairs, not the raw list
# of file names.
train_two_stream_dataset = TwoStreamDataset(train_dataset_f, train_dataset_of)
val_two_stream_dataset = TwoStreamDataset(val_dataset_f, val_dataset_of)
test_two_stream_dataset = TwoStreamDataset(test_dataset_f, test_dataset_of)
