### HMDB51 Dataset

#### First steps to get the files and labels for training, validation and testing

##### Variable names:

- train_files
- train_labels
- val_files
- val_labels
- test_files
- test_labels

In [14]:
# Because the commands in the skeleton code are for Linux, I ran them in Colab and downloaded the result
# to the HMDB51 directory.

# Split dataset into train and test

import glob
from collections import Counter
import os

# The 12 HMDB51 action classes used in this notebook; every other class is skipped.
keep_hmdb51 = ["clap", "climb", "drink", "jump", "pour", "ride_bike", "ride_horse",
               "run", "shoot_bow", "smoke", "throw", "wave"]

# Tag values found in the second column of the split files.
TRAIN_TAG, TEST_TAG = 1, 2
full_train_files, test_files = [], []
full_train_labels, test_labels = [], []
split_pattern_name = "*test_split1.txt"
# Build the path from its components so the lookup works on every OS
# (a hard-coded backslash only resolves on Windows).
split_pattern_path = os.path.join('HMDB51', 'test_train_splits', split_pattern_name)
# glob makes no ordering promise; sorting keeps the resulting lists reproducible.
annotation_paths = sorted(glob.glob(split_pattern_path))

for filepath in annotation_paths:
    # The class name is the file name minus its last two "_"-separated fields.
    # os.path.basename is used instead of splitting on '/', which fails when
    # the path contains Windows separators.
    base_name = os.path.basename(filepath)
    class_name = '_'.join(base_name.split('_')[:-2])

    if class_name not in keep_hmdb51:
        continue  # skipping the classes that we won't use.

    with open(filepath) as fid:
        for line in fid:
            fields = line.split()
            if not fields:
                continue  # tolerate blank or trailing empty lines
            video_filename, tag_string = fields
            tag = int(tag_string)
            # Lines carrying any other tag belong to neither set and are ignored.
            if tag == TRAIN_TAG:
                full_train_files.append(video_filename)
                full_train_labels.append(class_name)
            elif tag == TEST_TAG:
                test_files.append(video_filename)
                test_labels.append(class_name)

print(f'Train files ({len(full_train_files)}):\n\t{full_train_files}')
print(f'Train labels ({len(full_train_labels)}):\n\t{full_train_labels}\n'\
      f'Train Distribution:{list(Counter(sorted(full_train_labels)).items())}\n')
print(f'Test files ({len(test_files)}):\n\t{test_files}')
print(f'Test labels ({len(test_labels)}):\n\t{test_labels}\n'\
      f'Test Distribution:{list(Counter(sorted(test_labels)).items())}\n')
# Alphabetically sorted list of the classes that actually occur in the training data.
action_categories = sorted(set(full_train_labels))
print(f'Action categories ({len(action_categories)}):\n{action_categories}')

Train files (840):
	['#20_Rhythm_clap_u_nm_np1_fr_goo_0.avi', '#20_Rhythm_clap_u_nm_np1_fr_goo_1.avi', '#20_Rhythm_clap_u_nm_np1_fr_goo_2.avi', '#20_Rhythm_clap_u_nm_np1_le_goo_3.avi', '#20_Rhythm_clap_u_nm_np1_le_goo_4.avi', 'A_Round_of_Applause_clap_u_cm_np1_fr_med_0.avi', 'A_Round_of_Applause_clap_u_cm_np1_fr_med_1.avi', 'Applauding_Abby_clap_u_nm_np1_fr_med_0.avi', 'Applauding_Abby_clap_u_nm_np1_fr_med_1.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_0.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_1.avi', 'Baby_Bob_kann_klatschen_!_clap_u_cm_np1_fr_med_2.avi', 'Boom_Snap_Clap_clap_u_nm_np1_fr_med_0.avi', 'Boom_Snap_Clap_clap_u_nm_np1_fr_med_1.avi', 'Boom__Snap__Clap!_(Challenge)_clap_u_nm_np1_fr_med_1.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_0.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_1.avi', 'Budam_-_Clap_Hands_clap_u_nm_np1_fr_med_2.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_0.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_1.avi', 'Clap_Hands_clap_u_nm_np1_fr_med_2.

In [15]:
# Split the training data into training 90% and validation 10%

import random

# Fixed seed so that re-running this cell reproduces exactly the same split.
SPLIT_SEED = 42
# Fraction of the original training set that is held out for validation.
VAL_FRACTION = 0.1

# Keep the files and labels pairing
combined = list(zip(full_train_files, full_train_labels))
# A private generator is used so the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(combined)
# Unlike zip(*combined), these also work when the training list is empty.
shuffled_train_files = tuple(name for name, _ in combined)
shuffled_train_labels = tuple(label for _, label in combined)

# Validation is 10%
val_size = int(VAL_FRACTION * len(shuffled_train_files))

val_files = shuffled_train_files[:val_size]
val_labels = shuffled_train_labels[:val_size]
train_files = shuffled_train_files[val_size:]
train_labels = shuffled_train_labels[val_size:]

print(f'Train files ({len(train_files)}):\n\t{train_files}')
print(f'Train labels ({len(train_labels)}):\n\t{train_labels}\n'\
      f'Train Distribution:{list(Counter(sorted(train_labels)).items())}\n')

print(f'Validation files ({len(val_files)}):\n\t{val_files}')
print(f'Validation labels ({len(val_labels)}):\n\t{val_labels}\n'\
      f'Validation Distribution:{list(Counter(sorted(val_labels)).items())}\n')

Train files (756):
	('ArcherySVK_shoot_bow_u_cm_np1_fr_med_0.avi', 'L_cheln_Und_Winken_wave_h_cm_np1_fr_med_0.avi', 'Bier_richtig_einschenken_pour_u_cm_np1_fr_med_0.avi', 'Maya_beim_Winken_wave_f_cm_np1_fr_med_2.avi', 'Alifestyle_ride_horse_f_cm_np1_le_med_2.avi', 'Knifethrowing4knivestogethernospin_throw_u_nm_np1_ba_med_0.avi', 'Shootingarecurve_shoot_bow_u_cm_np1_fr_med_5.avi', 'boomsnapclap!_clap_u_nm_np1_fr_med_1.avi', 'Intro_to_Bartending_-_Lesson_2-Highball_Drinks_pour_u_nm_np1_ri_med_0.avi', 'David_beim_Fahrrad_fahren_X_x_ride_bike_f_cm_np1_ba_med_2.avi', 'ALeapToFreedom_ride_horse_f_cm_np1_ri_med_3.avi', 'megan_roof_climbing_climb_f_cm_np1_ri_goo_1.avi', 'metacafe_coolsoccer_run_f_cm_np3_fr_bad_6.avi', 'Sarah_und_die_Kletterwand_climb_f_cm_np1_ri_bad_1.avi', 'Sexy_Smoking_Girl_-_Teen_Smoking_Cigarette_smoke_u_nm_np1_fr_goo_1.avi', 'TheBoondockSaints_jump_u_cm_np1_fr_bad_83.avi', 'Fellowship_5_shoot_bow_h_cm_np1_fr_med_7.avi', 'THE_PROTECTOR_run_f_cm_np1_ba_med_35.avi', 'Rauchen

#### Plotting function to get train/val accuracy and loss

In [23]:
import matplotlib.pyplot as plt

def plot_training_validation_metrics(train_losses, val_losses, train_accuracies, val_accuracies, filename):
    """Draw the loss and accuracy curves and save each one as a PNG.

    Two 12x6 figures are created, one for loss and one for accuracy. Each
    shows the training curve (sky blue) and the validation curve (salmon)
    against the epoch index and is written to ``<filename>_loss.png`` and
    ``<filename>_accuracy.png`` respectively. The figures are not closed here.

    Args:
        train_losses: Per-epoch training loss values.
        val_losses: Per-epoch validation loss values.
        train_accuracies: Per-epoch training accuracy values.
        val_accuracies: Per-epoch validation accuracy values.
        filename: Prefix (may include a directory) for the two output files.
    """
    # One entry per figure:
    # (train curve, train label, val curve, val label, title, y-axis label, output path)
    figure_specs = (
        (train_losses, 'Training Loss', val_losses, 'Validation Loss',
         'Training and Validation Loss', 'Loss', f"{filename}_loss.png"),
        (train_accuracies, 'Training Accuracy', val_accuracies, 'Validation Accuracy',
         'Training and Validation Accuracy', 'Accuracy', f"{filename}_accuracy.png"),
    )

    for train_curve, train_label, val_curve, val_label, title, y_label, out_path in figure_specs:
        plt.figure(figsize=(12, 6))
        plt.plot(train_curve, label=train_label, color='skyblue')
        plt.plot(val_curve, label=val_label, color='salmon')
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.savefig(out_path)


#### Obtaining optical flow from HMDB51

This takes a few minutes: it creates all the folders and flow images in the optical_flow folder under the HMDB51 directory.

In [21]:
import cv2
import numpy as np
import os

# Root folder of the input videos; the processing loop treats every
# sub-directory of it as one class folder.
video_folder = 'HMDB51/video_data'
# Root folder for the generated flow images; the processing loop creates
# <flow_folder>/<class folder>/<video name>/ for every video.
flow_folder = 'HMDB51/optical_flow'
# Number of frames to process per video, taken from around the middle of the
# video (read as a global by compute_save_optical_flow).
num_frames = 16  

# Compute and save optical flow for a video - We kinda do a dense optical flow (Farneback)
def compute_save_optical_flow(video_path, output_folder):
    """Compute dense Farneback optical flow around the middle of a video and save it as PNGs.

    Up to ``num_frames`` (module-level setting) consecutive frames centred on
    the middle of the video are processed. For each of them the flow from the
    directly preceding frame is computed and written to ``output_folder`` as
    ``<video name>_flow_<index:04d>.png``.

    Image encoding (channel order as written by ``cv2.imwrite``):
        channel 0: flow magnitude, min-max scaled to 0..255 per frame
        channel 1: flow angle in degrees divided by 2
        channel 2: always 0

    Args:
        video_path: Path of the video file to read.
        output_folder: Existing directory that receives the flow images.

    Returns:
        The number of flow images written. This is smaller than
        ``num_frames`` when the video ends early.

    Raises:
        OSError: If the video cannot be opened or its reference frame cannot
            be read (previously this surfaced as an OpenCV error on a None frame).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        # Window of num_frames frames centred on the middle of the video.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        start_frame = max(0, total_frames // 2 - num_frames // 2)

        # The reference frame must be the frame directly before the first
        # processed one. Reading it from the very start of the video and only
        # then seeking made the first flow image compare frame 0 with a
        # mid-video frame (or frame 0 with itself for short videos).
        cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, start_frame - 1))
        ret, prev_frame = cap.read()
        if not ret:
            raise OSError(f"Could not read a frame from video: {video_path}")
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

        # Keep the original video name and just add "flow" and the frame number.
        video_stem = os.path.splitext(os.path.basename(video_path))[0]
        saved_count = 0

        for i in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                break  # video ended before num_frames frames were available
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
            prev_gray = gray

            # Encode the (dx, dy) flow vectors as magnitude/angle image channels.
            mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            flow_img = np.zeros_like(frame)
            flow_img[..., 0] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
            # cartToPolar returns radians; halving the degrees keeps 0..360 inside uint8.
            flow_img[..., 1] = ang * 180 / np.pi / 2

            flow_filename = os.path.join(output_folder, f"{video_stem}_flow_{i:04d}.png")
            if cv2.imwrite(flow_filename, flow_img):
                saved_count += 1

        return saved_count
    finally:
        # Release the capture even when reading or flow computation fails.
        cap.release()


In [22]:
# Visit every class directory under video_folder and write the optical flow of
# each .avi/.mp4 file into flow_folder/<class>/<video name>/.
print("Starting processing...")
for class_folder in os.listdir(video_folder):
    class_path = os.path.join(video_folder, class_folder)

    # Anything that is not a directory cannot be a class folder.
    if not os.path.isdir(class_path):
        print(f"Skipping non-directory item: {class_path}")
        continue

    print(f"Processing class directory: {class_folder}")
    for video_filename in os.listdir(class_path):
        # Only video files are processed; everything else is reported and skipped.
        if not video_filename.endswith(('.avi', '.mp4')):
            print(f"Skipping non-video file: {video_filename}")
            continue

        video_path = os.path.join(class_path, video_filename)
        # One output directory per video, named after the file without its extension.
        output_folder = os.path.join(flow_folder, class_folder, os.path.splitext(video_filename)[0])
        os.makedirs(output_folder, exist_ok=True)

        print(f"Processing video: {video_path}")
        print(f"Output will be saved in: {output_folder}")
        compute_save_optical_flow(video_path, output_folder)
        print(f"Processed video {video_filename}")

Starting processing...
Processing class directory: clap
Processing video: HMDB51/video_data\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_0.avi
Output will be saved in: HMDB51/optical_flow\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_0
Processed video #20_Rhythm_clap_u_nm_np1_fr_goo_0.avi
Processing video: HMDB51/video_data\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_1.avi
Output will be saved in: HMDB51/optical_flow\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_1
Processed video #20_Rhythm_clap_u_nm_np1_fr_goo_1.avi
Processing video: HMDB51/video_data\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_2.avi
Output will be saved in: HMDB51/optical_flow\clap\#20_Rhythm_clap_u_nm_np1_fr_goo_2
Processed video #20_Rhythm_clap_u_nm_np1_fr_goo_2.avi
Processing video: HMDB51/video_data\clap\#20_Rhythm_clap_u_nm_np1_le_goo_3.avi
Output will be saved in: HMDB51/optical_flow\clap\#20_Rhythm_clap_u_nm_np1_le_goo_3
Processed video #20_Rhythm_clap_u_nm_np1_le_goo_3.avi
Processing video: HMDB51/video_data\clap\#20_Rhythm_clap_u_nm_np1_le_goo_4.a

#### Create CNN for optical flow training on the HMDB51 dataset

In [None]:
# Defining the model

import torch
import torch.nn as nn
from torchvision.models import mobilenet_v2

class OpticalFlowMobileNet(nn.Module):
    """MobileNetV2-based classifier that takes a stack of optical-flow frames as input.

    The MobileNetV2 feature extractor is reused, but its first convolution is
    replaced by one that accepts ``num_flow_frames * flow_channels`` input
    channels. The replacement layer is freshly initialised, so it carries no
    pretrained weights even when ``pretrained`` is True. Features are
    global-average pooled and passed to a single linear classification layer.

    Args:
        num_classes: Number of output classes.
        num_flow_frames: Number of stacked optical-flow frames per sample.
        flow_channels: Channels per flow frame.
        pretrained: Passed to ``mobilenet_v2`` to load pretrained weights.
            NOTE(review): newer torchvision releases prefer a ``weights=``
            argument over ``pretrained=`` — confirm against the installed version.
    """

    def __init__(self, num_classes=12, num_flow_frames=16, flow_channels=2, pretrained=True):
        super().__init__()
        # Need to see what's best if we want pretrained or not
        original_model = mobilenet_v2(pretrained=pretrained)
        self.features = original_model.features
        # The stacked flow frames are fed as channels (default 16 * 2 = 32),
        # so the stem convolution is swapped for one with that many inputs.
        # The remaining settings are unchanged from the original replacement layer.
        self.features[0][0] = nn.Conv2d(
            num_flow_frames * flow_channels,
            32,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False,
        )

        # If we need it we can freeze some layers of the model
        # for param in self.features[:X].parameters():
        #     param.requires_grad = False

        # Classifier: one linear layer on top of the pooled features.
        self.classifier = nn.Sequential(
            nn.Linear(original_model.last_channel, num_classes)
        )

    def forward(self, x):
        """Return class scores for a batch ``x`` laid out as (batch, channels, height, width)."""
        x = self.features(x)
        # Global average pooling over the two spatial dimensions.
        x = x.mean([2, 3])
        x = self.classifier(x)
        return x


In [None]:
# The custom dataset - needs fixing

from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import numpy as np
import os
from PIL import Image

class OpticalFlowDataset(Dataset):
    """Dataset of optical-flow images stored in one sub-folder per class.

    ``data_directory`` must contain one sub-directory per class. Every image
    file found anywhere below a class directory becomes one sample. Numeric
    labels are the positions of the class names in alphabetical order; the
    mapping is kept in ``class_to_idx``.

    NOTE(review): each sample is a single image file. Stacking several flow
    frames of one video into a single multi-channel input is still not
    implemented here — confirm what the model expects before training.

    Args:
        data_directory: Folder that holds the class sub-folders.
        transforms: Optional callable applied to each loaded image.

    Raises:
        FileNotFoundError: If ``data_directory`` is not an existing directory.
    """

    # File extensions (lower case) that are treated as image samples.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, data_directory, transforms=None):
        self.data_directory = data_directory
        self.transforms = transforms
        # List of (file_path, numeric_label) tuples.
        self.samples = []
        # Class name -> numeric label.
        self.class_to_idx = {}
        self._prepare_dataset()

    def _prepare_dataset(self):
        """Fill ``self.samples`` with (file_path, label) tuples in a deterministic order."""
        if not os.path.isdir(self.data_directory):
            raise FileNotFoundError(
                f"Data directory does not exist: {self.data_directory}"
            )

        # Sorted class names give stable label ids across runs and machines.
        class_names = sorted(
            entry for entry in os.listdir(self.data_directory)
            if os.path.isdir(os.path.join(self.data_directory, entry))
        )
        self.class_to_idx = {name: index for index, name in enumerate(class_names)}

        self.samples = []
        for class_name in class_names:
            label = self.class_to_idx[class_name]
            class_dir = os.path.join(self.data_directory, class_name)
            # Walk recursively so that per-video sub-folders are picked up too.
            for dirpath, dirnames, filenames in os.walk(class_dir):
                dirnames.sort()  # in-place sort makes the traversal order deterministic
                for filename in sorted(filenames):
                    if filename.lower().endswith(self.IMAGE_EXTENSIONS):
                        self.samples.append((os.path.join(dirpath, filename), label))

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``, with ``transforms`` applied if set."""
        file_path, label = self.samples[idx]
        # Loads one image file per sample (see the class-level review note).
        flow_stack = Image.open(file_path)
        if self.transforms:
            flow_stack = self.transforms(flow_stack)
        return flow_stack, label

# Preprocessing applied to every loaded image, in this order:
# resize -> convert to tensor -> normalise.
# NOTE(review): the two-value mean/std only fits 2-channel tensors; an image
# loaded as RGB yields 3 channels after ToTensor — confirm the channel count.
flow_preprocessing_steps = [
    # Input resolution fed to the network.
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5], std=[0.5, 0.5]),
]
transform = transforms.Compose(flow_preprocessing_steps)



In [None]:
# Training the model

import torch

import os

# Instantiate your dataset
# TODO: replace the placeholder paths with the real train/validation folders.
train_dataset = OpticalFlowDataset(data_directory='path_to_train_data', transforms=transform)
val_dataset = OpticalFlowDataset(data_directory='path_to_val_data', transforms=transform)

# Fail early with a clear message: with an empty dataset the epoch averages
# below would otherwise end in a division by zero.
if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise ValueError(
        f"Empty dataset (train: {len(train_dataset)}, val: {len(val_dataset)} samples); "
        "check the data directories."
    )

# On Windows, DataLoader worker processes are spawned and must be able to
# import the dataset class, which does not work for a class defined in a
# notebook cell; load in the main process there.
num_workers = 0 if os.name == 'nt' else 4
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=num_workers)

model = OpticalFlowMobileNet(num_classes=12)  # Adjust num_classes based on your dataset
criterion = torch.nn.CrossEntropyLoss()  # For classification tasks
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adjust hyperparameters as needed

num_epochs = 25
# Per-epoch history, consumed by the plotting function after training.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss, running_corrects = 0.0, 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        # to obtain a correct per-sample average over the whole epoch.
        running_loss += loss.item() * inputs.size(0)
        # Accumulate as plain Python ints so the totals are ordinary numbers.
        running_corrects += (preds == labels).sum().item()

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_corrects / len(train_loader.dataset)
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    # ---- validation pass (no gradient tracking, no weight updates) ----
    model.eval()
    running_val_loss, running_val_corrects = 0.0, 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            running_val_loss += loss.item() * inputs.size(0)
            running_val_corrects += (preds == labels).sum().item()

    epoch_val_loss = running_val_loss / len(val_loader.dataset)
    epoch_val_acc = running_val_corrects / len(val_loader.dataset)
    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_acc)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_acc:.4f}, Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {epoch_val_acc:.4f}')

# Output prefix spelled "HMDB51" (was misspelled "HMDDB51").
plot_training_validation_metrics(train_losses, val_losses, train_accuracies, val_accuracies, 'HMDB51-OpticalFlow')
