In [1]:
!pip install kaggle



In [2]:
!kaggle datasets download -d matthewjansen/ucf101-action-recognition

Dataset URL: https://www.kaggle.com/datasets/matthewjansen/ucf101-action-recognition
License(s): CC0-1.0
Downloading ucf101-action-recognition.zip to /content
100% 6.52G/6.53G [01:18<00:00, 121MB/s]
100% 6.53G/6.53G [01:18<00:00, 88.8MB/s]


In [3]:
import zipfile
import os
# /content/ucf101-action-recognition.zip
# Path to the zip file
zip_file_path = '/content/ucf101-action-recognition.zip'
extracted_path = '/content/ucf101-action-recognition'

# Unzipping the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Check if extraction was successful
extracted_files = os.listdir(extracted_path)
print("Files and directories after extraction:")
for file in extracted_files:
    print(file)


Files and directories after extraction:
test.csv
test
train.csv
val
train
val.csv


In [4]:
import pandas as pd
import os
extracted_path = './ucf101-action-recognition'

# Path to the CSV files
train_csv_path = os.path.join(extracted_path, 'train.csv')
val_csv_path = os.path.join(extracted_path, 'val.csv')

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)

# Display the first few rows of train.csv to understand its structure
print("Training Data Sample:")
print(train_df.head())

# Filter for the selected classes
selected_classes = ['Biking', 'SoccerPenalty', 'JumpingJack', 'BasketballDunk', 'VolleyballSpiking']

train_selected = train_df[train_df['label'].isin(selected_classes)]
val_selected = val_df[val_df['label'].isin(selected_classes)]

# Display filtered train and val datasets
print("\nFiltered Training Data:")
print(train_selected.head())

print("\nFiltered Validation Data:")
print(val_selected.head())


Training Data Sample:
         clip_name                         clip_path  label
0  v_Swing_g05_c02  /train/Swing/v_Swing_g05_c02.avi  Swing
1  v_Swing_g21_c03  /train/Swing/v_Swing_g21_c03.avi  Swing
2  v_Swing_g07_c01  /train/Swing/v_Swing_g07_c01.avi  Swing
3  v_Swing_g24_c04  /train/Swing/v_Swing_g24_c04.avi  Swing
4  v_Swing_g20_c03  /train/Swing/v_Swing_g20_c03.avi  Swing

Filtered Training Data:
                  clip_name                                     clip_path  \
1282  v_JumpingJack_g13_c03  /train/JumpingJack/v_JumpingJack_g13_c03.avi   
1283  v_JumpingJack_g14_c03  /train/JumpingJack/v_JumpingJack_g14_c03.avi   
1284  v_JumpingJack_g25_c02  /train/JumpingJack/v_JumpingJack_g25_c02.avi   
1285  v_JumpingJack_g01_c04  /train/JumpingJack/v_JumpingJack_g01_c04.avi   
1286  v_JumpingJack_g06_c06  /train/JumpingJack/v_JumpingJack_g06_c06.avi   

            label  
1282  JumpingJack  
1283  JumpingJack  
1284  JumpingJack  
1285  JumpingJack  
1286  JumpingJack  

Filtered 

In [5]:
import cv2
import os
import numpy as np
from tqdm import tqdm

# Path for saving frames
frame_save_path = '/content/frames_raw/'

# Ensure the directory for saving frames exists
os.makedirs(frame_save_path, exist_ok=True)

# Function to extract frames from a video
def extract_frames(video_path, class_name, clip_name, save_path, frame_size=(64, 64)):
    # Create a directory for the class if it doesn't exist
    class_save_path = os.path.join(save_path, class_name)
    os.makedirs(class_save_path, exist_ok=True)

    # Create a directory for the specific video clip
    clip_save_path = os.path.join(class_save_path, clip_name)
    os.makedirs(clip_save_path, exist_ok=True)

    # Open video file
    cap = cv2.VideoCapture(video_path)

    # Read frames and save them
    frame_num = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize the frame
        frame_resized = cv2.resize(frame, frame_size)

        # Convert to grayscale (optional)
        frame_gray = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2GRAY)

        # Save frame as an image
        frame_filename = f"{clip_name}_frame_{frame_num:03d}.jpg"
        cv2.imwrite(os.path.join(clip_save_path, frame_filename), frame_gray)

        frame_num += 1

    cap.release()

# Example: Extract frames from videos for the 'JumpingJack' class
# for class_name in ['Biking', 'SoccerPenalty', 'JumpingJack', 'BasketballDunk', 'VolleyballSpiking']:
#     for video in train_df[train_df['label'] == class_name]['clip_path']:
#         video_filename = video#.split('/')[-1]#.replace('.avi', '')
#         print(f"Extracting frames for {class_name} - {video_filename}")
#         extract_frames('./ucf101-action-recognition'+video, class_name, video_filename, frame_save_path)
for class_name in ['Biking', 'SoccerPenalty', 'JumpingJack', 'BasketballDunk', 'VolleyballSpiking']:
    for video in train_df[train_df['label'] == class_name]['clip_path']:
        video_filename = video.split('/')[-1].replace('.avi', '')
        # print(f"Extracting and augmenting frames for {class_name} - {video_filename}")
        extract_frames('/content/ucf101-action-recognition' + video, class_name, video_filename, frame_save_path)

In [6]:
frame_save_path = '/content/frames_val/'

for class_name in ['Biking', 'SoccerPenalty', 'JumpingJack', 'BasketballDunk', 'VolleyballSpiking']:
    for video in val_df[val_df['label'] == class_name]['clip_path']:
        video_filename = video.split('/')[-1].replace('.avi', '')
        # print(f"Extracting and augmenting frames for {class_name} - {video_filename}")
        extract_frames('/content/ucf101-action-recognition' + video, class_name, video_filename, frame_save_path)

In [7]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset loader
class VideoFrameDataset(Dataset):
    def __init__(self, root_dir, transform=None, input_frames=10, target_frames=5):
        self.root_dir = root_dir
        self.transform = transform
        self.input_frames = input_frames
        self.target_frames = target_frames
        self.data = []

        for class_dir in os.listdir(root_dir):
            class_path = os.path.join(root_dir, class_dir)
            for video_dir in os.listdir(class_path):
                video_path = os.path.join(class_path, video_dir)
                frames = sorted(os.listdir(video_path))
                self.data.append((video_path, frames))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_path, frames = self.data[idx]
        frames = sorted(frames)

        # Load frames as tensors
        input_frames = []
        target_frames = []
        for i, frame_name in enumerate(frames):
            frame_path = os.path.join(video_path, frame_name)
            img = Image.open(frame_path)
            if self.transform:
                img = self.transform(img)
            if i < self.input_frames:
                input_frames.append(img)
            elif i < self.input_frames + self.target_frames:
                target_frames.append(img)

        input_frames = torch.stack(input_frames)
        target_frames = torch.stack(target_frames)
        return input_frames, target_frames

# Define PredRNN (simplified version for frame prediction)
class PredRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(PredRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_frames = 10
target_frames = 5
input_dim = 64 * 64  # Assuming frames are resized to 64x64
hidden_dim = 512
output_dim = 64 * 64
num_layers = 2
batch_size = 16
epochs = 30
lr = 0.001

# Transforms
transform = transforms.Compose([
    transforms.Grayscale(),  # Convert to grayscale
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

# Load data
train_dataset = VideoFrameDataset(root_dir="frames_raw", transform=transform, input_frames=input_frames, target_frames=target_frames)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = VideoFrameDataset(root_dir="frames_val", transform=transform, input_frames=input_frames, target_frames=target_frames)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss, optimizer
model = PredRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Training loop
def train():
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets in tqdm(train_loader):
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten each frame
            targets = targets.view(targets.size(0), target_frames, -1).to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets[:, -1, :])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = validate()
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

# Validation loop
def validate():
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)
            targets = targets.view(targets.size(0), target_frames, -1).to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets[:, -1, :])
            val_loss += loss.item()
    return val_loss / len(val_loader)

if __name__ == "__main__":
    train()


100%|██████████| 30/30 [00:26<00:00,  1.13it/s]


Epoch 1/30, Train Loss: 0.0750, Val Loss: 0.0548


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 2/30, Train Loss: 0.0461, Val Loss: 0.0429


100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


Epoch 3/30, Train Loss: 0.0414, Val Loss: 0.0394


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 4/30, Train Loss: 0.0379, Val Loss: 0.0361


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 5/30, Train Loss: 0.0346, Val Loss: 0.0342


100%|██████████| 30/30 [00:22<00:00,  1.31it/s]


Epoch 6/30, Train Loss: 0.0325, Val Loss: 0.0328


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 7/30, Train Loss: 0.0320, Val Loss: 0.0322


100%|██████████| 30/30 [00:22<00:00,  1.33it/s]


Epoch 8/30, Train Loss: 0.0302, Val Loss: 0.0326


100%|██████████| 30/30 [00:22<00:00,  1.33it/s]


Epoch 9/30, Train Loss: 0.0294, Val Loss: 0.0317


100%|██████████| 30/30 [00:22<00:00,  1.31it/s]


Epoch 10/30, Train Loss: 0.0285, Val Loss: 0.0319


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 11/30, Train Loss: 0.0274, Val Loss: 0.0308


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 12/30, Train Loss: 0.0269, Val Loss: 0.0299


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 13/30, Train Loss: 0.0259, Val Loss: 0.0292


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 14/30, Train Loss: 0.0248, Val Loss: 0.0289


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 15/30, Train Loss: 0.0241, Val Loss: 0.0278


100%|██████████| 30/30 [00:22<00:00,  1.31it/s]


Epoch 16/30, Train Loss: 0.0242, Val Loss: 0.0274


100%|██████████| 30/30 [00:22<00:00,  1.32it/s]


Epoch 17/30, Train Loss: 0.0230, Val Loss: 0.0272


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 18/30, Train Loss: 0.0226, Val Loss: 0.0264


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 19/30, Train Loss: 0.0223, Val Loss: 0.0265


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 20/30, Train Loss: 0.0218, Val Loss: 0.0261


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 21/30, Train Loss: 0.0211, Val Loss: 0.0260


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 22/30, Train Loss: 0.0208, Val Loss: 0.0252


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 23/30, Train Loss: 0.0205, Val Loss: 0.0250


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 24/30, Train Loss: 0.0203, Val Loss: 0.0253


100%|██████████| 30/30 [00:22<00:00,  1.30it/s]


Epoch 25/30, Train Loss: 0.0196, Val Loss: 0.0244


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 26/30, Train Loss: 0.0192, Val Loss: 0.0247


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 27/30, Train Loss: 0.0193, Val Loss: 0.0254


100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


Epoch 28/30, Train Loss: 0.0194, Val Loss: 0.0241


100%|██████████| 30/30 [00:23<00:00,  1.29it/s]


Epoch 29/30, Train Loss: 0.0185, Val Loss: 0.0236


100%|██████████| 30/30 [00:23<00:00,  1.30it/s]


Epoch 30/30, Train Loss: 0.0180, Val Loss: 0.0236


In [8]:
# Save the entire model
torch.save(model, "predrnn_model.pth")

# Save only the state dictionary
torch.save(model.state_dict(), "predrnn_model_state.pth")


In [16]:
import torch
import os
from torchvision import transforms
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Define the test function for sequential predictions
def test_model_sequential(model, test_loader, output_dir="predicted_sequence", num_predicted_frames=10):
    model.eval()
    os.makedirs(output_dir, exist_ok=True)

    with torch.no_grad():
        for batch_idx, (inputs, _) in enumerate(test_loader):
            # Get the first batch (input sequence)
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten input frames

            # Start with the input sequence
            current_sequence = inputs[0]  # Take the first sequence in the batch
            predicted_frames = []

            # Generate sequential predictions
            for _ in range(num_predicted_frames):
                prediction = model(current_sequence.unsqueeze(0))  # Predict the next frame
                predicted_frames.append(prediction.cpu().view(64, 64).numpy())  # Save the prediction

                # Update the sequence for the next prediction
                current_sequence = torch.cat((current_sequence[1:], prediction), dim=0)

            # Save the predicted frames as a sequence
            for i, frame in enumerate(predicted_frames):
                frame_output_path = os.path.join(output_dir, f"batch_{batch_idx}_pred_frame_{i}.png")
                plt.imsave(frame_output_path, frame, cmap="gray")
                print(f"Saved predicted frame: {frame_output_path}")

            # Stop after the first sequence (remove this to test all sequences)
            break

# Load the trained model
model = PredRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).to(device)
model.load_state_dict(torch.load("predrnn_model_state.pth"))
print("Model loaded successfully.")

# Test the model for a single sequence
test_dataset = VideoFrameDataset(root_dir="frames_val", transform=transform, input_frames=input_frames, target_frames=target_frames)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Test one sequence at a time
test_model_sequential(model, test_loader)


Model loaded successfully.
Saved predicted frame: predicted_sequence/batch_0_pred_frame_0.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_1.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_2.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_3.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_4.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_5.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_6.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_7.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_8.png
Saved predicted frame: predicted_sequence/batch_0_pred_frame_9.png


  model.load_state_dict(torch.load("predrnn_model_state.pth"))


In [9]:
# import torch
# import os
# from torchvision import transforms
# from PIL import Image
# import numpy as np
# import matplotlib.pyplot as plt

# # Define the test function
# def test_model(model, test_loader, output_dir="predicted_frames"):
#     model.eval()
#     os.makedirs(output_dir, exist_ok=True)

#     with torch.no_grad():
#         for batch_idx, (inputs, targets) in enumerate(test_loader):
#             inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten input frames
#             predictions = model(inputs)  # Predict next frames
#             predictions = predictions.view(-1, 64, 64).cpu().numpy()  # Reshape predictions to image format

#             # Save the predicted frames
#             for i, frame in enumerate(predictions):
#                 frame_output_dir = os.path.join(output_dir, f"batch_{batch_idx}_frame_{i}.png")
#                 plt.imsave(frame_output_dir, frame, cmap="gray")
#                 print(f"Saved predicted frame: {frame_output_dir}")

# # Load the trained model
# model = PredRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).to(device)
# model.load_state_dict(torch.load("predrnn_model_state.pth"))
# print("Model loaded successfully.")

# # Test the model
# test_dataset = VideoFrameDataset(root_dir="frames_val", transform=transform, input_frames=input_frames, target_frames=target_frames)
# test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Test one sequence at a time
# test_model(model, test_loader)


  model.load_state_dict(torch.load("predrnn_model_state.pth"))


Model loaded successfully.
Saved predicted frame: predicted_frames/batch_0_frame_0.png
Saved predicted frame: predicted_frames/batch_1_frame_0.png
Saved predicted frame: predicted_frames/batch_2_frame_0.png
Saved predicted frame: predicted_frames/batch_3_frame_0.png
Saved predicted frame: predicted_frames/batch_4_frame_0.png
Saved predicted frame: predicted_frames/batch_5_frame_0.png
Saved predicted frame: predicted_frames/batch_6_frame_0.png
Saved predicted frame: predicted_frames/batch_7_frame_0.png
Saved predicted frame: predicted_frames/batch_8_frame_0.png
Saved predicted frame: predicted_frames/batch_9_frame_0.png
Saved predicted frame: predicted_frames/batch_10_frame_0.png
Saved predicted frame: predicted_frames/batch_11_frame_0.png
Saved predicted frame: predicted_frames/batch_12_frame_0.png
Saved predicted frame: predicted_frames/batch_13_frame_0.png
Saved predicted frame: predicted_frames/batch_14_frame_0.png
Saved predicted frame: predicted_frames/batch_15_frame_0.png
Saved p

In [18]:
import torch
import os
from torchvision import transforms
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset loader
class VideoFrameDataset(Dataset):
    def __init__(self, root_dir, transform=None, input_frames=10, target_frames=5):
        self.root_dir = root_dir
        self.transform = transform
        self.input_frames = input_frames
        self.target_frames = target_frames
        self.data = []

        for class_dir in os.listdir(root_dir):
            class_path = os.path.join(root_dir, class_dir)
            for video_dir in os.listdir(class_path):
                video_path = os.path.join(class_path, video_dir)
                frames = sorted(os.listdir(video_path))
                self.data.append((video_path, frames))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_path, frames = self.data[idx]
        frames = sorted(frames)

        # Load frames as tensors
        input_frames = []
        target_frames = []
        for i, frame_name in enumerate(frames):
            frame_path = os.path.join(video_path, frame_name)
            img = Image.open(frame_path)
            if self.transform:
                img = self.transform(img)
            if i < self.input_frames:
                input_frames.append(img)
            elif i < self.input_frames + self.target_frames:
                target_frames.append(img)

        input_frames = torch.stack(input_frames)
        target_frames = torch.stack(target_frames)
        return input_frames, target_frames

# Define PredRNN (simplified version for frame prediction)
class PredRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(PredRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Function to load frames from a directory
def load_frames_from_directory(directory, transform, num_frames):
    frames = sorted(os.listdir(directory))[:num_frames]
    images = []
    for frame in frames:
        frame_path = os.path.join(directory, frame)
        img = Image.open(frame_path)
        if transform:
            img = transform(img)
        images.append(img)
    return torch.stack(images)

# Define the test function with comparison
def test_model_with_comparison(model, input_dir, target_dir, output_dir="predicted_vs_actual", num_predicted_frames=10):
    model.eval()
    os.makedirs(output_dir, exist_ok=True)

    # Load the input frames
    input_frames = load_frames_from_directory(input_dir, transform, num_frames=input_frames).to(device)
    current_sequence = input_frames  # Start with input frames

    # Predict the next frames
    predicted_frames = []
    with torch.no_grad():
        for _ in range(num_predicted_frames):
            prediction = model(current_sequence.unsqueeze(0))  # Predict the next frame
            predicted_frames.append(prediction.cpu().view(64, 64).numpy())  # Save the prediction

            # Update the sequence for the next prediction
            current_sequence = torch.cat((current_sequence[1:], prediction), dim=0)

    # Load the actual target frames
    actual_frames = load_frames_from_directory(target_dir, transform, num_frames=num_predicted_frames)

    # Compare and save the frames
    for i, (pred_frame, actual_frame) in enumerate(zip(predicted_frames, actual_frames)):
        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        axs[0].imshow(pred_frame, cmap="gray")
        axs[0].set_title("Predicted Frame")
        axs[0].axis("off")
        axs[1].imshow(actual_frame[0].cpu().numpy(), cmap="gray")
        axs[1].set_title("Actual Frame")
        axs[1].axis("off")

        output_path = os.path.join(output_dir, f"comparison_frame_{i}.png")
        plt.savefig(output_path)
        plt.close(fig)
        print(f"Saved comparison: {output_path}")

# Load the trained model
model = PredRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).to(device)
model.load_state_dict(torch.load("predrnn_model_state.pth"))
print("Model loaded successfully.")

# Specify input and target directories
input_dir = "/content/frames_val/Biking/v_Biking_g02_c05"  # Path to directory with 10 input frames
target_dir = "/content/frames_val/Biking/v_Biking_g02_c05"  # Path to directory with target frames for comparison

# Test the model with comparison
test_model_with_comparison(model, input_dir, target_dir)


  model.load_state_dict(torch.load("predrnn_model_state.pth"))


FileNotFoundError: [Errno 2] No such file or directory: 'predrnn_model_state.pth'

In [14]:
import os
import cv2

# Directory containing the predicted frames
frames_directory = '/content/predicted_sequence'

# List all PNG files in the directory
png_files = sorted([file for file in os.listdir(frames_directory) if file.endswith('.png')])

# Check if there are any PNG files
if not png_files:
    print("No PNG files found in the directory.")
else:
    # Get the size of the first image to determine video dimensions
    first_image_path = os.path.join(frames_directory, png_files[0])
    first_image = cv2.imread(first_image_path)
    height, width, layers = first_image.shape
    size = (width, height)

    # Define the codec and create a VideoWriter object
    video_path = 'predicted_frames_video.avi'
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # Codec for AVI files
    video = cv2.VideoWriter(video_path, fourcc, 30, size)

    # Read each PNG file and write it to the video
    for file in png_files:
        image_path = os.path.join(frames_directory, file)
        frame = cv2.imread(image_path)
        video.write(frame)

    # Release the video writer
    video.release()

    print(f"Video created successfully: {video_path}")


Video created successfully: predicted_frames_video.avi


In [36]:
import torch
import torch.nn as nn

# class TransformerFramePredictor(nn.Module):
#     def __init__(self, input_dim, num_heads, num_layers, hidden_dim, seq_length):
#         super(TransformerFramePredictor, self).__init__()
#         self.input_dim = input_dim
#         self.seq_length = seq_length
#         self.embedding = nn.Linear(input_dim, hidden_dim)
#         self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, hidden_dim))
#         self.transformer = nn.Transformer(
#             d_model=hidden_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers
#         )
#         self.fc = nn.Linear(hidden_dim, input_dim)

#     def forward(self, src, tgt):
#         # src: (seq_length, batch_size, input_dim)
#         # tgt: (seq_length, batch_size, input_dim)
#         src = self.embedding(src) + self.positional_encoding
#         tgt = self.embedding(tgt) + self.positional_encoding
#         output = self.transformer(src, tgt)
#         output = self.fc(output)
#         return output
class TransformerFramePredictor(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers, hidden_dim, max_seq_length):
        super(TransformerFramePredictor, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Linear(input_dim, hidden_dim)
        self.register_buffer("positional_encoding", self._generate_positional_encoding(max_seq_length, hidden_dim))
        self.transformer = nn.Transformer(
            d_model=hidden_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def _generate_positional_encoding(self, max_seq_length, hidden_dim):
        # Generate positional encoding (max_seq_length, hidden_dim)
        position = torch.arange(max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2) * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        pe = torch.zeros(max_seq_length, hidden_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(1)  # Shape: (max_seq_length, 1, hidden_dim)

    def forward(self, src, tgt):
        # src: (seq_length, batch_size, input_dim)
        # tgt: (seq_length, batch_size, input_dim)
        src = self.embedding(src)
        tgt = self.embedding(tgt)

        # Ensure positional encoding matches the actual sequence length
        seq_length_src = src.size(0)
        seq_length_tgt = tgt.size(0)

        src_pos_enc = self.positional_encoding[:seq_length_src, :, :].to(src.device)
        tgt_pos_enc = self.positional_encoding[:seq_length_tgt, :, :].to(tgt.device)

        src = src + src_pos_enc
        tgt = tgt + tgt_pos_enc

        output = self.transformer(src, tgt)
        output = self.fc(output)
        return output


def load_frames_from_directory(directory, transform, num_frames):
    frames = sorted(os.listdir(directory))[:num_frames]
    images = []
    for frame in frames:
        frame_path = os.path.join(directory, frame)
        img = Image.open(frame_path)
        if transform:
            img = transform(img)
        images.append(img)
    return torch.stack(images)

In [37]:
import torch.optim as optim
from tqdm import tqdm

# Training function
def train_transformer_model(model, train_loader, val_loader, epochs, lr):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    best_val_loss = float('inf')

    for epoch in range(epochs):
        train_loss = 0.0
        model.train()
        for inputs, targets in tqdm(train_loader):
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten input frames
            targets = targets.view(targets.size(0), target_frames, -1).to(device)  # Flatten target frames

            src = inputs.permute(1, 0, 2)  # (seq_length, batch_size, input_dim)
            tgt = targets.permute(1, 0, 2)  # (seq_length, batch_size, input_dim)

            optimizer.zero_grad()
            predictions = model(src, tgt[:-1])
            loss = criterion(predictions, tgt[1:])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = validate_transformer_model(model, val_loader, criterion)
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "transformer_model_state.pth")
            print("Saved best model.")

# Validation function
def validate_transformer_model(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten input frames
            targets = targets.view(targets.size(0), target_frames, -1).to(device)  # Flatten target frames

            src = inputs.permute(1, 0, 2)  # (seq_length, batch_size, input_dim)
            tgt = targets.permute(1, 0, 2)  # (seq_length, batch_size, input_dim)

            predictions = model(src, tgt[:-1])
            loss = criterion(predictions, tgt[1:])
            val_loss += loss.item()
    return val_loss / len(val_loader)


In [39]:
import os
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import torch

class VideoFrameDataset(Dataset):
    def __init__(self, root_dir, transform=None, input_frames=10, target_frames=5):
        """
        Args:
            root_dir (str): Directory with all the video frames organized by classes and videos.
            transform (callable, optional): A function/transform to apply to the frames.
            input_frames (int): Number of input frames for the model.
            target_frames (int): Number of target frames to predict.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.input_frames = input_frames
        self.target_frames = target_frames
        self.data = []

        # Collect all videos
        for class_dir in os.listdir(root_dir):
            class_path = os.path.join(root_dir, class_dir)
            for video_dir in os.listdir(class_path):
                video_path = os.path.join(class_path, video_dir)
                frames = sorted(os.listdir(video_path))
                # Ensure sufficient frames for input and target
                if len(frames) >= input_frames + target_frames:
                    self.data.append((video_path, frames))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_path, frames = self.data[idx]

        # Select input and target frames
        input_frames = frames[:self.input_frames]
        target_frames = frames[self.input_frames:self.input_frames + self.target_frames]

        # Load frames
        input_tensors = []
        target_tensors = []

        for frame_name in input_frames:
            frame_path = os.path.join(video_path, frame_name)
            img = Image.open(frame_path)
            if self.transform:
                img = self.transform(img)
            input_tensors.append(img)

        for frame_name in target_frames:
            frame_path = os.path.join(video_path, frame_name)
            img = Image.open(frame_path)
            if self.transform:
                img = self.transform(img)
            target_tensors.append(img)

        # Stack tensors
        input_tensor = torch.stack(input_tensors)  # Shape: (input_frames, channels, height, width)
        target_tensor = torch.stack(target_tensors)  # Shape: (target_frames, channels, height, width)

        return input_tensor, target_tensor
transform = transforms.Compose([
    transforms.Grayscale(),  # Convert frames to grayscale (optional)
    transforms.Resize((64, 64)),  # Resize frames to 64x64 pixels
    transforms.ToTensor(),  # Convert frames to PyTorch tensors
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize pixel values to [-1, 1]
])
from torch.utils.data import DataLoader

# Parameters
batch_size = 16
input_frames = 10
target_frames = 10

# Training dataset and DataLoader
train_dataset = VideoFrameDataset(
    root_dir="frames_raw",
    transform=transform,
    input_frames=input_frames,
    target_frames=target_frames
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation dataset and DataLoader
val_dataset = VideoFrameDataset(
    root_dir="frames_val",
    transform=transform,
    input_frames=input_frames,
    target_frames=target_frames
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# Define PredRNN (simplified version for frame prediction)

# Hyperparameters
input_dim = 64 * 64  # Frame size after flattening
hidden_dim = 512
num_heads = 8
num_layers = 4
seq_length = 10
batch_size = 16
epochs = 20
lr = 0.001

# Initialize the Transformer model
# Initialize the Transformer model
model = TransformerFramePredictor(
    input_dim=64 * 64,  # Flattened input size
    num_heads=8,
    num_layers=4,
    hidden_dim=512,
    max_seq_length=input_frames + target_frames  # Max sequence length
).to(device)

# Train the model
train_transformer_model(model, train_loader, val_loader, epochs, lr)


# # Validation loop
# def validate():
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for inputs, targets in val_loader:
#             inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)
#             targets = targets.view(targets.size(0), target_frames, -1).to(device)

#             outputs = model(inputs)
#             loss = criterion(outputs, targets[:, -1, :])
#             val_loss += loss.item()
#     return val_loss / len(val_loader)

# if __name__ == "__main__":
#     train()



100%|██████████| 30/30 [00:32<00:00,  1.07s/it]


Epoch 1/20, Train Loss: 0.2957, Val Loss: 0.2197
Saved best model.


100%|██████████| 30/30 [00:32<00:00,  1.08s/it]


Epoch 2/20, Train Loss: 0.2443, Val Loss: 0.2253


100%|██████████| 30/30 [00:32<00:00,  1.10s/it]


Epoch 3/20, Train Loss: 0.2215, Val Loss: 0.2021
Saved best model.


100%|██████████| 30/30 [00:34<00:00,  1.15s/it]


Epoch 4/20, Train Loss: 0.2188, Val Loss: 0.1995
Saved best model.


100%|██████████| 30/30 [00:35<00:00,  1.18s/it]


Epoch 5/20, Train Loss: 0.2166, Val Loss: 0.2044


100%|██████████| 30/30 [00:34<00:00,  1.16s/it]


Epoch 6/20, Train Loss: 0.2282, Val Loss: 0.2335


100%|██████████| 30/30 [00:35<00:00,  1.17s/it]


Epoch 7/20, Train Loss: 0.2332, Val Loss: 0.2250


100%|██████████| 30/30 [00:35<00:00,  1.18s/it]


Epoch 8/20, Train Loss: 0.2286, Val Loss: 0.2246


100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Epoch 9/20, Train Loss: 0.2228, Val Loss: 0.2153


100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


Epoch 10/20, Train Loss: 0.2234, Val Loss: 0.2118


100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


Epoch 11/20, Train Loss: 0.2199, Val Loss: 0.2086


100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Epoch 12/20, Train Loss: 0.2130, Val Loss: 0.2143


100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Epoch 13/20, Train Loss: 0.2106, Val Loss: 0.2104


100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Epoch 14/20, Train Loss: 0.2106, Val Loss: 0.2088


100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


Epoch 15/20, Train Loss: 0.2092, Val Loss: 0.2134


100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


Epoch 16/20, Train Loss: 0.2083, Val Loss: 0.2102


100%|██████████| 30/30 [00:36<00:00,  1.22s/it]


Epoch 17/20, Train Loss: 0.2086, Val Loss: 0.2101


100%|██████████| 30/30 [00:36<00:00,  1.20s/it]


Epoch 18/20, Train Loss: 0.2074, Val Loss: 0.2107


100%|██████████| 30/30 [00:36<00:00,  1.20s/it]


Epoch 19/20, Train Loss: 0.2083, Val Loss: 0.2075


100%|██████████| 30/30 [00:35<00:00,  1.20s/it]


Epoch 20/20, Train Loss: 0.2076, Val Loss: 0.2092


In [40]:
# Save the entire model
torch.save(model, "transformer_model.pth")
print("Model saved as transformer_model.pth")


Model saved as transformer_model.pth


In [41]:
# Save only the model's state dictionary
torch.save(model.state_dict(), "transformer_model_state.pth")
print("Model state dictionary saved as transformer_model_state.pth")


Model state dictionary saved as transformer_model_state.pth


In [None]:
# Load the entire model
model = torch.load("transformer_model.pth")
model.eval()
print("Transformer model loaded successfully.")


In [42]:
# Recreate the model structure
model = TransformerFramePredictor(
    input_dim=64 * 64,
    num_heads=8,
    num_layers=4,
    hidden_dim=512,
    max_seq_length=input_frames + target_frames
).to(device)

# Load the state dictionary
model.load_state_dict(torch.load("transformer_model_state.pth"))
model.eval()
print("Transformer model state dictionary loaded successfully.")


Transformer model state dictionary loaded successfully.


  model.load_state_dict(torch.load("transformer_model_state.pth"))


In [44]:
# Function to test the Transformer model
def test_transformer_model(model, input_dir, target_dir, output_dir="transformer_predicted_vs_actual", num_predicted_frames=10):
    model.eval()
    os.makedirs(output_dir, exist_ok=True)

    # Load the input frames
    input_frames = load_frames_from_directory(input_dir, transform, num_frames=10).to(device)
    input_frames_flattened = input_frames.view(input_frames.size(0), -1)  # Flatten input frames
    input_frames_flattened = input_frames_flattened.unsqueeze(1)  # Add batch dimension

    # Start prediction
    predicted_frames = []
    with torch.no_grad():
        # Prepare the initial source and target
        src = input_frames_flattened  # Source sequence (input frames)
        tgt = src[-1:].repeat(num_predicted_frames, 1, 1)  # Start with the last frame as the target

        for i in range(num_predicted_frames):
            prediction = model(src, tgt)  # Predict the next frame
            predicted_frame = prediction[-1].cpu().view(64, 64).numpy()  # Extract the last frame
            predicted_frames.append(predicted_frame)

            # Update the source for the next iteration
            tgt = torch.cat((tgt[1:], prediction[-1:]), dim=0)

    # Load the actual target frames
    actual_frames = load_frames_from_directory(target_dir, transform, num_frames=num_predicted_frames)

    # Compare and save the frames
    for i, (pred_frame, actual_frame) in enumerate(zip(predicted_frames, actual_frames)):
        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        axs[0].imshow(pred_frame, cmap="gray")
        axs[0].set_title("Predicted Frame")
        axs[0].axis("off")
        axs[1].imshow(actual_frame[0].cpu().numpy(), cmap="gray")
        axs[1].set_title("Actual Frame")
        axs[1].axis("off")

        output_path = os.path.join(output_dir, f"comparison_frame_{i}.png")
        plt.savefig(output_path)
        plt.close(fig)
        print(f"Saved comparison: {output_path}")

# Load the trained Transformer model
# model = TransformerFramePredictor(
#     input_dim=64 * 64, num_heads=8, num_layers=6, hidden_dim=512, seq_length=10
# ).to(device)

# model.load_state_dict(torch.load("transformer_model_state.pth"))
print("Transformer model loaded successfully.")

# Specify input and target directories
input_dir = "/content/frames_val/BasketballDunk/v_BasketballDunk_g04_c02"  # Path to directory with 10 input frames
target_dir = "/content/frames_val/BasketballDunk/v_BasketballDunk_g04_c02"  # Path to directory with target frames for comparison

# Test the model with comparison
test_transformer_model(model, input_dir, target_dir)


Transformer model loaded successfully.
Saved comparison: transformer_predicted_vs_actual/comparison_frame_0.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_1.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_2.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_3.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_4.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_5.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_6.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_7.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_8.png
Saved comparison: transformer_predicted_vs_actual/comparison_frame_9.png
