In [1]:
!pip install kaggle



In [2]:
# Download the UCF101 action-recognition archive with the Kaggle CLI.
# The recorded output shows the zip being written to /content, which is where
# the unzip cell looks for it.
# NOTE(review): presumably needs Kaggle API credentials configured beforehand — confirm.
!kaggle datasets download -d matthewjansen/ucf101-action-recognition

Dataset URL: https://www.kaggle.com/datasets/matthewjansen/ucf101-action-recognition
License(s): CC0-1.0
Downloading ucf101-action-recognition.zip to /content
100% 6.52G/6.53G [04:56<00:00, 22.4MB/s]
100% 6.53G/6.53G [04:56<00:00, 23.6MB/s]


In [3]:
import zipfile
import os
# Where the downloaded archive lives and where its contents are unpacked
zip_file_path = '/content/ucf101-action-recognition.zip'
extracted_path = '/content/ucf101-action-recognition'

# Unpack every member of the archive below extracted_path
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_path)

# Sanity check: list the top-level entries that now exist
extracted_files = os.listdir(extracted_path)
print("Files and directories after extraction:")
for entry in extracted_files:
    print(entry)


Files and directories after extraction:
val
val.csv
train
test
train.csv
test.csv


In [4]:
import pandas as pd
import os
# Root of the extracted dataset. Use the same absolute location the archive was
# unpacked to (and that the frame-extraction cells read from), so this cell
# does not depend on the current working directory.
extracted_path = '/content/ucf101-action-recognition'

# Path to the CSV files (one manifest per split)
train_csv_path = os.path.join(extracted_path, 'train.csv')
val_csv_path = os.path.join(extracted_path, 'val.csv')

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)

# Display the first few rows of train.csv to understand its structure
# (columns seen in the recorded output: clip_name, clip_path, label)
print("Training Data Sample:")
print(train_df.head())

# Filter for the selected classes
selected_classes = ['Biking', 'SoccerPenalty', 'JumpingJack', 'BasketballDunk', 'VolleyballSpiking']

train_selected = train_df[train_df['label'].isin(selected_classes)]
val_selected = val_df[val_df['label'].isin(selected_classes)]

# Display filtered train and val datasets
print("\nFiltered Training Data:")
print(train_selected.head())

print("\nFiltered Validation Data:")
print(val_selected.head())


Training Data Sample:
         clip_name                         clip_path  label
0  v_Swing_g05_c02  /train/Swing/v_Swing_g05_c02.avi  Swing
1  v_Swing_g21_c03  /train/Swing/v_Swing_g21_c03.avi  Swing
2  v_Swing_g07_c01  /train/Swing/v_Swing_g07_c01.avi  Swing
3  v_Swing_g24_c04  /train/Swing/v_Swing_g24_c04.avi  Swing
4  v_Swing_g20_c03  /train/Swing/v_Swing_g20_c03.avi  Swing

Filtered Training Data:
                  clip_name                                     clip_path  \
1282  v_JumpingJack_g13_c03  /train/JumpingJack/v_JumpingJack_g13_c03.avi   
1283  v_JumpingJack_g14_c03  /train/JumpingJack/v_JumpingJack_g14_c03.avi   
1284  v_JumpingJack_g25_c02  /train/JumpingJack/v_JumpingJack_g25_c02.avi   
1285  v_JumpingJack_g01_c04  /train/JumpingJack/v_JumpingJack_g01_c04.avi   
1286  v_JumpingJack_g06_c06  /train/JumpingJack/v_JumpingJack_g06_c06.avi   

            label  
1282  JumpingJack  
1283  JumpingJack  
1284  JumpingJack  
1285  JumpingJack  
1286  JumpingJack  

Filtered 

In [5]:
import cv2
import os
import numpy as np
from tqdm import tqdm

# Root directory for the frames extracted from the training clips;
# extract_frames() creates <class>/<clip>/ sub-directories underneath it.
frame_save_path = '/content/frames_raw/'

# Create the root up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(frame_save_path, exist_ok=True)

# Function to extract frames from a video
def extract_frames(video_path, class_name, clip_name, save_path, frame_size=(64, 64)):
    """Decode a video and save every frame as a resized grayscale JPEG.

    Frames are written to ``<save_path>/<class_name>/<clip_name>/`` and named
    ``<clip_name>_frame_<NNN>.jpg``, numbered from 000 in decoding order.

    Args:
        video_path: Path of the video file to decode.
        class_name: Name of the class sub-directory.
        clip_name: Name of the clip sub-directory, also used as filename prefix.
        save_path: Root directory under which all frames are stored.
        frame_size: Target size handed to ``cv2.resize`` (width, height).

    Returns:
        The number of frames written. 0 when the video could not be opened,
        in which case no clip directory is created.
    """
    import warnings

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # An unreadable video used to leave an empty clip directory behind
            # without any hint; report it and create nothing instead.
            warnings.warn(f"Could not open video: {video_path}")
            return 0

        # makedirs creates the class directory and the clip directory in one go
        clip_save_path = os.path.join(save_path, class_name, clip_name)
        os.makedirs(clip_save_path, exist_ok=True)

        frame_num = 0  # index of the decoded frame, used in the filename
        written = 0    # frames that actually reached the disk
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # end of stream or decode failure

            # Resize first, then drop the colour channels
            frame_resized = cv2.resize(frame, frame_size)
            frame_gray = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2GRAY)

            # 3-digit zero padding: filenames sort lexicographically in frame
            # order only while a clip has at most 1000 frames.
            frame_filename = f"{clip_name}_frame_{frame_num:03d}.jpg"
            frame_file = os.path.join(clip_save_path, frame_filename)
            if cv2.imwrite(frame_file, frame_gray):
                written += 1
            else:
                warnings.warn(f"Could not write frame: {frame_file}")

            frame_num += 1

        return written
    finally:
        # Release the capture even if decoding or writing raised
        cap.release()

# Extract frames for every training clip of the selected classes.
# `selected_classes` and `train_df` come from the CSV-loading cell, so the
# class list is maintained in one place only.
for class_name in selected_classes:
    class_clips = train_df.loc[train_df['label'] == class_name, 'clip_path']
    for video in tqdm(class_clips, desc=f"train/{class_name}"):
        # clip_path looks like '/train/<Class>/<clip>.avi'; keep the bare clip name
        video_filename = os.path.splitext(os.path.basename(video))[0]
        # clip_path starts with '/', so plain concatenation is used here:
        # os.path.join would discard the dataset root for an absolute component.
        extract_frames('/content/ucf101-action-recognition' + video, class_name, video_filename, frame_save_path)

In [None]:
# Root directory for the frames extracted from the validation clips.
# extract_frames() creates the directory tree itself, so no makedirs is needed.
frame_save_path = '/content/frames_val/'

# Same procedure as for the training split, reading from val_df.
# `selected_classes` comes from the CSV-loading cell.
for class_name in selected_classes:
    class_clips = val_df.loc[val_df['label'] == class_name, 'clip_path']
    for video in tqdm(class_clips, desc=f"val/{class_name}"):
        # clip_path looks like '/val/<Class>/<clip>.avi'; keep the bare clip name
        video_filename = os.path.splitext(os.path.basename(video))[0]
        # clip_path starts with '/', so concatenate instead of os.path.join
        extract_frames('/content/ucf101-action-recognition' + video, class_name, video_filename, frame_save_path)

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset loader
class VideoFrameDataset(Dataset):
    """Clips stored as frame images, served as (input, target) frame stacks.

    Expected layout: ``root_dir/<class>/<clip>/<frame image>``. Each item is a
    pair ``(input, target)``: ``input`` stacks the first ``input_frames`` frames
    of a clip (sorted by filename) and ``target`` stacks the ``target_frames``
    frames that follow.

    Clips with fewer than ``input_frames + target_frames`` frames are left out
    and their paths recorded in ``self.skipped``; they would otherwise produce
    short or empty stacks that break ``torch.stack`` or batch collation.

    Args:
        root_dir: Directory containing one sub-directory per class.
        transform: Callable applied to every PIL image. It has to return
            tensors of one common shape, because frames are combined with
            ``torch.stack``.
        input_frames: Number of leading frames returned as the input stack.
        target_frames: Number of following frames returned as the target stack.
    """

    def __init__(self, root_dir, transform=None, input_frames=10, target_frames=5):
        self.root_dir = root_dir
        self.transform = transform
        self.input_frames = input_frames
        self.target_frames = target_frames
        self.data = []      # (clip directory, sorted frame filenames)
        self.skipped = []   # clip directories with too few frames

        frames_needed = input_frames + target_frames
        # Sorted listings make the sample order independent of the filesystem
        for class_dir in sorted(os.listdir(root_dir)):
            class_path = os.path.join(root_dir, class_dir)
            if not os.path.isdir(class_path):
                continue  # stray file next to the class folders
            for video_dir in sorted(os.listdir(class_path)):
                video_path = os.path.join(class_path, video_dir)
                if not os.path.isdir(video_path):
                    continue  # stray file next to the clip folders
                frames = sorted(os.listdir(video_path))
                if len(frames) < frames_needed:
                    self.skipped.append(video_path)
                    continue
                self.data.append((video_path, frames))

    def __len__(self):
        """Return the number of usable clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the input and target frame stacks of clip ``idx``."""
        video_path, frames = self.data[idx]
        frames_needed = self.input_frames + self.target_frames

        # Only open the frames that are returned; the list is already sorted
        loaded = []
        for frame_name in frames[:frames_needed]:
            frame_path = os.path.join(video_path, frame_name)
            # The context manager closes the file once the frame is converted
            with Image.open(frame_path) as img:
                loaded.append(self.transform(img) if self.transform else img.copy())

        input_frames = torch.stack(loaded[:self.input_frames])
        target_frames = torch.stack(loaded[self.input_frames:])
        return input_frames, target_frames

# Define PredRNN (simplified version for frame prediction)
class PredRNN(nn.Module):
    """Stacked LSTM over flattened frames followed by a linear read-out.

    Despite the name, the network consists of a plain ``nn.LSTM`` and one
    ``nn.Linear`` layer: it consumes a sequence of flattened frames and emits
    a single flattened frame computed from the last time step.

    Args:
        input_dim: Size of one flattened input frame.
        hidden_dim: Hidden size of every LSTM layer.
        output_dim: Size of the flattened output frame.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(PredRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, input_dim) to (batch, output_dim)."""
        # Build the zero initial states on the device and in the dtype of the
        # input instead of relying on a module-level `device` variable.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)
        out, _ = self.lstm(x, (h0, c0))
        # Only the hidden state of the last time step feeds the read-out
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
# -- sequence layout
input_frames = 10   # frames handed to the model per clip
target_frames = 5   # frames following the input window
# -- model size (frames are assumed to be resized to 64x64 and flattened)
input_dim = 64 * 64
output_dim = 64 * 64
hidden_dim = 512
num_layers = 2
# -- optimisation
batch_size = 16
epochs = 30
lr = 1e-3

# Transforms
# extract_frames() already stores 64x64 grayscale JPEGs by default, so
# Grayscale and Resize act as safeguards; ToTensor converts to a tensor.
transform = transforms.Compose([
    transforms.Grayscale(),  # Convert to grayscale
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

# Load data
# Point at the absolute directories the extraction cells wrote to, so loading
# does not depend on the current working directory.
train_dataset = VideoFrameDataset(root_dir="/content/frames_raw", transform=transform, input_frames=input_frames, target_frames=target_frames)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = VideoFrameDataset(root_dir="/content/frames_val", transform=transform, input_frames=input_frames, target_frames=target_frames)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Build the network, move it to the selected device, then set up the
# mean-squared-error loss and the Adam optimizer
model = PredRNN(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Training loop
def train():
    """Train the module-level ``model`` for ``epochs`` epochs.

    Every batch is reshaped to (batch, frames, flattened pixels). The model
    output is compared, via ``criterion``, with the last frame of the target
    window only. After each epoch ``validate()`` is run and one summary line
    with the mean training loss and the validation loss is printed.
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for clip_inputs, clip_targets in tqdm(train_loader):
            # Flatten each frame into a vector
            clip_inputs = clip_inputs.view(clip_inputs.size(0), input_frames, -1).to(device)
            clip_targets = clip_targets.view(clip_targets.size(0), target_frames, -1).to(device)
            last_target = clip_targets[:, -1, :]

            optimizer.zero_grad()
            batch_loss = criterion(model(clip_inputs), last_target)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        val_loss = validate()
        mean_train_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {mean_train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Validation loop
def validate():
    """Return the mean per-batch loss of the module-level ``model`` on ``val_loader``.

    The model is switched to eval mode and gradients are disabled. As in
    training, the output is compared with the last frame of the target window.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for clip_inputs, clip_targets in val_loader:
            clip_inputs = clip_inputs.view(clip_inputs.size(0), input_frames, -1).to(device)
            clip_targets = clip_targets.view(clip_targets.size(0), target_frames, -1).to(device)

            predicted = model(clip_inputs)
            batch_losses.append(criterion(predicted, clip_targets[:, -1, :]).item())
    return sum(batch_losses, 0.0) / len(val_loader)

# Start training when this code runs as the main program; the guard keeps
# train() from being triggered if the code is imported as a module instead.
if __name__ == "__main__":
    train()


In [None]:
# Save the entire model (the module object itself, not just its weights)
torch.save(model, "predrnn_model.pth")

# Save only the state dictionary; this is the file the test cell reads back
# with load_state_dict()
torch.save(model.state_dict(), "predrnn_model_state.pth")


In [None]:
import torch
import os
from torchvision import transforms
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Define the test function
def test_model(model, test_loader, output_dir="predicted_frames", frame_shape=(64, 64)):
    """Run ``model`` over ``test_loader``, save each predicted frame and return them.

    Args:
        model: Network mapping (batch, frames, flattened pixels) to one
            flattened frame per sequence.
        test_loader: Iterable yielding ``(inputs, targets)`` batches; the
            targets are not used here.
        output_dir: Directory receiving the PNG files (created if missing).
        frame_shape: (height, width) used to reshape each flat prediction.

    Returns:
        A list with one 2-D numpy array per predicted frame, in the order the
        frames were saved.
    """
    model.eval()
    os.makedirs(output_dir, exist_ok=True)
    height, width = frame_shape
    predicted_frames = []

    with torch.no_grad():
        for batch_idx, (inputs, _targets) in enumerate(test_loader):
            inputs = inputs.view(inputs.size(0), input_frames, -1).to(device)  # Flatten input frames
            # The model emits one flattened frame per input sequence
            batch_frames = model(inputs).view(-1, height, width).cpu().numpy()

            # Save the predicted frames
            for i, frame in enumerate(batch_frames):
                frame_output_dir = os.path.join(output_dir, f"batch_{batch_idx}_frame_{i}.png")
                plt.imsave(frame_output_dir, frame, cmap="gray")
                print(f"Saved predicted frame: {frame_output_dir}")
                predicted_frames.append(frame)

    return predicted_frames

# Load the trained model
model = PredRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).to(device)
# map_location lets weights saved on a GPU runtime load on a CPU-only one
model.load_state_dict(torch.load("predrnn_model_state.pth", map_location=device))
print("Model loaded successfully.")

# Test the model
# Read from the absolute directory the validation frames were extracted to
test_dataset = VideoFrameDataset(root_dir="/content/frames_val", transform=transform, input_frames=input_frames, target_frames=target_frames)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Test one sequence at a time
# Keep the returned frames at module level so later cells can visualise them
predictions = test_model(model, test_loader)


In [None]:
# Example visualization
# Shows the first entry of `predictions` as a grayscale image.
# NOTE(review): `predictions` must already exist at module level when this
# cell runs — confirm the test cell provides it.
plt.imshow(predictions[0], cmap="gray")
plt.title("Predicted Frame")
plt.show()
