In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("matthewjansen/ucf101-action-recognition")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/matthewjansen/ucf101-action-recognition?dataset_version_number=4...


100%|██████████| 6.53G/6.53G [01:04<00:00, 108MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/matthewjansen/ucf101-action-recognition/versions/4


In [3]:
import cv2
import os

def extract_frames(video_path, output_dir, fps=10):
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)

    video_fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(video_fps / fps)
    count = 0
    saved = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if count % frame_interval == 0:
            frame_filename = os.path.join(output_dir, f"frame_{saved:05d}.jpg")
            cv2.imwrite(frame_filename, frame)
            saved += 1
        count += 1

    cap.release()


In [4]:
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np

# Load pre-trained ResNet
resnet = models.resnet50(pretrained=True)
resnet = resnet.eval()
feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove final classifier layer

# Transformation for input images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

def extract_features_from_frames(frame_dir):
    features = []
    for img_name in sorted(os.listdir(frame_dir)):
        img_path = os.path.join(frame_dir, img_name)
        img = Image.open(img_path).convert("RGB")
        input_tensor = transform(img).unsqueeze(0)  # Shape: (1, 3, 224, 224)
        with torch.no_grad():
            output = feature_extractor(input_tensor)
        features.append(output.squeeze().numpy())  # Shape: (2048,)
    return np.array(features)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 165MB/s]


In [5]:
from tqdm import tqdm

# Where to store intermediate frames and final features
frame_root = "extracted_frames"
feature_root = "extracted_features"
os.makedirs(frame_root, exist_ok=True)
os.makedirs(feature_root, exist_ok=True)

# Supported video formats
video_extensions = [".avi", ".mp4"]

# Traverse UCF101 video files
for root, _, files in os.walk(path):
    for file in tqdm(files, desc="Processing videos"):
        if any(file.endswith(ext) for ext in video_extensions):
            video_path = os.path.join(root, file)
            class_name = os.path.basename(os.path.dirname(video_path))
            video_name = os.path.splitext(file)[0]

            # Create paths
            frame_dir = os.path.join(frame_root, class_name, video_name)
            feature_path = os.path.join(feature_root, class_name)
            os.makedirs(feature_path, exist_ok=True)

            try:
                # Step 1: Extract frames
                extract_frames(video_path, frame_dir, fps=10)

                # Step 2: Extract features
                features = extract_features_from_frames(frame_dir)

                # Step 3: Save features
                np.save(os.path.join(feature_path, f"{video_name}.npy"), features)

            except Exception as e:
                print(f"❌ Error processing {video_path}: {e}")


Processing videos: 100%|██████████| 3/3 [00:00<00:00, 13148.29it/s]
Processing videos: 0it [00:00, ?it/s]
Processing videos: 100%|██████████| 16/16 [04:46<00:00, 17.90s/it]
Processing videos: 100%|██████████| 18/18 [03:40<00:00, 12.27s/it]
Processing videos: 100%|██████████| 21/21 [06:51<00:00, 19.60s/it]
Processing videos:   0%|          | 0/16 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np

class VideoFeatureDataset(Dataset):
    def __init__(self, feature_root, class_names, max_seq_len=100):
        self.samples = []
        self.class_to_idx = {cls: idx for idx, cls in enumerate(class_names)}
        self.max_seq_len = max_seq_len

        for cls in class_names:
            cls_dir = os.path.join(feature_root, cls)
            for file in os.listdir(cls_dir):
                if file.endswith(".npy"):
                    self.samples.append((os.path.join(cls_dir, file), self.class_to_idx[cls]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        features = np.load(path)

        # Pad or truncate to fixed sequence length
        if features.shape[0] < self.max_seq_len:
            pad = np.zeros((self.max_seq_len - features.shape[0], features.shape[1]))
            features = np.vstack((features, pad))
        else:
            features = features[:self.max_seq_len]

        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [7]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim=2048, hidden_dim=256, num_layers=2, num_classes=4):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = hn[-1]  # last hidden state
        return self.fc(out)


In [8]:
from sklearn.model_selection import train_test_split

# Define paths and class names
feature_root = "extracted_features"
class_names = sorted(os.listdir(feature_root))

# Load dataset
dataset = VideoFeatureDataset(feature_root, class_names, max_seq_len=100)

# Split train/test
indices = list(range(len(dataset)))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
train_set = torch.utils.data.Subset(dataset, train_idx)
test_set = torch.utils.data.Subset(dataset, test_idx)

train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
test_loader = DataLoader(test_set, batch_size=8)


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=2048, hidden_dim=256, num_layers=2, num_classes=len(class_names)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}")


Epoch [1/20], Loss: 7.9138
Epoch [2/20], Loss: 7.0605
Epoch [3/20], Loss: 6.2441
Epoch [4/20], Loss: 5.4064
Epoch [5/20], Loss: 4.7199
Epoch [6/20], Loss: 4.4284
Epoch [7/20], Loss: 3.9887
Epoch [8/20], Loss: 3.9138
Epoch [9/20], Loss: 3.7758
Epoch [10/20], Loss: 3.3259
Epoch [11/20], Loss: 3.4335
Epoch [12/20], Loss: 3.2261
Epoch [13/20], Loss: 2.7954
Epoch [14/20], Loss: 2.7060
Epoch [15/20], Loss: 2.0012
Epoch [16/20], Loss: 1.7030
Epoch [17/20], Loss: 1.2244
Epoch [18/20], Loss: 1.2627
Epoch [19/20], Loss: 0.7905
Epoch [20/20], Loss: 0.6609


In [28]:
## Save the trained model
torch.save(model.state_dict(), "lstm_model.pth")
print("Model saved as 'lstm_model.pth'")


Model saved as 'lstm_model.pth'


In [30]:
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMClassifier(torch.nn.Module):
    def __init__(self, input_dim=2048, hidden_dim=256, num_layers=2, num_classes=4):
        super(LSTMClassifier, self).__init__()
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

model = LSTMClassifier(input_dim=2048, hidden_dim=256, num_layers=2, num_classes=len(os.listdir("extracted_features")))
model.load_state_dict(torch.load("/content/lstm_model.pth", map_location=device))
model.to(device)
model.eval()


resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1]).to(device)
resnet.eval()


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


class_names = sorted(os.listdir("extracted_features"))


def classify_single_image(image_path, seq_len=100):
    img = Image.open(image_path).convert("RGB")
    input_tensor = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        feature = resnet(input_tensor).squeeze()


    sequence = feature.repeat(seq_len, 1).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(sequence)
        predicted = torch.argmax(output, dim=1).item()
        return class_names[predicted]


image_path = "/content/frame_00000.jpg"
predicted_class = classify_single_image(image_path)
print(f"Predicted Class: {predicted_class}")



Predicted Class: GolfSwing
