# Installing Dependencies

In [1]:
!pip install opencv-python-headless scikit-learn -q

#  Import Libraries

In [2]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Configuration

In [3]:
CONFIG = {
    'SEQUENCE_LENGTH': 20,
    'IMG_HEIGHT': 112,
    'IMG_WIDTH': 112,
    'BATCH_SIZE': 8,
    'EPOCHS': 10,
    'LEARNING_RATE': 0.0001,
    'LSTM_HIDDEN': 256,
    'LSTM_LAYERS': 1,
    'DROPOUT': 0.3,
    'NUM_WORKERS': 0
}

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")
print(f"Available GPUs: {torch.cuda.device_count()}")

for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Using device: cuda
Available GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


# Add Dataset as Input

In [4]:
DATA_PATH = '/kaggle/input/ucf101-action-recognition/train'

if not os.path.exists(DATA_PATH):
    print("ERROR: Dataset not found")
else:
    print(f"Dataset found at: {DATA_PATH}")

Dataset found at: /kaggle/input/ucf101-action-recognition/train


# Video Processing Functions

In [5]:
def extract_frames(video_path, max_frames=20):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    if total_frames == 0:
        cap.release()
        return None
    
    step = max(1, total_frames // max_frames)
    
    for i in range(0, total_frames, step):
        if len(frames) >= max_frames:
            break
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.resize(frame, (CONFIG['IMG_WIDTH'], CONFIG['IMG_HEIGHT']))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    
    cap.release()
    
    while len(frames) < max_frames:
        frames.append(frames[-1])
    
    return np.array(frames)

def normalize_frames(frames):
    return frames / 255.0

# Prepare Dataset

In [6]:
train_csv = pd.read_csv('/kaggle/input/ucf101-action-recognition/train.csv')
val_csv = pd.read_csv('/kaggle/input/ucf101-action-recognition/val.csv')

train_df = pd.concat([train_csv, val_csv], ignore_index=True)

video_paths = []
labels = []

base_path = '/kaggle/input/ucf101-action-recognition'

for idx, row in train_df.iterrows():
    video_path = base_path + row['clip_path']
    if os.path.exists(video_path):
        video_paths.append(video_path)
        labels.append(row['label'])

print(f"Total videos: {len(video_paths)}")
print(f"Total classes: {len(set(labels))}")

Total videos: 11728
Total classes: 101


# Encode Labels

In [7]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

print(f"Number of classes: {num_classes}")
print(f"Sample classes: {list(label_encoder.classes_[:5])}")

Number of classes: 101
Sample classes: [np.str_('ApplyEyeMakeup'), np.str_('ApplyLipstick'), np.str_('Archery'), np.str_('BabyCrawling'), np.str_('BalanceBeam')]


# Train-Test Split with Sampling

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    video_paths, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)

sample_train = min(2000, len(X_train))
sample_test = min(400, len(X_test))

X_train = X_train[:sample_train]
y_train = y_train[:sample_train]

X_test = X_test[:sample_test]
y_test = y_test[:sample_test]

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 2000
Testing samples: 400


# Dataset Class

In [9]:
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, sequence_length=20):
        self.video_paths = video_paths
        self.labels = labels
        self.sequence_length = sequence_length
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        
        frames = extract_frames(video_path, self.sequence_length)
        
        if frames is None:
            frames = np.zeros((self.sequence_length, CONFIG['IMG_HEIGHT'], CONFIG['IMG_WIDTH'], 3))
        
        frames = normalize_frames(frames)
        frames = torch.FloatTensor(frames).permute(0, 3, 1, 2)
        
        return frames, label

# Create DataLoaders

In [10]:
train_dataset = VideoDataset(X_train, y_train, CONFIG['SEQUENCE_LENGTH'])
test_dataset = VideoDataset(X_test, y_test, CONFIG['SEQUENCE_LENGTH'])

train_loader = DataLoader(
    train_dataset, 
    batch_size=CONFIG['BATCH_SIZE'], 
    shuffle=True, 
    num_workers=CONFIG['NUM_WORKERS'],
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=CONFIG['BATCH_SIZE'], 
    shuffle=False, 
    num_workers=CONFIG['NUM_WORKERS'],
    pin_memory=True
)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 250
Test batches: 50


# CNN-LSTM Model Architecture

In [11]:
class CNNLSTMModel(nn.Module):
    def __init__(self, num_classes, lstm_hidden=256, lstm_layers=1, dropout=0.3):
        super(CNNLSTMModel, self).__init__()
        
        resnet = models.resnet50(pretrained=True)
        self.cnn = nn.Sequential(*list(resnet.children())[:-1])
        
        for param in self.cnn.parameters():
            param.requires_grad = False
        
        self.lstm = nn.LSTM(
            input_size=2048,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0,
            bidirectional=True
        )
        
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_hidden * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )
        
    def forward(self, x):
        batch_size, seq_len, c, h, w = x.size()
        
        x = x.view(batch_size * seq_len, c, h, w)
        x = self.cnn(x)
        x = x.view(batch_size, seq_len, -1)
        
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        
        x = self.fc(x)
        return x

model = CNNLSTMModel(num_classes, CONFIG['LSTM_HIDDEN'], CONFIG['LSTM_LAYERS'], CONFIG['DROPOUT'])
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 183MB/s] 


Model parameters: 28,309,413


# Setup Parallel GPU Training

In [12]:
torch.backends.cudnn.benchmark = True

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=CONFIG['LEARNING_RATE'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

Using 2 GPUs


# Training Function

In [13]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for frames, labels in tqdm(loader, desc="Training"):
        frames = frames.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        
        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    return running_loss / len(loader), 100. * correct / total

#  Validation Function

In [14]:
def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for frames, labels in tqdm(loader, desc="Validation"):
            frames = frames.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            outputs = model(frames)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    return running_loss / len(loader), 100. * correct / total

# Training Loop

In [15]:
best_acc = 0.0

for epoch in range(CONFIG['EPOCHS']):
    print(f"\nEpoch {epoch+1}/{CONFIG['EPOCHS']}")
    
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
    val_loss, val_acc = validate(model, test_loader, criterion, DEVICE)
    
    scheduler.step(val_loss)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'label_encoder': label_encoder
        }, 'best_model.pth')
        print(f"Model saved with accuracy: {val_acc:.2f}%")

print(f"\nBest Validation Accuracy: {best_acc:.2f}%")


Epoch 1/10


Training: 100%|██████████| 250/250 [10:41<00:00,  2.56s/it]
Validation: 100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


Train Loss: 4.6125 | Train Acc: 1.35%
Val Loss: 4.5858 | Val Acc: 3.25%
Model saved with accuracy: 3.25%

Epoch 2/10


Training: 100%|██████████| 250/250 [10:44<00:00,  2.58s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


Train Loss: 4.5809 | Train Acc: 2.40%
Val Loss: 4.5621 | Val Acc: 2.75%

Epoch 3/10


Training: 100%|██████████| 250/250 [10:45<00:00,  2.58s/it]
Validation: 100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


Train Loss: 4.5338 | Train Acc: 4.10%
Val Loss: 4.4776 | Val Acc: 7.00%
Model saved with accuracy: 7.00%

Epoch 4/10


Training: 100%|██████████| 250/250 [10:44<00:00,  2.58s/it]
Validation: 100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


Train Loss: 4.4560 | Train Acc: 4.95%
Val Loss: 4.3582 | Val Acc: 10.00%
Model saved with accuracy: 10.00%

Epoch 5/10


Training: 100%|██████████| 250/250 [10:52<00:00,  2.61s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


Train Loss: 4.3455 | Train Acc: 7.05%
Val Loss: 4.2204 | Val Acc: 12.75%
Model saved with accuracy: 12.75%

Epoch 6/10


Training: 100%|██████████| 250/250 [10:47<00:00,  2.59s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


Train Loss: 4.2146 | Train Acc: 8.85%
Val Loss: 4.0549 | Val Acc: 12.75%

Epoch 7/10


Training: 100%|██████████| 250/250 [10:45<00:00,  2.58s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.54s/it]


Train Loss: 4.0884 | Train Acc: 9.95%
Val Loss: 3.8879 | Val Acc: 13.50%
Model saved with accuracy: 13.50%

Epoch 8/10


Training: 100%|██████████| 250/250 [10:48<00:00,  2.60s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


Train Loss: 3.9524 | Train Acc: 11.55%
Val Loss: 3.7785 | Val Acc: 15.50%
Model saved with accuracy: 15.50%

Epoch 9/10


Training: 100%|██████████| 250/250 [10:49<00:00,  2.60s/it]
Validation: 100%|██████████| 50/50 [02:08<00:00,  2.56s/it]


Train Loss: 3.8588 | Train Acc: 12.80%
Val Loss: 3.6282 | Val Acc: 18.25%
Model saved with accuracy: 18.25%

Epoch 10/10


Training: 100%|██████████| 250/250 [10:51<00:00,  2.60s/it]
Validation: 100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


Train Loss: 3.7570 | Train Acc: 13.75%
Val Loss: 3.5467 | Val Acc: 19.00%
Model saved with accuracy: 19.00%

Best Validation Accuracy: 19.00%


# Save Final Model

In [16]:
torch.save({
    'model_state_dict': model.state_dict(),
    'label_encoder': label_encoder,
    'config': CONFIG,
    'num_classes': num_classes
}, 'action_recognition_model.pth')

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Model saved as 'action_recognition_model.pth'")
print("Label encoder saved as 'label_encoder.pkl'")

Model saved as 'action_recognition_model.pth'
Label encoder saved as 'label_encoder.pkl'


# Test Inference

In [17]:
def predict_action(video_path, model, label_encoder, device):
    model.eval()
    frames = extract_frames(video_path, CONFIG['SEQUENCE_LENGTH'])
    
    if frames is None:
        return "Error", 0.0
    
    frames = normalize_frames(frames)
    frames = torch.FloatTensor(frames).permute(0, 3, 1, 2).unsqueeze(0)
    frames = frames.to(device)
    
    with torch.no_grad():
        outputs = model(frames)
        probabilities = torch.softmax(outputs, dim=1)
        confidence, predicted = probabilities.max(1)
    
    action = label_encoder.inverse_transform([predicted.item()])[0]
    confidence = confidence.item() * 100
    
    return action, confidence

test_video = X_test[0]
action, confidence = predict_action(test_video, model, label_encoder, DEVICE)

print(f"Test Video: {os.path.basename(test_video)}")
print(f"Predicted: {action}")
print(f"Confidence: {confidence:.2f}%")

Test Video: v_Basketball_g10_c05.avi
Predicted: Bowling
Confidence: 6.75%


# Create & Download Model ZIP

In [18]:
import zipfile
from IPython.display import FileLink

zip_filename = 'action_recognition_model.zip'

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('action_recognition_model.pth')
    zipf.write('label_encoder.pkl')
    if os.path.exists('best_model.pth'):
        zipf.write('best_model.pth')

print(f"Zip created: {zip_filename}")
print(f"Size: {os.path.getsize(zip_filename) / (1024**2):.2f} MB")

FileLink(zip_filename)

Zip created: action_recognition_model.zip
Size: 228.93 MB
