# Installing Dependencies

In [1]:
!pip install opencv-python-headless scikit-learn -q

#  Import Libraries

In [2]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Configuration

In [3]:
CONFIG = {
    'SEQUENCE_LENGTH': 20,
    'IMG_HEIGHT': 112,
    'IMG_WIDTH': 112,
    'BATCH_SIZE': 16,
    'EPOCHS': 12,
    'LEARNING_RATE': 0.003,
    'LSTM_HIDDEN': 256,
    'LSTM_LAYERS': 2,
    'DROPOUT': 0.3,
    'NUM_WORKERS': 0
}

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")
print(f"Available GPUs: {torch.cuda.device_count()}")

for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Using device: cuda
Available GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


# Add Dataset as Input

In [4]:
DATA_PATH = '/kaggle/input/ucf101-action-recognition/train'

if not os.path.exists(DATA_PATH):
    print("ERROR: Dataset not found")
else:
    print(f"Dataset found at: {DATA_PATH}")

Dataset found at: /kaggle/input/ucf101-action-recognition/train


# Video Processing Functions

In [5]:
def extract_frames(video_path, max_frames=20):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    if total_frames == 0:
        cap.release()
        return None
    
    step = max(1, total_frames // max_frames)
    
    for i in range(0, total_frames, step):
        if len(frames) >= max_frames:
            break
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.resize(frame, (CONFIG['IMG_WIDTH'], CONFIG['IMG_HEIGHT']))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    
    cap.release()
    
    while len(frames) < max_frames:
        frames.append(frames[-1])
    
    return np.array(frames)

def normalize_frames(frames):
    return frames / 255.0

# Prepare Dataset

In [6]:
train_csv = pd.read_csv('/kaggle/input/ucf101-action-recognition/train.csv')
val_csv = pd.read_csv('/kaggle/input/ucf101-action-recognition/val.csv')

train_df = pd.concat([train_csv, val_csv], ignore_index=True)

video_paths = []
labels = []

base_path = '/kaggle/input/ucf101-action-recognition'

for idx, row in train_df.iterrows():
    video_path = base_path + row['clip_path']
    if os.path.exists(video_path):
        video_paths.append(video_path)
        labels.append(row['label'])

print(f"Total videos: {len(video_paths)}")
print(f"Total classes: {len(set(labels))}")

Total videos: 11728
Total classes: 101


# Encode Labels

In [7]:
selected_classes = ['Basketball', 'Bowling', 'GolfSwing', 'TennisSwing', 
                    'Drumming', 'PushUps', 'PullUps', 'JumpingJack']

filtered_videos = []
filtered_labels = []

for vid, lbl in zip(video_paths, labels):
    if lbl in selected_classes:
        filtered_videos.append(vid)
        filtered_labels.append(lbl)

video_paths = filtered_videos
labels = filtered_labels

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

print(f"Number of classes: {num_classes}")
print(f"Total videos: {len(video_paths)}")
print(f"Classes: {list(label_encoder.classes_)}")

Number of classes: 8
Total videos: 1055
Classes: [np.str_('Basketball'), np.str_('Bowling'), np.str_('Drumming'), np.str_('GolfSwing'), np.str_('JumpingJack'), np.str_('PullUps'), np.str_('PushUps'), np.str_('TennisSwing')]


# Train-Test Split with Sampling

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    video_paths, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 844
Testing samples: 211


# Dataset Class

In [9]:
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, sequence_length=20):
        self.video_paths = video_paths
        self.labels = labels
        self.sequence_length = sequence_length
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        
        frames = extract_frames(video_path, self.sequence_length)
        
        if frames is None:
            frames = np.zeros((self.sequence_length, CONFIG['IMG_HEIGHT'], CONFIG['IMG_WIDTH'], 3))
        
        frames = normalize_frames(frames)
        frames = torch.FloatTensor(frames).permute(0, 3, 1, 2)
        
        return frames, label

# Create DataLoaders

In [10]:
train_dataset = VideoDataset(X_train, y_train, CONFIG['SEQUENCE_LENGTH'])
test_dataset = VideoDataset(X_test, y_test, CONFIG['SEQUENCE_LENGTH'])

train_loader = DataLoader(
    train_dataset, 
    batch_size=CONFIG['BATCH_SIZE'], 
    shuffle=True, 
    num_workers=CONFIG['NUM_WORKERS'],
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=CONFIG['BATCH_SIZE'], 
    shuffle=False, 
    num_workers=CONFIG['NUM_WORKERS'],
    pin_memory=True
)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 53
Test batches: 14


# CNN-LSTM Model Architecture

In [11]:
class CNNLSTMModel(nn.Module):
    def __init__(self, num_classes, lstm_hidden=512, lstm_layers=2, dropout=0.4):
        super(CNNLSTMModel, self).__init__()
        
        resnet = models.resnet50(pretrained=True)
        self.cnn = nn.Sequential(*list(resnet.children())[:-1])
        
        for param in list(self.cnn.parameters())[:-20]:
            param.requires_grad = False
        
        self.lstm = nn.LSTM(
            input_size=2048,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0,
            bidirectional=True
        )
        
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_hidden * 2, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, x):
        batch_size, seq_len, c, h, w = x.size()
        
        x = x.view(batch_size * seq_len, c, h, w)
        x = self.cnn(x)
        x = x.view(batch_size, seq_len, -1)
        
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        
        x = self.fc(x)
        return x

model = CNNLSTMModel(num_classes, CONFIG['LSTM_HIDDEN'], CONFIG['LSTM_LAYERS'], CONFIG['DROPOUT'])
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 191MB/s] 


Model parameters: 29,941,064
Trainable parameters: 15,362,312


# Setup Parallel GPU Training

In [12]:
torch.backends.cudnn.benchmark = True

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=CONFIG['LEARNING_RATE'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

Using 2 GPUs


# Training Function

In [13]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for frames, labels in tqdm(loader, desc="Training"):
        frames = frames.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        
        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    return running_loss / len(loader), 100. * correct / total

#  Validation Function

In [14]:
def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for frames, labels in tqdm(loader, desc="Validation"):
            frames = frames.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            outputs = model(frames)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    return running_loss / len(loader), 100. * correct / total

# Training Loop

In [15]:
best_acc = 0.0

for epoch in range(CONFIG['EPOCHS']):
    print(f"\nEpoch {epoch+1}/{CONFIG['EPOCHS']}")
    
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
    val_loss, val_acc = validate(model, test_loader, criterion, DEVICE)
    
    scheduler.step(val_loss)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'label_encoder': label_encoder
        }, 'best_model.pth')
        print(f"Model saved with accuracy: {val_acc:.2f}%")

print(f"\nBest Validation Accuracy: {best_acc:.2f}%")


Epoch 1/12


Training: 100%|██████████| 53/53 [03:45<00:00,  4.25s/it]
Validation: 100%|██████████| 14/14 [00:56<00:00,  4.00s/it]


Train Loss: 1.6370 | Train Acc: 39.34%
Val Loss: 1.2734 | Val Acc: 45.50%
Model saved with accuracy: 45.50%

Epoch 2/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.15s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.87s/it]


Train Loss: 1.3722 | Train Acc: 52.25%
Val Loss: 0.9737 | Val Acc: 57.82%
Model saved with accuracy: 57.82%

Epoch 3/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.15s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.86s/it]


Train Loss: 1.1767 | Train Acc: 58.53%
Val Loss: 0.8075 | Val Acc: 69.67%
Model saved with accuracy: 69.67%

Epoch 4/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.14s/it]
Validation: 100%|██████████| 14/14 [00:53<00:00,  3.85s/it]


Train Loss: 1.1015 | Train Acc: 59.60%
Val Loss: 0.6184 | Val Acc: 79.15%
Model saved with accuracy: 79.15%

Epoch 5/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.14s/it]
Validation: 100%|██████████| 14/14 [00:53<00:00,  3.86s/it]


Train Loss: 1.0008 | Train Acc: 64.45%
Val Loss: 0.6814 | Val Acc: 72.51%

Epoch 6/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.14s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.86s/it]


Train Loss: 0.9328 | Train Acc: 65.76%
Val Loss: 0.7495 | Val Acc: 67.30%

Epoch 7/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.14s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.86s/it]


Train Loss: 0.9276 | Train Acc: 65.88%
Val Loss: 0.5964 | Val Acc: 76.30%

Epoch 8/12


Training: 100%|██████████| 53/53 [03:41<00:00,  4.19s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.88s/it]


Train Loss: 0.8631 | Train Acc: 67.42%
Val Loss: 0.5113 | Val Acc: 78.20%

Epoch 9/12


Training: 100%|██████████| 53/53 [03:40<00:00,  4.17s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.88s/it]


Train Loss: 0.7646 | Train Acc: 70.26%
Val Loss: 0.4387 | Val Acc: 81.04%
Model saved with accuracy: 81.04%

Epoch 10/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.15s/it]
Validation: 100%|██████████| 14/14 [00:53<00:00,  3.85s/it]


Train Loss: 0.7554 | Train Acc: 71.56%
Val Loss: 0.7716 | Val Acc: 75.36%

Epoch 11/12


Training: 100%|██████████| 53/53 [03:39<00:00,  4.15s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.88s/it]


Train Loss: 0.8450 | Train Acc: 69.31%
Val Loss: 0.5047 | Val Acc: 78.20%

Epoch 12/12


Training: 100%|██████████| 53/53 [03:40<00:00,  4.16s/it]
Validation: 100%|██████████| 14/14 [00:54<00:00,  3.88s/it]


Train Loss: 0.7186 | Train Acc: 74.53%
Val Loss: 0.3971 | Val Acc: 84.83%
Model saved with accuracy: 84.83%

Best Validation Accuracy: 84.83%


# Save Final Model

In [16]:
torch.save({
    'model_state_dict': model.state_dict(),
    'label_encoder': label_encoder,
    'config': CONFIG,
    'num_classes': num_classes
}, 'action_recognition_model.pth')

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Model saved as 'action_recognition_model.pth'")
print("Label encoder saved as 'label_encoder.pkl'")

Model saved as 'action_recognition_model.pth'
Label encoder saved as 'label_encoder.pkl'


# Test Inference

In [17]:
def predict_action(video_path, model, label_encoder, device):
    model.eval()
    frames = extract_frames(video_path, CONFIG['SEQUENCE_LENGTH'])
    
    if frames is None:
        return "Error", 0.0
    
    frames = normalize_frames(frames)
    frames = torch.FloatTensor(frames).permute(0, 3, 1, 2).unsqueeze(0)
    frames = frames.to(device)
    
    with torch.no_grad():
        outputs = model(frames)
        probabilities = torch.softmax(outputs, dim=1)
        confidence, predicted = probabilities.max(1)
    
    action = label_encoder.inverse_transform([predicted.item()])[0]
    confidence = confidence.item() * 100
    
    return action, confidence

test_video = X_test[0]
action, confidence = predict_action(test_video, model, label_encoder, DEVICE)

print(f"Test Video: {os.path.basename(test_video)}")
print(f"Predicted: {action}")
print(f"Confidence: {confidence:.2f}%")

Test Video: v_GolfSwing_g02_c02.avi
Predicted: GolfSwing
Confidence: 50.49%


# Create & Download Model ZIP

In [18]:
import zipfile
from IPython.display import FileLink

zip_filename = 'action_recognition_model.zip'

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('action_recognition_model.pth')
    zipf.write('label_encoder.pkl')
    if os.path.exists('best_model.pth'):
        zipf.write('best_model.pth')

print(f"Zip created: {zip_filename}")
print(f"Size: {os.path.getsize(zip_filename) / (1024**2):.2f} MB")

FileLink(zip_filename)

Zip created: action_recognition_model.zip
Size: 318.89 MB
