**I think we can start here with preprocessing and building up a pipeline for classifying with a traditional model from pose estimation**

# Classes

- Walking
- Standing
- Fast walking / jogging?
- Running

In [5]:
import kagglehub


# Fetch the most recent version of the HMDB51 dataset through kagglehub and
# keep the directory it reports back in `path`.
path = kagglehub.dataset_download("easonlll/hmdb51")

print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\marte\.cache\kagglehub\datasets\easonlll\hmdb51\versions\1


In [None]:
import os
import glob
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm # Good practice for progress bars
import torch.nn as nn
from torch.utils.data import random_split
from torchvision.models import mobilenet_v3_large, MobileNet_V3_Large_Weights

# --- 1. Configuration ---
# Directory that holds one sub-folder per action class (path under /kaggle/input).
ROOT_DIR = "/kaggle/input/hmdb51/HMDB51"
# Class folder names to load; the dataset uses each name's list position as its label.
TARGET_CLASSES = ["walk", "stand", "run"]
# Number of frames sampled from every clip.
FRAMES_PER_CLIP = 8

AttributeError: partially initialized module 'torch' has no attribute 'version' (most likely due to a circular import)

In [None]:
# --- 2. Custom HMDB51 Dataset Class ---
class HMDB51FrameDataset(Dataset):
    """
    A minimal PyTorch Dataset for HMDB51 clips stored as folders of JPG frames.

    Expected layout: ``root_dir/<class_name>/<sample_dir>/*.jpg``.

    Each item is built by sampling ``n_frames`` frames uniformly across the
    clip, transforming every frame, and averaging the results into a single
    image tensor so that a 2D CNN receives one "representative" image per clip.

    Args:
        root_dir: Directory containing one sub-directory per class.
        class_list: Class directory names to load; the position of a name in
            this sequence is its integer label.
        n_frames: Number of frames sampled per clip (must be >= 1).
        transform: Callable applied to each PIL frame. It must return a
            tensor because the frames are stacked afterwards. When ``None``,
            frames are converted with ``transforms.ToTensor()``.

    Raises:
        ValueError: If ``n_frames`` is smaller than 1.
    """
    def __init__(self, root_dir, class_list, n_frames=8, transform=None):
        if n_frames < 1:
            raise ValueError(f"n_frames must be >= 1, got {n_frames}")

        self.root_dir = root_dir
        self.class_list = class_list
        self.n_frames = n_frames
        self.transform = transform
        self.data_samples = []

        # Map class names to integer labels (0 .. len(class_list) - 1)
        self.class_to_idx = {cls: i for i, cls in enumerate(class_list)}
        print("Mapping directory names to labels:", self.class_to_idx)

        # --- Pre-cache all sample directories ---
        # Only directory names are read here; frames are opened lazily in
        # __getitem__.
        for class_name in class_list:
            class_path = os.path.join(root_dir, class_name)
            label = self.class_to_idx[class_name]
            # glob returns entries in arbitrary order; sorting keeps the
            # sample indices (and any index-based split) stable across runs.
            for sample_dir in sorted(glob.glob(os.path.join(class_path, '*'))):
                if os.path.isdir(sample_dir):
                    self.data_samples.append((sample_dir, label))

        print(f"Loaded {len(self.data_samples)} total video samples.")

    def __len__(self):
        """Return the number of clip directories found at construction time."""
        return len(self.data_samples)

    def _load_frame(self, frame_path):
        """Open one frame as RGB and turn it into a tensor via the transform."""
        # The context manager closes the file handle right away; convert()
        # returns an independent in-memory copy, so it stays usable afterwards.
        with Image.open(frame_path) as raw_image:
            img = raw_image.convert('RGB')
        if self.transform is not None:
            return self.transform(img)
        # Without a transform the PIL image could not be stacked below.
        return transforms.ToTensor()(img)

    def __getitem__(self, idx):
        """
        Return ``(image, label)`` for the clip at position ``idx``.

        ``image`` is the element-wise mean of the sampled, transformed frames
        (shape of a single transformed frame); ``label`` is the integer class
        index.

        Raises:
            RuntimeError: If the clip directory contains no ``*.jpg`` files.
        """
        sample_dir, label = self.data_samples[idx]

        # Find all JPG frames in the sample directory.
        # NOTE(review): the sort is lexicographic, so temporal order relies on
        # zero-padded frame names -- confirm against the dataset files.
        frame_files = sorted(glob.glob(os.path.join(sample_dir, '*.jpg')))

        if not frame_files:
            raise RuntimeError(f"No frames found in directory: {sample_dir}")

        # --- Uniform frame sampling ---
        # linspace yields floats; round before casting so a value that lands
        # just below an integer is not truncated onto the previous frame.
        # Clips shorter than n_frames simply repeat some frames.
        positions = torch.linspace(0, len(frame_files) - 1, self.n_frames)
        indices = positions.round().long().tolist()

        frames = [self._load_frame(frame_files[i]) for i in indices]

        # Collapse the clip into one image for a 2D CNN: stack to (T, C, H, W)
        # and average over the time axis. A 3D-CNN/TSN setup would return the
        # stacked clip instead.
        clip_tensor = torch.stack(frames)  # (T, C, H, W)
        final_input = torch.mean(clip_tensor, dim=0)  # (C, H, W)

        return final_input, label

In [None]:
# --- 3. Instantiate Dataset and DataLoader ---

# Standard normalization for models pre-trained on ImageNet
image_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224), # MobileNet input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create the dataset instance
hmdb_dataset = HMDB51FrameDataset(
    root_dir=ROOT_DIR,
    class_list=TARGET_CLASSES,
    n_frames=FRAMES_PER_CLIP,
    transform=image_transforms
)

DATASET_SIZE = len(hmdb_dataset)
train_size = int(0.8 * DATASET_SIZE) # 80% for training
test_size = DATASET_SIZE - train_size # 20% for testing

# Split the dataset randomly. A seeded generator makes the train/test
# membership reproducible across kernel restarts.
# NOTE(review): the split is per clip and not stratified; if several clips
# come from the same source video they may end up on both sides -- confirm
# whether a group-aware split is needed.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    hmdb_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print(f"Data Split: Training ({train_size}) / Testing ({test_size})")


# Create the DataLoaders for batching and shuffling
BATCH_SIZE = 32
# Worker processes used by each loader; set to 0 to load in the main process.
NUM_WORKERS = 4

# 1. DataLoader for Training (SHUFFLED)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,           # Shuffle for training is CRITICAL
    num_workers=NUM_WORKERS
)

# 2. DataLoader for Testing (NOT SHUFFLED)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,          # Do not shuffle test data
    num_workers=NUM_WORKERS
)

print("Created two DataLoaders: train_loader and test_loader.")

print(f"\nDataLoader ready to serve batches of size {BATCH_SIZE}.")

Mapping directory names to labels: {'walk': 0, 'stand': 1, 'run': 2}
Loaded 934 total video samples.

DataLoader ready to serve batches of size 32.


In [None]:
# 1. Define the target classes
# These are the three actions to classify (must match the configuration cell):
TARGET_CLASSES = ["walk", "stand", "run"]
NUM_CLASSES = len(TARGET_CLASSES)

print(f"Targeting {NUM_CLASSES} classes: {TARGET_CLASSES}")

# --- Load the Pre-trained Model ---
## We use the 'large' version of MobileNetV3 with the IMAGENET1K_V2 weights,
## which carry the ImageNet knowledge we want to transfer.
try:
    # 2. Load the model pre-trained on ImageNet
    model = mobilenet_v3_large(weights=MobileNet_V3_Large_Weights.IMAGENET1K_V2)
    print("\n Successfully loaded MobileNetV3-Large, pre-trained on ImageNet.")

    # --- Transfer Learning Modification ---
    # The classification head of MobileNetV3 is stored in the 'classifier' attribute.
    # Its last entry (index 3) is an nn.Linear layer that outputs the 1000 ImageNet classes.

    # 3. Get the input features of the last layer
    # Read it from the model instead of hard-coding it (usually 1280 for v3 large).
    in_features = model.classifier[3].in_features

    # 4. Replace the final layer with a new one sized for NUM_CLASSES
    # We keep the model's structure but replace the last layer of the head.
    model.classifier[3] = nn.Linear(in_features, NUM_CLASSES)
    print(f"   Modified final classification layer from 1000 outputs to {NUM_CLASSES}.")

    # --- Freeze the Backbone for Efficient Fine-Tuning ---
    # 5. Freeze every parameter first. This speeds up training and keeps the
    # pre-trained weights from being destroyed early on.
    for param in model.parameters():
        param.requires_grad = False

    # 6. Unfreeze the classification head. This re-enables the WHOLE
    # `classifier` module, not just the replaced final layer (the recorded run
    # reports 1,233,923 trainable parameters).
    for param in model.classifier.parameters():
        param.requires_grad = True

    # Check the model structure and parameter count
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"\nModel Total Parameters: {total_params:,}")
    print(f"Model Trainable Parameters: {trainable_params:,} (classifier head only)")
    print("Model is ready for fine-tuning on the HMDB51 data.")

except Exception as e:
    print(f"\n An error occurred during model loading: {e}")
    # Re-raise: continuing would leave `model` undefined (or half-configured)
    # and only surface later as a confusing NameError in the training cell.
    raise

Targeting 3 classes: ['walk', 'stand', 'run']

✅ Successfully loaded MobileNetV3-Large, pre-trained on ImageNet.
   Modified final classification layer from 1000 outputs to 3.

Model Total Parameters: 4,205,875
Model Trainable Parameters: 1,233,923 (Only the new layer)
Model is ready for fine-tuning on your HMDB51 data.


In [19]:
# Prefer CUDA when PyTorch can see a GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device.type)

cpu


# Training

In [None]:
# --- 1. Define Training Parameters ---

# Loss Function: Cross-Entropy is standard for classification
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over the trainable parameters only. Frozen backbone weights
# (requires_grad == False) are filtered out so the optimizer holds exactly the
# parameters that are meant to be updated.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_parameters, lr=0.001)

model.to(device)

EPOCHS = 5 # Start with a small number of passes

# --- 2. The Training Loop ---
print("\n--- Starting Training Loop ---")
for epoch in range(EPOCHS):
    # NOTE(review): train() also switches the frozen backbone's BatchNorm and
    # Dropout layers to training mode, so BatchNorm running statistics keep
    # updating -- consider model.features.eval() if the backbone must stay fixed.
    model.train() # Set model to training mode
    running_loss = 0.0

    # Wrap train_loader with tqdm for a progress bar
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):

        # 1. Move data to the appropriate device (GPU or CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # 2. Zero the parameter gradients (clear old gradients)
        optimizer.zero_grad()

        # 3. Forward pass (get model predictions)
        outputs = model(inputs)

        # 4. Calculate the loss
        loss = criterion(outputs, labels)

        # 5. Backward pass (calculate gradients)
        loss.backward()

        # 6. Optimize (update the trainable weights)
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch average below is correct even when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)

    # Calculate and print epoch statistics
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"\nEpoch {epoch+1}/{EPOCHS} complete. Average Loss: {epoch_loss:.4f}")

print("\n Training complete!")

NameError: name 'nn' is not defined