**I think we can start here: preprocess the data and build up a pipeline that classifies the activity with a traditional model on top of pose-estimation features.**

# Classes

- Walking
- Standing
- Fast walking / jogging?
- Running

In [3]:
# Fetch the HMDB51 dataset via kagglehub and remember where it landed locally.
import kagglehub

KAGGLE_DATASET_HANDLE = "easonlll/hmdb51"

# dataset_download returns the location that is printed below as the dataset path.
path = kagglehub.dataset_download(KAGGLE_DATASET_HANDLE)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'hmdb51' dataset.
Path to dataset files: /kaggle/input/hmdb51


# Imports and Configuration

In [9]:
import os
import cv2
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, applications

# Folder inside the downloaded dataset that holds one sub-folder per class.
# Built with os.path.join (as the discovery code does) rather than string
# concatenation, so a trailing separator on `path` cannot produce "//".
DATASET_PATH = os.path.join(path, "HMDB51")

# Class folder names to load; a class's position in this list is its label id.
TARGET_CLASSES = ["run", "walk", "stand"]
FRAME_COUNT = 16       # Number of frames per video to sample
IMAGE_SIZE = (224, 224)  # presumably the (height, width) frames are resized to — TODO confirm
BATCH_SIZE = 8         # Keep small for Colab (8 or 16)

# Data Discovery

In [10]:
def gather_sample_dirs(dataset_root, target_classes):
    """Collect sample directories and integer labels for the requested classes.

    Every class is looked up as a sub-folder of ``dataset_root`` and every
    sub-folder inside a class folder is treated as one sample (in this dataset
    version each video is stored as a folder of images). Entries that are not
    directories are ignored, and a missing class folder only produces a
    warning.

    Args:
        dataset_root: Directory that contains one folder per class.
        target_classes: Sequence of class folder names. A class's label id is
            its index in this sequence.

    Returns:
        tuple: ``(sample_paths, sample_labels)`` as NumPy arrays of equal
        length. Samples are grouped by class in ``target_classes`` order and
        sorted by folder name within each class, so the result does not
        depend on filesystem listing order.
    """
    sample_paths = []
    sample_labels = []
    class_map = {cls: i for i, cls in enumerate(target_classes)}

    print(f"Scanning directory: {dataset_root}")

    for class_name in target_classes:
        class_dir = os.path.join(dataset_root, class_name)

        if not os.path.isdir(class_dir):
            print(f"Warning: Class folder '{class_name}' NOT found at {class_dir}")
            continue

        label_id = class_map[class_name]

        # os.listdir returns entries in arbitrary order. Sort them so the
        # sample order, and any seeded split performed on it, is reproducible.
        for item in sorted(os.listdir(class_dir)):
            item_path = os.path.join(class_dir, item)
            if os.path.isdir(item_path):
                sample_paths.append(item_path)
                sample_labels.append(label_id)

    sample_paths = np.array(sample_paths)
    # Explicit integer dtype: an empty list would otherwise become float64.
    sample_labels = np.array(sample_labels, dtype=int)

    print("\n--- RESULTS ---")
    print(f"Total samples found: {len(sample_paths)}")
    if len(sample_paths) > 0:
        print(f"Example path: {sample_paths[0]}")
        print(f"Example label: {sample_labels[0]} ({target_classes[sample_labels[0]]})")

    return sample_paths, sample_labels

# Execute the search: X_all receives the sample directory paths and y_all the
# matching integer labels (each label is an index into TARGET_CLASSES).
X_all, y_all = gather_sample_dirs(DATASET_PATH, TARGET_CLASSES)

Scanning directory: /kaggle/input/hmdb51/HMDB51

--- RESULTS ---
Total samples found: 934
Example path: /kaggle/input/hmdb51/HMDB51/run/THE_PROTECTOR_run_f_cm_np1_le_med_42
Example label: 0 (run)


# Split data

In [None]:
# With no samples there is nothing to split. Raise instead of printing so that
# execution actually stops here rather than continuing with X_train / X_val
# left undefined.
if len(X_all) == 0:
    raise RuntimeError("STOP: No data found. Check your path in Cell 1.")

# Stratify keeps the class proportions of y_all the same in the training and
# validation splits (it does not equalize the number of samples per class).
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models, applications

def build_feature_pooling_model(num_classes=3, frame_count=16, image_size=224):
    """
    Build and compile a "bag of frames" video classifier.

    A frozen, ImageNet-initialised EfficientNetB0 is applied to every frame of
    the clip. The per-frame feature maps are averaged spatially, the resulting
    per-frame vectors are averaged over time, and a single softmax layer
    classifies the clip from that summary vector.

    Args:
        num_classes: Number of units in the softmax output layer.
        frame_count: Number of frames in each input clip.
        image_size: Side length of the square RGB frames.

    Returns:
        A Keras model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    frame_shape = (image_size, image_size, 3)

    # Clip input: (Batch, Frames, Height, Width, Channels)
    clip = layers.Input(shape=(frame_count,) + frame_shape)

    # Per-frame feature extractor; frozen so only the classifier head trains.
    backbone = applications.EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=frame_shape,
    )
    backbone.trainable = False

    # Run the backbone on each frame independently.
    # With the default 224x224 frames this yields (Batch, Frames, 7, 7, 1280).
    per_frame_maps = layers.TimeDistributed(backbone)(clip)

    # Collapse each frame's spatial grid -> (Batch, Frames, 1280)
    spatial_pool = layers.TimeDistributed(layers.GlobalAveragePooling2D())
    per_frame_vectors = spatial_pool(per_frame_maps)

    # Average over the frame axis (used here in place of an LSTM)
    # -> one vector per clip, (Batch, 1280)
    clip_vector = layers.GlobalAveragePooling1D()(per_frame_vectors)

    # Softmax classification head.
    class_probs = layers.Dense(num_classes, activation='softmax')(clip_vector)

    classifier = models.Model(inputs=clip, outputs=class_probs)
    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Instantiate
# Take the sizes from the notebook configuration instead of repeating literals,
# so the output layer and input shape cannot drift from TARGET_CLASSES,
# FRAME_COUNT and IMAGE_SIZE. The builder takes a single side length, so this
# relies on IMAGE_SIZE being square.
model = build_feature_pooling_model(
    num_classes=len(TARGET_CLASSES),
    frame_count=FRAME_COUNT,
    image_size=IMAGE_SIZE[0],
)
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
