# LSTM-CNN Model for ASL

This notebook implements a collaborative pipeline for data acquisition, preprocessing, model definition, training, and evaluation to recognize American Sign Language (ASL) gestures.


## Setup

Mount Google Drive, install the necessary packages, and import the required libraries and frameworks.

In [None]:
# Mount Google Drive at /content/drive (Colab-only module).
# NOTE(review): later cells use relative "drive/MyDrive/..." paths, which
# presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install kaggle

In [None]:
from tensorflow import keras
import tensorflow as tf
import torch
import json
import os
import pathlib
import cv2
import numpy as np
import subprocess
import zipfile
import shutil
import random as rnd
from tqdm import tqdm
from torch.utils.data import Dataset
import secrets
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
from copy import deepcopy


## Data Acquisition

Download and extract the dataset from Kaggle (when developing on Google Colab, place `kaggle.json` in Google Drive).


In [None]:
# Install the Kaggle API token: copy kaggle.json from Google Drive into
# /root/.kaggle/kaggle.json (the path secured by the chmod below) and make it
# readable by the owner only. The token is copied, not moved, so the Drive copy
# survives runtime resets.
os.makedirs("/root/.kaggle", exist_ok=True)
shutil.copy("drive/MyDrive/kaggle.json", "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [None]:
# Unpack the WLASL archive into artifact/wlasl-processed
archive = zipfile.ZipFile("artifact/wlasl-processed.zip", 'r')
with archive:
    archive.extractall("artifact/wlasl-processed")


# Video Processing

Video pre-processing and Frame Extraction

In [None]:
def videos_process(output_folder: str, json_fp: str, videos_folder: str):
    """
    Copy raw video files into ``<output_folder>/<split>/<gloss>/`` folders.

    Instances whose ``.mp4`` file is missing from ``videos_folder`` are reported
    and skipped; no destination folder is created for them, so the split tree
    only contains glosses that actually have at least one video.

    Args:
        output_folder: top-level folder to receive split directories.
        json_fp: path to WLASL_v0.3.json metadata file.
        videos_folder: folder containing original .mp4 files.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(json_fp, "r") as file:
        data = json.load(file)

    for gloss_data in data:
        gloss_name = gloss_data["gloss"]

        for instance in gloss_data["instances"]:
            video_id = instance["video_id"]
            split = instance["split"]  # 'train', 'val', or 'test'

            source_path = os.path.join(videos_folder, f"{video_id}.mp4")
            if not os.path.exists(source_path):
                print(f"Video not found: {source_path}")
                continue

            dest_folder = os.path.join(output_folder, split, gloss_name)
            dest_path = os.path.join(dest_folder, f"{video_id}.mp4")

            # Create the gloss folder only once we know there is a file to put in it;
            # empty gloss folders would otherwise show up as classes downstream.
            os.makedirs(dest_folder, exist_ok=True)
            shutil.copy(source_path, dest_path)
            print(f"Copied {source_path} to {dest_path}")


In [None]:
def bbox_file(info: str, output_json_path: str):
    """
    Flatten the per-gloss metadata into a {video_id: bbox} JSON file.

    If a video_id appears more than once, the bbox of its last occurrence wins.

    Args:
        info: path to WLASL_v0.3.json file with bbox info per instance.
        output_json_path: where to write the simplified {video_id: bbox} map.
    """
    with open(info, "r") as src:
        glosses = json.load(src)

    # One entry per instance, across all glosses
    lookup = {
        entry["video_id"]: entry["bbox"]
        for gloss in glosses
        for entry in gloss["instances"]
    }

    with open(output_json_path, "w") as dst:
        json.dump(lookup, dst)

In [None]:
def obtain_bbox(path_to_file: str, name: str) -> list:
    """
    Look up the bounding box stored for one video.

    The JSON file is read on every call.

    Args:
        path_to_file: JSON file mapping video IDs to bounding boxes.
        name: video_id string.
    Returns:
        The value stored under ``name`` (expected: [x_min, y_min, x_max, y_max]).
    Raises:
        KeyError: if ``name`` is not present in the file.
    """
    with open(path_to_file, 'r') as handle:
        lookup = json.load(handle)
    return lookup[name]

In [None]:
def crop_frame(bbox: list, frame: np.ndarray):
    """
    Return the region of ``frame`` covered by ``bbox``.

    Args:
        bbox: [x_min, y_min, x_max, y_max]
        frame: image array indexed as [row (y), column (x), ...].
    Returns:
        The sliced sub-array of ``frame``.
    """
    rows = slice(bbox[1], bbox[3])  # y range
    cols = slice(bbox[0], bbox[2])  # x range
    return frame[rows, cols]



In [None]:
def capture_frames(video_path: str, output_file: str, bbox: list = None,
                   start_frame: int = 5, end_frame: int = 35):
    """
    Extract and save a window of frames from a video as .jpg files.

    Frames are written as ``video<name>_frame_<index>.jpg`` where ``<name>`` is
    the video's base name without ``.mp4`` and ``<index>`` is the zero-based
    frame number.

    Args:
        video_path: path to .mp4 file (str or path-like).
        output_file: existing directory to save extracted .jpg frames.
        bbox: optional bounding box to crop each saved frame.
        start_frame: first frame index to save (inclusive). Defaults to 5.
        end_frame: last frame index to save (inclusive). Defaults to 35.
    """
    video_name = os.path.basename(video_path).replace('.mp4', '')
    # Convert path-like objects explicitly; not every OpenCV build accepts them.
    cap = cv2.VideoCapture(os.fspath(video_path))
    try:
        frame_count = 0
        # Stop decoding as soon as the window has been written.
        while cap.isOpened() and frame_count <= end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count >= start_frame:
                # Only frames that are actually saved need to be cropped
                if bbox:
                    frame = crop_frame(bbox, frame)
                frame_filename = f"video{video_name}_frame_{frame_count}.jpg"
                cv2.imwrite(os.path.join(output_file, frame_filename), frame)
            frame_count += 1
    finally:
        # Always free the decoder handle, even if cropping/writing fails
        cap.release()

In [None]:
# Paths for the frame-extraction step.
# NOTE(review): none of these names is passed to extract_and_crop_all_splits
# in the visible cells, and output_folder is reassigned later in the notebook -
# confirm how they are meant to be used.
# Same path that the bbox_file call writes its {video_id: bbox} map to.
bbox_path = "drive/MyDrive/newfile.json"
# Same path that the videos_process call uses as its output folder.
videosfilepath = "drive/MyDrive/dataset_split"
# Parent of the frames/train, frames/val and frames/test folders read by the datasets.
output_folder = "drive/MyDrive/frames"

splits = ['test','train','val']


def extract_and_crop_all_splits(
    videos_base: str,
    frames_output: str,
    bbox_json: str,
    splits: list[str] = ('train', 'val', 'test')
):
    """
    Walk through each split/class subfolder, crop & extract frames per video.

    Frames of the i-th video (in sorted filename order) of a class are written
    to ``<frames_output>/<split>/<class>/<i>/``. Missing split folders, stray
    files next to the class folders and non-.mp4 files inside a class folder
    are skipped. A video without a bounding-box entry is extracted uncropped.
    A failure on one video is reported and does not stop the run.

    Args:
        videos_base: root folder containing split subfolders (train/val/test).
        frames_output: root folder where per-split frames will be saved.
        bbox_json: path to JSON mapping video_id -> [x_min,y_min,x_max,y_max].
        splits: list of split names (defaults to ['train','val','test']).
    """
    # Read the bbox lookup once instead of re-parsing the JSON for every video.
    with open(bbox_json, "r") as file:
        id_to_bbox = json.load(file)

    os.makedirs(frames_output, exist_ok=True)
    for split in splits:
        split_dir = pathlib.Path(videos_base) / split
        if not split_dir.is_dir():
            print(f"Skipping missing split folder: {split_dir}")
            continue

        for class_dir in sorted(split_dir.iterdir()):
            # Check before creating any output so stray files leave no folder behind
            if not class_dir.is_dir():
                print(f"Skipping non-directory: {class_dir}")
                continue

            output_file = os.path.join(frames_output, split, class_dir.name)
            os.makedirs(output_file, exist_ok=True)

            # Sorted so the numeric folder given to each video is reproducible
            videos = sorted(p for p in class_dir.iterdir() if p.suffix.lower() == ".mp4")
            for i, video_path in enumerate(videos):
                video_id = video_path.stem
                print(video_id, video_path)

                bbox = id_to_bbox.get(video_id)
                if bbox is None:
                    print(f"No bounding box for {video_id}; extracting uncropped frames")

                vid_output = os.path.join(output_file, f"{i}")
                os.makedirs(vid_output, exist_ok=True)

                try:
                    capture_frames(str(video_path), vid_output, bbox)
                except Exception as e:
                    # Best effort: report the failing video and keep going
                    print(f"Failed to extract frames from {video_path} (bbox={bbox}): {e}")





In [None]:
class FrameSequencer(Dataset):
    """
    PyTorch Dataset that returns fixed-length sequences of image frames.

    Expected layout: ``root_dir/<class>/<video>/<frames>.jpg``. Each video's
    frames are ordered by the trailing ``_<number>`` in the file name and cut
    into consecutive, non-overlapping chunks of ``sample_size`` frames; a
    trailing remainder shorter than ``sample_size`` is dropped. One dataset
    item is one such chunk.

    NOTE(review): frames are returned as raw 0-255 BGR float tensors (as read
    by cv2.imread). Confirm whether the model expects normalized RGB input.

    Args:
        root_dir: top-level folder containing per-class subdirectories.
        sample_size: number of frames per sequence.
        batch_size: stored for backward compatibility only; batching is the
            DataLoader's job and this value no longer affects ``len()``.
        target_size: (H, W) to resize each frame.
        shuffle: whether to randomize sample order.
    """
    def __init__(self, root_dir, sample_size = 5, batch_size = 35, target_size=(224, 224), shuffle=True):

        self.root_dir = root_dir
        self.sample_size = sample_size
        self.batch_size = batch_size
        self.target_size = target_size
        self.shuffle = shuffle

        # Gather class names and assign integer labels
        self.classes = sorted(d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d)))
        self.class_indicies = {clss: i for i, clss in enumerate(self.classes)}

        # Build (sequence_paths, label) list
        self.samples = []
        for cls in self.classes:
            cls_dir = os.path.join(root_dir, cls)
            for vid in sorted(os.listdir(cls_dir)):
                vid_dir = os.path.join(cls_dir, vid)
                if not os.path.isdir(vid_dir):
                    continue  # stray file next to the per-video folders

                # Numeric ordering: a plain string sort would put frame_10 before frame_5
                frame_files = sorted(
                    (os.path.join(vid_dir, fname)
                     for fname in os.listdir(vid_dir) if fname.lower().endswith('.jpg')),
                    key=self._frame_sort_key,
                )

                # Consecutive, non-overlapping chunks of sample_size frames
                for j in range(0, len(frame_files) - self.sample_size + 1, self.sample_size):
                    sequence = frame_files[j: j + self.sample_size]
                    self.samples.append((sequence, self.class_indicies[cls]))

        self.shuffler()

    @staticmethod
    def _frame_sort_key(path):
        """Sort key: trailing ``_<number>`` of the file stem first, then the path itself."""
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem.rsplit('_', 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), path)
        # Files without a numeric suffix go last, in name order
        return (1, 0, path)

    def __len__(self):
        # One item per sequence: __getitem__ indexes self.samples directly,
        # so the length must cover every entry for a DataLoader to reach it.
        return len(self.samples)

    def __getitem__(self, index):
        """
        Load one sequence.

        Returns:
            (images, label): float tensor of shape (sample_size, C, H, W) and a
            scalar long tensor with the class index.
        """
        seq_paths, label = self.samples[index]
        height, width = self.target_size
        sequence_imgs = []

        for fp in seq_paths:
            img = cv2.imread(fp)
            if img is None:
                # Unreadable frame: substitute a black frame so every sequence
                # keeps the same length and can still be stacked into a batch.
                img = np.zeros((height, width, 3), dtype=np.uint8)
            else:
                # cv2.resize takes its size as (width, height)
                img = cv2.resize(img, (width, height))

            # Convert H×W×C → C×H×W
            sequence_imgs.append(np.transpose(img, (2, 0, 1)))

        # Stack and convert to torch tensors
        images = torch.from_numpy(np.array(sequence_imgs)).float()
        label = torch.tensor(label).long()

        return images, label

    def shuffler(self):
        """Shuffle the sample list in place when ``shuffle`` is enabled."""
        if self.shuffle:
            rnd.shuffle(self.samples)



In [None]:

def show_sample(g):
    """
    Visualize the first item of a FrameSequencer as a row of frames.

    ``g[0]`` is a single sequence of shape (T, C, H, W) with a scalar label; a
    batched (B, T, C, H, W) tensor with B labels is also accepted, in which
    case one figure is shown per sequence.

    Args:
        g: dataset exposing ``__getitem__`` and a ``class_indicies`` mapping of
            class name -> integer label.
    """
    images, labels = g[0]

    # Promote a single sequence to a batch of one so both layouts share one loop
    if images.dim() == 4:
        images = images.unsqueeze(0)
    labels = labels.reshape(-1)

    index_to_class = {idx: name for name, idx in g.class_indicies.items()}

    for sequence, label in zip(images, labels):
        classname = index_to_class[int(label)]
        cols = len(sequence)
        for j, image in enumerate(sequence):
            plt.subplot(1, cols, j + 1)
            # C×H×W float → H×W×C uint8, then reverse the channel axis (BGR → RGB)
            frame = image.numpy().transpose(1, 2, 0).clip(0, 255).astype("uint8")
            plt.imshow(frame[..., ::-1])
            plt.axis("off")
            plt.title(classname)
        plt.show()


# Model Architecture

In [None]:
class CNN_LSTM(nn.Module):
    """
    CNN+LSTM architecture:
      - Pretrained MobileNetV2 feature extractor, kept frozen (no gradients,
        always in eval mode).
      - LSTM to capture temporal dependencies.
      - Final Linear layer for classification.

    Args:
        feature_size: channel count of the CNN's output feature map (LSTM input size).
        hidden_size: LSTM hidden size.
        num_classes: number of output logits.
        num_layers: number of stacked LSTM layers.
        device: device the CNN backbone is moved to at construction time.
    """
    def __init__(self, feature_size=1280, hidden_size=256, num_classes=10, num_layers=1, device="cpu"):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.cnn = mobilenet.features.to(device)
        # forward() runs the backbone under no_grad, so make the freeze explicit
        for param in self.cnn.parameters():
            param.requires_grad_(False)
        self.cnn.eval()

        self.lstm = nn.LSTM(input_size=feature_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def train(self, mode=True):
        """
        Switch LSTM/classifier between train and eval mode; the CNN stays in eval.

        Without this, ``model.train()`` would put the backbone's BatchNorm layers
        in training mode and their running statistics would drift even though
        the backbone receives no gradient updates.
        """
        super().train(mode)
        self.cnn.eval()
        return self

    def forward(self, x):
        """
        x: tensor of shape (batch, seq_len, C, H, W)
        Returns logits of shape (batch, num_classes).
        """
        batch_size, seq_len, C, H, W = x.shape
        # Merge batch and sequence dims for CNN (reshape also handles non-contiguous input)
        x = x.reshape(batch_size * seq_len, C, H, W)

        with torch.no_grad():
            features = self.cnn(x)  # (batch*seq, feat_map, h, w)
            features = F.adaptive_avg_pool2d(features, (1, 1))  # (batch*seq, feat_map, 1, 1)

        lstm_input = features.reshape(batch_size, seq_len, -1)

        output, _ = self.lstm(lstm_input)
        last_hidden = output[:, -1, :]  # LSTM output at the final time step
        return self.fc(last_hidden)






## Training Utilities

- `train_one_epoch`:  
  Trains the model over one pass of `train_loader`, returns avg. loss.

- `validate`:  
  Evaluates on `val_loader` without gradient updates, returns avg. loss.

- `EarlyStopping`:  
  Stops training if validation loss doesn’t improve for `patience` epochs,
  and optionally restores the best-performing weights.

- `training_loop`:  
  Combines all the above into a multi-epoch loop with progress logging.

In [None]:
class EarlyStopping:
    """Track validation loss and flag when training should stop."""

    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        """
        Set up the early-stopping tracker.

        Args:
            patience (int, optional): Number of consecutive non-improving calls
                tolerated before ``early_stop`` is set. Defaults to 5.
            min_delta (float, optional): Amount by which the loss must drop below
                the best loss to count as an improvement. Defaults to 0.
            restore_best_weights (bool, optional): If True, a copy of the model
                state is kept at every improvement so it can be restored later.
                Defaults to True.
        """
        # Configuration
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights

        # Running state
        self.best_loss = float('inf')
        self.best_model_state = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, model, validation_loss):
        """
        Record one epoch's validation loss.

        Call once at the end of each epoch.

        Args:
            model (torch.nn.Module): The model being trained.
            validation_loss (float): Validation loss for the current epoch.
        """
        threshold = self.best_loss - self.min_delta
        if not (validation_loss < threshold):
            # No improvement: one more strike against the patience budget
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # Improvement: remember it and reset the strike counter
        self.best_loss = validation_loss
        self.counter = 0
        if self.restore_best_weights:
            self.best_model_state = deepcopy(model.state_dict())

    def restore_model(self, model):
        """
        Load the best recorded state into ``model`` (no-op if none was recorded).

        Args:
            model (torch.nn.Module): The model instance to restore.

        Returns:
            torch.nn.Module: The same model instance.
        """
        saved_state = self.best_model_state
        if saved_state is not None:
            model.load_state_dict(saved_state)
        return model

In [None]:
def validate(model, dataloader, criterion, device="cpu"):
    """
    Compute the mean per-batch loss of the model on a validation loader.

    The model is switched to eval mode and left there.

    Args:
        model (torch.nn.Module): The neural network to evaluate.
        dataloader (torch.utils.data.DataLoader): Iterable of (inputs, targets) batches.
        criterion (callable): Loss function applied to (outputs, targets).
        device (str or torch.device, optional): Device the batches are moved to.
            Defaults to "cpu".

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    batch_count = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(inputs), targets)
            running_loss += batch_loss.item()
            batch_count += 1
    return running_loss / batch_count

def train_one_epoch(model, train_loader, criterion, optimizer, device="cpu"):
    """
    Run one optimization pass over the training loader.

    Args:
        model (torch.nn.Module): The neural network to train.
        train_loader (torch.utils.data.DataLoader): Iterable of (inputs, targets) batches.
        criterion (callable): Loss function applied to (outputs, targets).
        optimizer (torch.optim.Optimizer): Optimizer stepping the model parameters.
        device (str or torch.device, optional): Device the batches are moved to.
            Defaults to "cpu".

    Returns:
        float: Sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc="Training", leave=False)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass
        batch_loss = criterion(model(inputs), targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def training_loop(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    early_stopper=None,
    epochs=10,
    device="cpu"
):
    """
    Alternate training and validation for up to ``epochs`` epochs.

    After each epoch the validation loss is handed to ``early_stopper`` (if
    given); when it signals a stop, the loop ends before that epoch's losses
    are printed.

    Args:
        model (torch.nn.Module): The neural network to train and validate.
        train_loader (torch.utils.data.DataLoader): DataLoader for training data.
        val_loader (torch.utils.data.DataLoader): DataLoader for validation data.
        criterion (callable): Loss function for both training and validation.
        optimizer (torch.optim.Optimizer): Optimizer for model parameter updates.
        early_stopper (callable, optional): Object with attributes `early_stop`
            and `restore_best_weights` and methods `__call__(model, val_loss)`
            and `restore_model(model)`. Defaults to None.
        epochs (int, optional): Maximum number of epochs to run. Defaults to 10.
        device (str or torch.device, optional): Device the batches are moved to.
            Defaults to "cpu".

    Returns:
        torch.nn.Module: The trained model; when an early stopper with
        `restore_best_weights` set is supplied, the result of its
        `restore_model(model)`.
    """
    use_early_stopping = bool(early_stopper)

    for epoch in range(epochs):
        # One pass over the training data, then one over the validation data
        model.train()
        train_loss = train_one_epoch(
            model=model,
            train_loader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            device=device,
        )
        val_loss = validate(model=model, dataloader=val_loader, criterion=criterion, device=device)

        if use_early_stopping:
            early_stopper(model, val_loss)
            if early_stopper.early_stop:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Hand back the best weights when the stopper was asked to keep them
    if use_early_stopping and early_stopper.restore_best_weights:
        return early_stopper.restore_model(model)

    return model

## Running Everything

### Video Processing

In [None]:
json_file_path = "artifact/wlasl-processed/WLASL_v0.3.json"
videos_folder = "artifact/wlasl-processed/videos"
output_folder = "drive/MyDrive/dataset_split"

# Read the raw videos from the extracted dataset and copy them into the
# per-split folders on Drive (source and destination must be different folders).
videos_process(output_folder, json_file_path, videos_folder)

In [None]:
# Build the {video_id: bbox} lookup from the dataset metadata and store it on Drive.
json_file_path = "artifact/wlasl-processed/WLASL_v0.3.json"
newjson_file_path = "drive/MyDrive/newfile.json"

bbox_file(json_file_path, newjson_file_path)

### Initialize Dataloader & Dataset

In [None]:

train_dataloader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=2,
                            pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, num_workers=2,
                          pin_memory=True)

In [None]:
from torch.utils.data import DataLoader


dataset = FrameSequencer(root_dir=r"drive/MyDrive/frames/train", batch_size=5, sample_size=5)
val_dataset = FrameSequencer(root_dir=r"drive/MyDrive/frames/val", batch_size=5, sample_size=5)
test_dataset = FrameSequencer(root_dir=r"drive/MyDrive/frames/test", batch_size=5, sample_size=5)





### Initialize Model and Training Utilities

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CNN_LSTM(num_classes=len(dataset.classes), device=device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [None]:
# Default settings: patience=5, min_delta=0, restore_best_weights=True
early_stopping = EarlyStopping()


### Running Training & Saving

In [None]:
model = training_loop(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    early_stopper=early_stopping,
    epochs=10,
    # Move batches to whichever device the model's parameters already live on
    device=next(model.parameters()).device
)

In [None]:
# Save only the weights (state_dict) to asl_model.pth in the working directory.
torch.save(model.state_dict(), "asl_model.pth")