<h1>Making the mapping file</h1>

In [5]:
import os
import json

# Base directory where videos are stored (one sub-folder per class)
base_dir = r'D:\sem-project\sem6-project\Datasets\sign-language-dataset-wlasl-videos\dataset\SL'

# 20 target classes you want to consider (case sensitive)
target_classes = [
    "car",
    "computer",
    "door",
    "friend",
    "hospital",
    "love",
    "money",
    "phone",
    "school",
    "stop",
    "train",
    "water",
    "work",
    "write",
    "family",
    "dance",
    "eat",
    "hello",
    "play",
    "read"
]

# Only real directories count as class folders; a stray file that happens to
# share a class name would otherwise crash the per-class listing below.
existing_folders = {
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
}

# Take intersection with target classes (only classes that exist), keeping
# the order of target_classes.
common_classes = [cls for cls in target_classes if cls in existing_folders]

# Report classes that were requested but not found instead of dropping them silently.
missing_classes = [cls for cls in target_classes if cls not in existing_folders]
if missing_classes:
    print(f"Warning: no folder found for classes: {missing_classes}")

mapping = {}

for cls in common_classes:
    class_dir = os.path.join(base_dir, cls)
    # Sorted so the JSON is identical between runs (os.listdir order is arbitrary);
    # the extension check is case-insensitive so '.MP4' files are not skipped.
    video_files = sorted(
        os.path.join(class_dir, name)
        for name in os.listdir(class_dir)
        if name.lower().endswith('.mp4')
    )
    mapping[cls] = video_files

# Save mapping to JSON file.
# NOTE(review): this path is relative to the working directory, while the later
# cells read the mapping from an absolute path - confirm both point to the same file.
output_json = 'selected_20_classes_mapping.json'
with open(output_json, 'w') as f:
    json.dump(mapping, f, indent=4)

print(f"Mapping JSON saved to {output_json}")
print(f"Total classes mapped: {len(mapping)}")
total_videos = sum(len(v) for v in mapping.values())
print(f"Total videos collected: {total_videos}")


Mapping JSON saved to selected_20_classes_mapping.json
Total classes mapped: 20
Total videos collected: 158


<h1>Training the model and saving the model</h1>

In [9]:
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import mediapipe as mp
import cv2
from sklearn.preprocessing import LabelEncoder


# MediaPipe Holistic detector used for landmark extraction in this cell.
# The options are collected in one place so they are easy to tune.
_HOLISTIC_OPTIONS = {
    "static_image_mode": False,
    "min_detection_confidence": 0.5,
    "min_tracking_confidence": 0.5,
}
mp_holistic = mp.solutions.holistic.Holistic(**_HOLISTIC_OPTIONS)


# Dataset Class: Extract landmarks from videos on-the-fly
class ASLDataset(Dataset):
    """Dataset of sign videos, each returned as a fixed-length landmark sequence.

    The mapping file is a JSON object ``{class_name: [video_path, ...]}``.
    Every item is a pair ``(landmarks, label)`` where ``landmarks`` is a
    ``float32`` tensor of shape ``(num_frames, FEATURE_DIM)`` and ``label`` is
    the integer id that ``label_encoder`` assigns to the class name.

    Args:
        mapping_file: Path to the JSON mapping file.
        num_frames: Number of frames sampled per video (evenly spaced).
        transform: Optional callable applied to the landmark tensor.
        cache_landmarks: When True (default), the landmarks extracted for a
            video are kept in memory so that later epochs do not decode the
            video and run MediaPipe again. Set to False to extract every time.

    Note:
        Label ids come from ``label_encoder`` and need not match the position
        of a class in ``self.classes``; use ``label_encoder.inverse_transform``
        to map an id back to its name.
    """

    # Number of landmarks per group, as assumed by the zero-filling below.
    NUM_POSE_LANDMARKS = 33
    NUM_FACE_LANDMARKS = 468
    NUM_HAND_LANDMARKS = 21
    COORDS_PER_LANDMARK = 3
    # Features per frame: pose + face + left hand + right hand, (x, y, z) each.
    FEATURE_DIM = (
        NUM_POSE_LANDMARKS + NUM_FACE_LANDMARKS + 2 * NUM_HAND_LANDMARKS
    ) * COORDS_PER_LANDMARK

    def __init__(self, mapping_file, num_frames=30, transform=None, cache_landmarks=True):
        with open(mapping_file, 'r') as f:
            self.data = json.load(f)
        self.num_frames = num_frames
        self.transform = transform

        self.classes = list(self.data.keys())
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.classes)

        # flatten (video_path, label) pairs for indexing
        self.video_label_pairs = [
            (video, label)
            for label, videos in self.data.items()
            for video in videos
        ]

        # NOTE(review): the module-level Holistic instance is shared by all
        # videos and is created with static_image_mode=False, so detector state
        # may carry over from one clip to the next - confirm this is acceptable.
        self.holistic = mp_holistic

        self.cache_landmarks = cache_landmarks
        self._landmark_cache = {}

    def __len__(self):
        """Return the number of (video, label) pairs."""
        return len(self.video_label_pairs)

    def _frame_features(self, results):
        """Flatten one Holistic result into a list of FEATURE_DIM numbers.

        Groups appear in the order pose, face, left hand, right hand; a group
        that was not detected is replaced by zeros of the expected length.
        """
        groups = (
            (results.pose_landmarks, self.NUM_POSE_LANDMARKS),
            (results.face_landmarks, self.NUM_FACE_LANDMARKS),
            (results.left_hand_landmarks, self.NUM_HAND_LANDMARKS),
            (results.right_hand_landmarks, self.NUM_HAND_LANDMARKS),
        )
        features = []
        for group, count in groups:
            if group:
                for lm in group.landmark:
                    features.extend([lm.x, lm.y, lm.z])
            else:
                features.extend([0] * count * self.COORDS_PER_LANDMARK)
        return features

    def extract_landmarks(self, video_path):
        """Sample ``num_frames`` frames from a video and return their landmarks.

        Frames are chosen at evenly spaced indices over the reported frame
        count. If fewer frames than requested are obtained (short, truncated
        or unreadable video), the remaining rows are zeros, so the result
        always has shape ``(num_frames, FEATURE_DIM)``.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            indices = torch.linspace(0, max(length - 1, 0), self.num_frames).long().tolist()
            wanted = set(indices)  # O(1) membership test per decoded frame
            last_wanted = indices[-1] if indices else -1

            landmarks_seq = []
            frame_idx = 0
            success, frame = cap.read()
            while success and frame_idx <= last_wanted:
                if frame_idx in wanted:
                    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = self.holistic.process(image_rgb)
                    landmarks_seq.append(self._frame_features(results))
                success, frame = cap.read()
                frame_idx += 1
        finally:
            # Release the capture even if decoding or MediaPipe raises.
            cap.release()

        # Zero padding comes from the preallocated tensor, so a video that
        # yields no frame at all no longer fails on landmarks_seq[0].
        sequence = torch.zeros((self.num_frames, self.FEATURE_DIM), dtype=torch.float32)
        if landmarks_seq:
            sequence[:len(landmarks_seq)] = torch.tensor(landmarks_seq, dtype=torch.float32)
        return sequence

    def __getitem__(self, idx):
        """Return ``(landmarks, label)`` for the idx-th video."""
        video_path, label_str = self.video_label_pairs[idx]

        if self.cache_landmarks:
            cached = self._landmark_cache.get(video_path)
            if cached is None:
                cached = self.extract_landmarks(video_path)
                self._landmark_cache[video_path] = cached
            # Hand out a copy so an in-place transform cannot corrupt the cache.
            landmarks_seq = cached.clone()
        else:
            landmarks_seq = self.extract_landmarks(video_path)

        # Plain Python int so the default collate function builds an int64 tensor.
        label = int(self.label_encoder.transform([label_str])[0])

        if self.transform:
            landmarks_seq = self.transform(landmarks_seq)

        return landmarks_seq, label


# LSTM model
class LSTMClassifier(nn.Module):
    """Sequence classifier: stacked LSTM followed by dropout and a linear head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (size of the logit vector).
        dropout: Dropout probability, used between LSTM layers and before the
            linear head.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, features] to [batch, num_classes] logits."""
        _, (hn, _) = self.lstm(x)
        # hn[-1] is the final hidden state of the top LSTM layer.
        return self.fc(self.dropout(hn[-1]))


# Parameters
mapping_json = r'D:/sem-project/sem6-project/20_classes/selected_20_classes_mapping.json'
batch_size = 4
num_epochs = 50
learning_rate = 0.001
hidden_dim = 256
num_layers = 2
num_frames = 30
seed = 42  # fixes the train/val split, weight initialisation and shuffling

# Seed the global RNG so that a fresh "Restart & Run All" reproduces the run.
torch.manual_seed(seed)


# Dataset and loader
dataset = ASLDataset(mapping_json, num_frames=num_frames)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# A dedicated generator keeps the split stable no matter how much randomness is
# consumed before this line. The split is random, not stratified by class.
train_set, val_set = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(seed),
)


train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Input size: number of features per frame (Pose + Face + Hands landmarks × 3),
# read from one real sample so it always matches what the dataset produces.
dummy_sample, _ = dataset[0]
input_size = dummy_sample.shape[1]


num_classes = len(dataset.classes)


model = LSTMClassifier(input_dim=input_size, hidden_dim=hidden_dim, num_layers=num_layers, num_classes=num_classes)
model = model.to(device)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one the model is evaluated with gradients disabled.
    Both values are NaN when the loader yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            # CrossEntropyLoss expects int64 class indices.
            inputs, labels = inputs.to(device), labels.to(device).long()
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            batch = inputs.size(0)
            # loss is the batch mean, so weight it by the batch size.
            total_loss += loss.item() * batch
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch

    if seen == 0:
        # Empty split: report NaN instead of raising ZeroDivisionError.
        return float('nan'), float('nan')
    return total_loss / seen, correct / seen


# Training loop
for epoch in range(num_epochs):
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)

    # Validation
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

    print(f"Epoch [{epoch + 1}/{num_epochs}], "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


# Save weights (state of the model after the last epoch)
torch.save(model.state_dict(), 'asl_lstm_model_weights.pth')
print("Model weights saved to asl_lstm_model_weights.pth")


Epoch [1/50], Train Loss: 3.0781, Train Acc: 0.0714, Val Loss: 3.0274, Val Acc: 0.0938
Epoch [2/50], Train Loss: 2.9917, Train Acc: 0.0794, Val Loss: 3.0627, Val Acc: 0.0312
Epoch [3/50], Train Loss: 2.9578, Train Acc: 0.0714, Val Loss: 3.0509, Val Acc: 0.0938
Epoch [4/50], Train Loss: 2.9647, Train Acc: 0.1032, Val Loss: 3.0544, Val Acc: 0.0938
Epoch [5/50], Train Loss: 2.9554, Train Acc: 0.1032, Val Loss: 3.0615, Val Acc: 0.0625
Epoch [6/50], Train Loss: 2.9320, Train Acc: 0.1270, Val Loss: 3.0394, Val Acc: 0.0625
Epoch [7/50], Train Loss: 2.9489, Train Acc: 0.1111, Val Loss: 3.0643, Val Acc: 0.0625
Epoch [8/50], Train Loss: 2.9280, Train Acc: 0.0952, Val Loss: 3.0625, Val Acc: 0.0625
Epoch [9/50], Train Loss: 2.8964, Train Acc: 0.1032, Val Loss: 3.0692, Val Acc: 0.0625
Epoch [10/50], Train Loss: 2.8817, Train Acc: 0.1190, Val Loss: 3.0999, Val Acc: 0.0938
Epoch [11/50], Train Loss: 2.9251, Train Acc: 0.0952, Val Loss: 3.0843, Val Acc: 0.0625
Epoch [12/50], Train Loss: 2.8868, Train 

<h1>Predicting the class of a test video</h1>

In [2]:
import json
import torch
import torch.nn as nn
import mediapipe as mp
import cv2
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, clear_output
from PIL import Image, ImageDraw, ImageFont
import time
import numpy as np

# ---- Input locations ----
# Class mapping written by the mapping cell.
mapping_json = r'D:/sem-project/sem6-project/20_classes/selected_20_classes_mapping.json'
# Weights saved by the training cell.
weights_path = r'D:/sem-project/sem6-project/20_classes/asl_lstm_model_weights.pth'
# Clip whose sign is to be predicted.
test_clip = r"D:\sem-project\sem6-project\Datasets\sign-language-dataset-wlasl-videos\dataset\SL\door\17326.mp4"

# ---- MediaPipe Holistic detector (options gathered in one dict) ----
_holistic_options = {
    "static_image_mode": False,
    "min_detection_confidence": 0.5,
    "min_tracking_confidence": 0.5,
}
mp_holistic = mp.solutions.holistic.Holistic(**_holistic_options)

# ---- Landmark extraction function ----
# Expected landmark count per Holistic group, in feature order:
# pose, face, left hand, right hand. Each landmark contributes (x, y, z).
_LANDMARK_GROUP_SIZES = (33, 468, 21, 21)
_FEATURES_PER_FRAME = sum(_LANDMARK_GROUP_SIZES) * 3


def _flatten_holistic_results(results):
    """Flatten one Holistic result into a list of _FEATURES_PER_FRAME numbers.

    A group that was not detected is replaced by zeros of the expected length.
    """
    groups = (
        results.pose_landmarks,
        results.face_landmarks,
        results.left_hand_landmarks,
        results.right_hand_landmarks,
    )
    features = []
    for group, count in zip(groups, _LANDMARK_GROUP_SIZES):
        if group:
            for lm in group.landmark:
                features.extend([lm.x, lm.y, lm.z])
        else:
            features.extend([0] * count * 3)
    return features


def extract_landmarks_from_video(video_path, num_frames=30):
    """Sample ``num_frames`` frames from a video and return their landmarks.

    Frames are taken at evenly spaced indices over the reported frame count
    and processed with the module-level ``mp_holistic`` detector.

    Args:
        video_path: Path of the video file to read.
        num_frames: Number of rows in the returned sequence.

    Returns:
        A ``float32`` tensor of shape ``(num_frames, 1629)``. Rows for which no
        frame could be obtained (short, truncated or unreadable video) are zeros.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        indices = torch.linspace(0, max(length - 1, 0), num_frames).long().tolist()
        wanted = set(indices)  # O(1) membership test per decoded frame
        last_wanted = indices[-1] if indices else -1

        landmarks_seq = []
        frame_idx = 0
        success, frame = cap.read()
        while success and frame_idx <= last_wanted:
            if frame_idx in wanted:
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = mp_holistic.process(image_rgb)
                landmarks_seq.append(_flatten_holistic_results(results))
            success, frame = cap.read()
            frame_idx += 1
    finally:
        # Release the capture even if decoding or MediaPipe raises.
        cap.release()

    # Zero padding comes from the preallocated tensor, so a video that yields
    # no frame at all no longer fails on landmarks_seq[0].
    sequence = torch.zeros((num_frames, _FEATURES_PER_FRAME), dtype=torch.float32)
    if landmarks_seq:
        sequence[:len(landmarks_seq)] = torch.tensor(landmarks_seq, dtype=torch.float32)
    return sequence

# ---- Rebuild the label encoder from the class names stored in the mapping ----
with open(mapping_json, 'r') as mapping_fh:
    data = json.load(mapping_fh)

# The mapping's keys are the class names; the encoder is fitted on them the
# same way as in the training cell so predicted ids decode to the same names.
classes = list(data)
num_classes = len(classes)
label_encoder = LabelEncoder()
label_encoder.fit(classes)

# ---- Model definition ----
class LSTMClassifier(nn.Module):
    """Sequence classifier: stacked LSTM followed by dropout and a linear head.

    The attribute names ``lstm`` and ``fc`` determine the state-dict keys, so
    they must stay as they are for saved weights to load.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (size of the logit vector).
        dropout: Dropout probability, used between LSTM layers and before the
            linear head.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.5):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, features] to [batch, num_classes] logits."""
        _, (hn, _) = self.lstm(x)
        # hn[-1] is the final hidden state of the top LSTM layer.
        return self.fc(self.dropout(hn[-1]))

# ---- Turn the test clip into a single-item batch of landmark features ----
num_frames = 30
landmarks_seq = extract_landmarks_from_video(test_clip, num_frames=num_frames).unsqueeze(0)

# ---- Rebuild the network and restore the saved weights ----
# hidden_dim and num_layers have to match the values used when the weights were saved.
input_size = landmarks_seq.shape[2]
hidden_dim = 256
num_layers = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(
    input_dim=input_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
)
state_dict = torch.load(weights_path, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# ---- Prediction: highest logit -> class index -> class name ----
with torch.no_grad():
    landmarks_seq = landmarks_seq.to(device)
    output = model(landmarks_seq)
    pred_class_index = output.argmax(dim=1).item()
    pred_label = label_encoder.inverse_transform([pred_class_index])[0]

print(f"Predicted class label: {pred_label}")

# ---- Display video inline with prediction overlay ----
cap = cv2.VideoCapture(test_clip)
try:
    if not cap.isOpened():
        print("Error: Could not open video.")
    else:
        # Prepare font for overlay using PIL; fall back to the built-in font
        # when arial.ttf is not available on this machine.
        try:
            font = ImageFont.truetype("arial.ttf", 40)
        except IOError:
            font = ImageFont.load_default()

        # Pace playback by the clip's own frame rate; fall back to ~30 FPS
        # when the container does not report a usable value.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_delay = 1.0 / fps if fps and fps > 0 else 0.033

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert BGR to RGB for correct colors
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_img = Image.fromarray(rgb_frame)

            # Draw predicted label text
            draw = ImageDraw.Draw(pil_img)
            draw.text((30, 30), f"Prediction: {pred_label}", font=font, fill=(255, 0, 0))

            # Display inline; the PIL image is shown directly, without a
            # round trip through a NumPy array.
            clear_output(wait=True)
            display(pil_img)

            time.sleep(frame_delay)

        clear_output(wait=True)
        print("Video playback completed.")
finally:
    # Release the capture on every path, including open failure and errors
    # raised while rendering.
    cap.release()


Video playback completed.
