In [1]:
import torch
from matplotlib import pyplot as plt
import numpy as np
import cv2
import torch.nn as nn

In [2]:
class HandSignClassifier(nn.Module):
    def __init__(self, num_classes):
        super(HandSignClassifier, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(256 * 7 * 7, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


In [3]:
model = HandSignClassifier(num_classes=36)

In [4]:
model.load_state_dict(torch.load('../hand_sign_classifier4.pth'))

<All keys matched successfully>

In [5]:
model.eval()

HandSignClassifier(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(7, 7))
  (classifier): Sequential(
    (0): Linear(in_features=12544, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=36, bias=True)
  )
)

In [6]:
from torchvision import transforms
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

In [7]:
from PIL import Image
image_path = '../pre-images/pre-for-ja-ज/hand_0000.png' 
image = Image.open(image_path)

# Apply the transformation
transformed_image = transform(image)

# If you need to add batch dimension (assumes you're working with a model that expects batches)
transformed_image = transformed_image.unsqueeze(0)

In [8]:
result = model(transformed_image)

In [9]:
print(result)

tensor([[ -6.4505,  -7.4426,  -6.7383,   3.4355,  -2.8931,   3.8527,  -4.3654,
          -2.4565,  -4.7128, -12.5092,  -5.1665, -10.2302, -10.3447,  11.6107,
          -1.5805,  -4.4574, -15.9028,   1.0374, -14.1829,  -7.7752,  -2.1233,
          -8.2282,  -6.3645,  -9.9057,   1.3037,  -8.0347,  -9.0312,  -3.4568,
         -10.2339,  -2.0346, -16.2580,  -7.5341, -19.8565,  -6.6283,  -3.7815,
          -1.4296]], grad_fn=<AddmmBackward0>)


In [10]:
predicted_class = torch.argmax(result).item()

In [11]:
class_mapping = {0: 'ब', 1: 'भ', 2: 'च',3: 'छ', 4: 'स', 5: 'द', 6: 'ड', 7: 'ध', 8:'ढ',9: 'ग', 10: 'घ', 11: 'ज्ञ', 12: 'ह', 13: 'ज', 14: 'झ', 15: 'क', 16: 'ख', 17: 'क्ष', 18: 'ल', 19: 'ष', 20: 'म', 21: 'न', 22: 'ण', 23: 'ङ', 24: 'प', 25: 'फ', 26: 'र', 27: 'श', 28: 'त', 29: 'ट', 30: 'थ', 31: 'ठ', 32: 'त्र', 33: 'व', 34: 'ञ', 35: 'य'}

In [12]:
predicted_label = class_mapping[predicted_class]

In [13]:
print(f"Prediction: {predicted_label}")

Prediction: ज


In [14]:
cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()

    cv2.imshow('Sign Language Detection',frame)

    if cv2.waitKey(10) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

In [10]:
import cv2
import mediapipe as mp
import torch
from torchvision import transforms
from PIL import Image, ImageDraw, ImageFont

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your custom model
model = HandSignClassifier(num_classes=36).to(device)
model.load_state_dict(torch.load('../hand_sign_classifier4.pth'))
model.eval()

# Your transformation pipeline for inference
transform_inference = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(), 
])

# Class mapping
class_mapping = {0: 'ब', 1: 'भ', 2: 'च', 3: 'छ', 4: 'स', 5: 'द', 6: 'ड', 7: 'ध', 8: 'ढ', 9: 'ग', 10: 'घ', 11: 'ज्ञ', 12: 'ह', 13: 'ज',
                 14: 'झ', 15: 'क', 16: 'ख', 17: 'क्ष', 18: 'ल', 19: 'ष', 20: 'म', 21: 'न', 22: 'ण', 23: 'ङ', 24: 'प', 25: 'फ', 26: 'र',
                 27: 'श', 28: 'त', 29: 'ट', 30: 'थ', 31: 'ठ', 32: 'त्र', 33: 'व', 34: 'ञ', 35: 'य'}

# Function to get hand bounding box
def get_hand_bounding_box(frame, hand_landmarks, padding=20):
    x_min, x_max, y_min, y_max = frame.shape[1], 0, frame.shape[0], 0
    for landmark in hand_landmarks.landmark:
        x, y = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
        x_min = min(x_min, x)
        x_max = max(x_max, x)
        y_min = min(y_min, y)
        y_max = max(y_max, y)

    # Add padding to the bounding box coordinates
    x_min = max(0, x_min - padding)
    x_max = min(frame.shape[1], x_max + padding)
    y_min = max(0, y_min - padding)
    y_max = min(frame.shape[0], y_max + padding)

    return x_min, x_max, y_min, y_max

# Initialize Mediapipe hands module
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()
i=1
# Start capturing video from the webcam 
cap = cv2.VideoCapture("E:/signs/NSL23/S1_NSL_consonent_Bright_Cropped/S1_KA.mov")

while cap.isOpened():
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame horizontally for a later selfie-view display, and convert the BGR image to RGB
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect hands in the frame
    results = hands.process(rgb_frame)

    # If hands are detected, get bounding box and classify the hand gesture
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Get hand bounding box
            x_min, x_max, y_min, y_max = get_hand_bounding_box(frame, hand_landmarks)

            # Crop the hand region from the frame
            hand_crop = frame[y_min:y_max, x_min:x_max]

            # Apply the transformation to the cropped hand image
            pil_image = Image.fromarray(cv2.cvtColor(hand_crop, cv2.COLOR_BGR2RGB))
            transformed_image = transform_inference(pil_image).unsqueeze(0).to(device)

            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            pil_image_draw = ImageDraw.Draw(pil_image)
            font = ImageFont.load_default()

            # Get model prediction
            with torch.no_grad():
                model_output = model(transformed_image)
                predicted_class = torch.argmax(model_output).item()
                predicted_label = class_mapping[predicted_class]
            
            print(predicted_label,i)
            i=i+1

            # Display the bounding box and predicted label
            pil_image_draw.rectangle([x_min, y_min, x_max, y_max], outline=(0, 255, 0), width=2)
            pil_image_draw.text((x_min, y_min - 10), predicted_label, font=font, fill=(0, 255, 0))

            frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    # Display the frame
    cv2.imshow('Hand Gesture Recognition', frame)

    # Break the loop when 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()


क 1
क 2
क 3
क 4
क 5
क 6
क 7
क 8
क 9
क 10
क 11
क 12
क 13
क 14
क 15
क 16
क 17
क 18
क 19
क 20
क 21
क 22
क 23
क 24
क 25
क 26
क 27
क 28
क 29
क 30
क 31
क 32
क 33
क 34
क 35
क 36
क 37
क 38
क 39
क 40
क 41
क 42
क 43
क 44
क 45
क 46
क 47
क 48
क 49
क 50
क 51
क 52
क 53
क 54
क 55
क 56
क 57
क 58
क 59
क 60
क 61
क 62
क 63
क 64
क 65
क 66
क 67
क 68
क 69
क 70
क 71
क 72
क 73
क 74
क 75
क 76
क 77
क 78
क 79
क 80
क 81
क 82
क 83
क 84
क 85
क 86
क 87
क 88
क 89
क 90
क 91
क 92
क 93
क 94
क 95
क 96
क 97


In [12]:
import cv2
import mediapipe as mp
import torch
from torchvision import transforms
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Your transformation pipeline for inference
transform_inference = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# Class mapping
class_mapping = {0: 'ब', 1: 'भ', 2: 'च', 3: 'छ', 4: 'स', 5: 'द', 6: 'ड', 7: 'ध', 8: 'ढ', 9: 'ग', 10: 'घ', 11: 'ज्ञ', 12: 'ह', 13: 'ज',
                 14: 'झ', 15: 'क', 16: 'ख', 17: 'क्ष', 18: 'ल', 19: 'ष', 20: 'म', 21: 'न', 22: 'ण', 23: 'ङ', 24: 'प', 25: 'फ', 26: 'र',
                 27: 'श', 28: 'त', 29: 'ट', 30: 'थ', 31: 'ठ', 32: 'त्र', 33: 'व', 34: 'ञ', 35: 'य'}

# Function to get hand bounding box
def get_hand_bounding_box(frame, hand_landmarks, padding=20):
    x_min, x_max, y_min, y_max = frame.shape[1], 0, frame.shape[0], 0
    for landmark in hand_landmarks.landmark:
        x, y = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
        x_min = min(x_min, x)
        x_max = max(x_max, x)
        y_min = min(y_min, y)
        y_max = max(y_max, y)

    # Add padding to the bounding box coordinates
    x_min = max(0, x_min - padding)
    x_max = min(frame.shape[1], x_max + padding)
    y_min = max(0, y_min - padding)
    y_max = min(frame.shape[0], y_max + padding)

    return x_min, x_max, y_min, y_max

# Initialize Mediapipe hands module
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Load the video
video_path = "E:/signs/NSL23/S1_NSL_consonent_Bright_Cropped/S1_JA.mov"
cap = cv2.VideoCapture(video_path)

frame_count = 0
frame_skip_interval = 5

while cap.isOpened():
    # Read a frame from the video
    ret, frame = cap.read()
    if not ret:
        break

    # Apply frame subsampling
    if frame_count % frame_skip_interval == 0:
        # Convert the frame from BGR to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect hands in the frame
        results = hands.process(rgb_frame)

        # If hands are detected, get bounding box and classify the hand gesture
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Get hand bounding box
                x_min, x_max, y_min, y_max = get_hand_bounding_box(frame, hand_landmarks)

                # Crop the hand region from the frame
                hand_crop = frame[y_min:y_max, x_min:x_max]

                # Apply the transformation to the cropped hand image
                pil_image = Image.fromarray(cv2.cvtColor(hand_crop, cv2.COLOR_BGR2RGB))
                transformed_image = transform_inference(pil_image)
                transformed_image = transformed_image.unsqueeze(0).to(device)

                # Get model prediction
                with torch.no_grad():
                    # Assuming `model` is loaded and defined
                    model_output = model(transformed_image)
                    predicted_class = torch.argmax(model_output).item()
                    predicted_label = class_mapping[predicted_class]

                # Display bounding box and predicted label on the frame
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                cv2.putText(frame, predicted_label, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                # Print the predicted label for each extracted hand image
                print(f"Frame {frame_count}: Predicted Label: {predicted_label}")

    # Write the frame with annotations to the output video

    # Display the frame
    cv2.imshow('Hand Gesture Recognition', frame)

    # Break the loop when 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    frame_count += 1

# Release the video capture and writer objects
cap.release()
output_video.release()
cv2.destroyAllWindows()


Frame 10: Predicted Label: ल
Frame 15: Predicted Label: ज
Frame 20: Predicted Label: ज
Frame 25: Predicted Label: ज
Frame 30: Predicted Label: ज
Frame 35: Predicted Label: ज
Frame 40: Predicted Label: ज
Frame 45: Predicted Label: ज
Frame 50: Predicted Label: ज
Frame 55: Predicted Label: ज
Frame 60: Predicted Label: ज
Frame 65: Predicted Label: ज
Frame 70: Predicted Label: ज
Frame 75: Predicted Label: ज
Frame 80: Predicted Label: ज
Frame 85: Predicted Label: ज
Frame 90: Predicted Label: ज


NameError: name 'output_video' is not defined

In [None]:
import cv2
import torch
import numpy as np
from torchvision import transforms
import mediapipe as mp
import torch.nn as nn
from PIL import Image

# Load your custom models
class HandSignClassifier(nn.Module):
    def __init__(self, num_classes):
        super(HandSignClassifier, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(256 * 7 * 7, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

# Path to your saved model
model_path = "E:/major-project/hand_sign_classifier_final.pth"
model = HandSignClassifier(num_classes=36)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# MediaPipe Hands detector
mp_hands = mp.solutions.hands
hands_detector = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.)

# Function to calculate hand bounding box coordinates from landmarks
def get_hand_bounding_box(frame, hand_landmarks, padding=20):
    x_min, x_max, y_min, y_max = frame.shape[1], 0, frame.shape[0], 0
    for landmark in hand_landmarks.landmark:
        x, y = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
        x_min = min(x_min, x)
        x_max = max(x_max, x)
        y_min = min(y_min, y)
        y_max = max(y_max, y)

    # Add padding to the bounding box coordinates
    x_min = max(0, x_min - padding)
    x_max = min(frame.shape[1], x_max + padding)
    y_min = max(0, y_min - padding)
    y_max = min(frame.shape[0], y_max + padding)

    return x_min, x_max, y_min, y_max

def preprocess_image(frame, hand_landmarks):
    # Access the hand landmarks (keypoints) to get the bounding box coordinates
    x_min, x_max, y_min, y_max = get_hand_bounding_box(frame, hand_landmarks)

    # Crop the hand region
    hand_image = frame[y_min:y_max, x_min:x_max]

    # Convert NumPy array to PIL Image
    hand_image_pil = Image.fromarray(cv2.cvtColor(hand_image, cv2.COLOR_BGR2RGB))

    # Apply the grayscale transform
    transform = transforms.Compose([
        transforms.Grayscale(),
        transforms.ToTensor(),
    ])
    
    input_tensor = transform(hand_image_pil).unsqueeze(0)

    return input_tensor

def predict_hand_sign(frame, model):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands_detector.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            input_tensor = preprocess_image(frame, hand_landmarks)
            model.eval()
            with torch.no_grad():
                output = model(input_tensor)

            _, predicted_class = torch.max(output, 1)
            return predicted_class.item()

    return None

# Path to the image you want to test
image_path = "E:/Downloads/20240221_095053.jpg"
image = cv2.imread(image_path)

# Display the original image
cv2.imshow('Original Image', image)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Make prediction and display the result
predicted_class = predict_hand_sign(image, model)
print("Predicted Class Index:", predicted_class)

# Map the class index to the corresponding label
class_mapping_nepali = {
    0: 'क', 1: 'ख', 2: 'ग', 3: 'घ', 4: 'ङ', 5: 'च',
    6: 'छ', 7: 'ज', 8: 'झ', 9: 'ञ', 10: 'ट', 11: 'ठ',
    12: 'ड', 13: 'ढ', 14: 'ण', 15: 'त', 16: 'थ', 17: 'द',
    18: 'ध', 19: 'न', 20: 'प', 21: 'फ', 22: 'ब', 23: 'भ',
    24: 'म', 25: 'य', 26: 'र', 27: 'ल', 28: 'व', 29: 'श',
    30: 'ष', 31: 'स', 32: 'ह', 33: 'क्ष', 34: 'त्र', 35: 'ज्ञ'
}

predicted_label = class_mapping_nepali[predicted_class]
print("Predicted Class Label:", predicted_label)
