In [None]:
# Image model: live webcam classification of ASL letters (asl_image_model.pth)
import cv2
import numpy as np
import torch
from torchvision import transforms, models
import mediapipe as mp

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Labels indexed by the classifier's output position (the prediction index is
# used directly as an index into this list). J and Z are not in this list.
class_names = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'
]

# MobileNetV2 without pretrained weights; the last classifier layer is
# replaced so it emits exactly one logit per entry in class_names.
# `weights=None` is the non-deprecated spelling of `pretrained=False`.
model = models.mobilenet_v2(weights=None)
model.classifier[1] = torch.nn.Linear(model.last_channel, len(class_names))
model = model.to(device)

model_path = "/home/mostafabakr/Desktop/Project X/Final_models/asl_image_model.pth"
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs; it avoids executing arbitrary pickled code and the
# warning torch emits when the flag is left unset.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# MediaPipe hand tracker in video (tracking) mode, limited to a single hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam.")
    cap.release()
    # `exit()` is an interactive-shell helper that terminates with a success
    # status; raise SystemExit with a non-zero code so the failure is visible.
    raise SystemExit(1)

# Preprocessing applied to each padded hand crop before it is fed to the model.
# The mean/std values are the commonly used ImageNet channel statistics.
img_size = 224
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def refine_segmentation(hand_image, mask):
    """Mask ``hand_image`` down to the largest connected region of ``mask``.

    The mask is first cleaned with a morphological close followed by an open
    (5x5 elliptical kernel). The contour with the greatest area is then filled
    into a fresh mask and applied to the image.

    Args:
        hand_image: Image to be masked.
        mask: Single-channel binary mask; presumably the same height and width
            as ``hand_image`` (required by ``cv2.bitwise_and`` - confirm at
            the call site).

    Returns:
        ``hand_image`` with every pixel outside the largest contour set to
        zero, or ``hand_image`` itself, unmodified, when the cleaned mask
        contains no contours.
    """
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

    # Close first, then open, exactly in that order.
    cleaned = mask
    for operation in (cv2.MORPH_CLOSE, cv2.MORPH_OPEN):
        cleaned = cv2.morphologyEx(cleaned, operation, ellipse)

    blobs, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not blobs:
        # Nothing survived the clean-up: fall back to the unmasked crop.
        return hand_image

    biggest = max(blobs, key=cv2.contourArea)
    keep = np.zeros_like(cleaned)
    cv2.drawContours(keep, [biggest], -1, 255, thickness=cv2.FILLED)
    return cv2.bitwise_and(hand_image, hand_image, mask=keep)

# Main capture loop: detect a hand, crop and segment it, classify the crop and
# draw the result on the live frame. Press 'q' in the window to quit.

# HSV bounds of the skin-colour mask and the pixel margin added around the
# landmark bounding box. They never change, so they are built once.
lower_skin = np.array([0, 20, 70], dtype=np.uint8)
upper_skin = np.array([20, 255, 255], dtype=np.uint8)
box_margin = 20

try:
    while True:
        success, frame = cap.read()
        if not success:
            print("Error: Could not read frame from webcam.")
            break

        # The detector gets an RGB copy; the BGR frame is kept for cropping
        # and drawing.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)
        h, w = frame.shape[:2]

        # multi_hand_landmarks is falsy when no hand was detected.
        for hand_landmarks in results.multi_hand_landmarks or ():
            xs = [lm.x for lm in hand_landmarks.landmark]
            ys = [lm.y for lm in hand_landmarks.landmark]

            # Landmark coordinates are scaled by the frame size and padded.
            # Clamp BOTH edges into [0, size]: an unclamped negative upper edge
            # would be read as a from-the-end index by the slice below and
            # silently crop the wrong region.
            x_min = min(max(int(min(xs) * w) - box_margin, 0), w)
            y_min = min(max(int(min(ys) * h) - box_margin, 0), h)
            x_max = min(max(int(max(xs) * w) + box_margin, 0), w)
            y_max = min(max(int(max(ys) * h) + box_margin, 0), h)

            # An empty crop would make cv2.cvtColor raise, so skip it up front.
            if x_max <= x_min or y_max <= y_min:
                continue

            hand_image = frame[y_min:y_max, x_min:x_max]

            hand_hsv = cv2.cvtColor(hand_image, cv2.COLOR_BGR2HSV)
            mask = cv2.inRange(hand_hsv, lower_skin, upper_skin)
            hand_segmented = refine_segmentation(hand_image, mask)

            # Letterbox: scale the longer side to img_size and centre the
            # result on a white square canvas. Each side is kept >= 1 pixel
            # because cv2.resize rejects a zero-sized target (very thin crops).
            h_crop, w_crop = hand_segmented.shape[:2]
            scale = img_size / max(h_crop, w_crop)
            new_w = min(img_size, max(1, int(w_crop * scale)))
            new_h = min(img_size, max(1, int(h_crop * scale)))
            resized = cv2.resize(hand_segmented, (new_w, new_h))

            padded_image = np.full((img_size, img_size, 3), 255, dtype=np.uint8)
            start_x = (img_size - new_w) // 2
            start_y = (img_size - new_h) // 2
            padded_image[start_y:start_y + new_h, start_x:start_x + new_w] = resized

            # NOTE(review): padded_image is still in OpenCV's BGR channel order
            # while ToPILImage treats arrays as RGB. Left unchanged because the
            # training-time preprocessing is not visible here - confirm.
            input_image = transform(padded_image).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = model(input_image)
                probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
                confidence, predicted = torch.max(probabilities, 0)
            confidence = confidence.item()
            predicted_index = predicted.item()

            # Predictions below 50% confidence are shown as "Unknown".
            if confidence >= 0.5 and 0 <= predicted_index < len(class_names):
                label = class_names[predicted_index]
                confidence_text = f"{confidence * 100:.1f}%"
            else:
                label = "Unknown"
                confidence_text = ""

            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
            # Keep the label inside the frame when the box touches the top edge.
            label_y = max(y_min - 10, 30)
            cv2.putText(frame, label, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            if confidence_text:
                # Filled bar whose width (max 200 px) is proportional to confidence.
                bar_width = int(200 * confidence)
                cv2.rectangle(frame, (x_min, y_max + 10), (x_min + bar_width, y_max + 30), (0, 0, 255), -1)
                cv2.putText(frame, confidence_text, (x_min, y_max + 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        cv2.imshow("Hand Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the MediaPipe graph and the window, even when the
    # loop is left through an exception or a kernel interrupt.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()



Using device: cuda


  model.load_state_dict(torch.load(model_path, map_location=device))


In [None]:
# Video model: live webcam classification of the ASL letters J and Z (asl_video_model.pth)
import cv2
import numpy as np
import torch
from torchvision import transforms, models
import mediapipe as mp

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Labels indexed by the classifier's output position (the prediction index is
# used directly as an index into this list).
class_names = ['J', 'Z']

# MobileNetV2 without pretrained weights; the last classifier layer is
# replaced so it emits exactly one logit per entry in class_names.
# `weights=None` is the non-deprecated spelling of `pretrained=False`.
model = models.mobilenet_v2(weights=None)
model.classifier[1] = torch.nn.Linear(model.last_channel, len(class_names))
model = model.to(device)

model_path = "/home/mostafabakr/Desktop/Project X/Final_models/asl_video_model.pth"
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs; it avoids executing arbitrary pickled code and the
# warning torch emits when the flag is left unset.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# MediaPipe hand tracker in video (tracking) mode, limited to a single hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils

# Preprocessing applied to each padded hand crop before it is fed to the model.
# The mean/std values are the commonly used ImageNet channel statistics.
img_size = 224
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam.")
    cap.release()
    # `exit()` is an interactive-shell helper that terminates with a success
    # status; raise SystemExit with a non-zero code so the failure is visible.
    raise SystemExit(1)

def refine_segmentation(hand_image, mask):
    """Mask ``hand_image`` down to the largest connected region of ``mask``.

    The mask is first cleaned with a morphological close followed by an open
    (5x5 elliptical kernel). The contour with the greatest area is then filled
    into a fresh mask and applied to the image.

    Args:
        hand_image: Image to be masked.
        mask: Single-channel binary mask; presumably the same height and width
            as ``hand_image`` (required by ``cv2.bitwise_and`` - confirm at
            the call site).

    Returns:
        ``hand_image`` with every pixel outside the largest contour set to
        zero, or ``hand_image`` itself, unmodified, when the cleaned mask
        contains no contours.
    """
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

    # Close first, then open, exactly in that order.
    cleaned = mask
    for operation in (cv2.MORPH_CLOSE, cv2.MORPH_OPEN):
        cleaned = cv2.morphologyEx(cleaned, operation, ellipse)

    blobs, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not blobs:
        # Nothing survived the clean-up: fall back to the unmasked crop.
        return hand_image

    biggest = max(blobs, key=cv2.contourArea)
    keep = np.zeros_like(cleaned)
    cv2.drawContours(keep, [biggest], -1, 255, thickness=cv2.FILLED)
    return cv2.bitwise_and(hand_image, hand_image, mask=keep)

# Main capture loop: detect a hand, crop and segment it, classify the crop and
# draw the result on the live frame. Press 'q' in the window to quit.

# HSV bounds of the skin-colour mask and the pixel margin added around the
# landmark bounding box. They never change, so they are built once.
lower_skin = np.array([0, 20, 70], dtype=np.uint8)
upper_skin = np.array([20, 255, 255], dtype=np.uint8)
box_margin = 20

try:
    while True:
        success, frame = cap.read()
        if not success:
            print("Error: Could not read frame from webcam.")
            break

        # The detector gets an RGB copy; the BGR frame is kept for cropping
        # and drawing.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)
        h, w = frame.shape[:2]

        # multi_hand_landmarks is falsy when no hand was detected.
        for hand_landmarks in results.multi_hand_landmarks or ():
            xs = [lm.x for lm in hand_landmarks.landmark]
            ys = [lm.y for lm in hand_landmarks.landmark]

            # Landmark coordinates are scaled by the frame size and padded.
            # Clamp BOTH edges into [0, size]: an unclamped negative upper edge
            # would be read as a from-the-end index by the slice below and
            # silently crop the wrong region.
            x_min = min(max(int(min(xs) * w) - box_margin, 0), w)
            y_min = min(max(int(min(ys) * h) - box_margin, 0), h)
            x_max = min(max(int(max(xs) * w) + box_margin, 0), w)
            y_max = min(max(int(max(ys) * h) + box_margin, 0), h)

            # An empty crop would make cv2.cvtColor raise, so skip it up front.
            if x_max <= x_min or y_max <= y_min:
                continue

            hand_image = frame[y_min:y_max, x_min:x_max]

            hand_hsv = cv2.cvtColor(hand_image, cv2.COLOR_BGR2HSV)
            mask = cv2.inRange(hand_hsv, lower_skin, upper_skin)
            hand_segmented = refine_segmentation(hand_image, mask)

            # Letterbox: scale the longer side to img_size and centre the
            # result on a white square canvas. Each side is kept >= 1 pixel
            # because cv2.resize rejects a zero-sized target (very thin crops).
            h_crop, w_crop = hand_segmented.shape[:2]
            scale = img_size / max(h_crop, w_crop)
            new_w = min(img_size, max(1, int(w_crop * scale)))
            new_h = min(img_size, max(1, int(h_crop * scale)))
            resized = cv2.resize(hand_segmented, (new_w, new_h))

            padded_image = np.full((img_size, img_size, 3), 255, dtype=np.uint8)
            start_x = (img_size - new_w) // 2
            start_y = (img_size - new_h) // 2
            padded_image[start_y:start_y + new_h, start_x:start_x + new_w] = resized

            # NOTE(review): padded_image is still in OpenCV's BGR channel order
            # while ToPILImage treats arrays as RGB. Left unchanged because the
            # training-time preprocessing is not visible here - confirm.
            input_image = transform(padded_image).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = model(input_image)
                probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
                confidence, predicted = torch.max(probabilities, 0)
            confidence = confidence.item()
            predicted_index = predicted.item()

            # Keep the label inside the frame when the box touches the top edge.
            label_y = max(y_min - 10, 30)

            # Only predictions with at least 95% confidence are reported;
            # everything else is labelled "Unknown" without a box or bar.
            if confidence >= 0.95 and 0 <= predicted_index < len(class_names):
                label = class_names[predicted_index]
                confidence_text = f"{confidence * 100:.1f}%"

                # Filled bar whose width (max 200 px) is proportional to confidence.
                bar_width = int(200 * confidence)
                cv2.rectangle(frame, (x_min, y_max + 10), (x_min + bar_width, y_max + 30), (0, 255, 0), -1)
                cv2.putText(frame, confidence_text, (x_min, y_max + 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                cv2.putText(frame, label, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            else:
                cv2.putText(frame, "Unknown", (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow("Hand Gesture Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the MediaPipe graph and the window, even when the
    # loop is left through an exception or a kernel interrupt.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


Using device: cuda


  model.load_state_dict(torch.load(model_path, map_location=device))
