In [1]:
import os
import sys
import torch
import cv2
import numpy as np
import mediapipe as mp
import yaml
import pickle

# Add project directory to path
sys.path.append(os.getcwd())

# Import necessary project modules
from modelling.model import build_model
from dataset.Dataloader import build_dataloader
from utils.misc import load_config

class SignLanguageRecognitionDemo:
    """Interactive demo that runs a sign language recognition model on video.

    The demo loads a project configuration, a pickled gloss dictionary and the
    ``best.ckpt`` checkpoint, extracts MediaPipe Holistic landmarks from each
    frame, and overlays the top predicted glosses on an OpenCV preview window.

    NOTE(review): the recorded output of this notebook cell ends with
    ``NameError: name 'logger' is not defined`` without a traceback. The origin
    is not visible here -- presumably a project module used during
    construction expects a logger to be initialised first; confirm.
    """

    # Title of the OpenCV preview window shared by both demo modes.
    _WINDOW_NAME = 'Sign Language Recognition'

    def __init__(self, config_path):
        """
        Initialize the sign language recognition demo

        Args:
            config_path (str): Path to the configuration YAML file
        """
        # Load configuration
        self.cfg = load_config(config_path)

        # Load gloss dictionary.
        # SECURITY: pickle.load can execute arbitrary code while deserialising;
        # only point the config at gloss files from a trusted source.
        gloss_file = self.cfg['model']['RecognitionNetwork']['GlossTokenizer']['gloss2id_file']
        with open(gloss_file, 'rb') as f:
            self.gloss_dict = pickle.load(f)
        self.id2gloss = {v: k for k, v in self.gloss_dict.items()}

        # Pick the device once so checkpoint, model and inputs all agree.
        # A hard-coded 'cuda' map_location makes torch.load fail on CPU-only hosts.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize model
        self.model = build_model(self.cfg)

        # Load best checkpoint
        checkpoint_path = os.path.join(self.cfg['training']['model_dir'], 'ckpts', 'best.ckpt')
        state_dict = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(state_dict['model_state'])
        self.model.to(self.device)
        self.model.eval()

        # Initialize MediaPipe for keypoint extraction
        self.mp_holistic = mp.solutions.holistic
        self.holistic = self.mp_holistic.Holistic(
            static_image_mode=False,
            model_complexity=1,
            smooth_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        # Prepare video preprocessing
        self.transform_cfg = self.cfg['data']['transform_cfg']

    def close(self):
        """Release the MediaPipe Holistic graph held by this demo."""
        self.holistic.close()

    def extract_keypoints(self, frame):
        """
        Extract keypoints from a frame using MediaPipe

        Args:
            frame (np.ndarray): Input video frame in OpenCV BGR channel order

        Returns:
            dict: Keypoints under the keys 'pose', 'mouth_half', 'hand' and
                'face_others_1_3'. 'pose' is a flat list of
                (x, y, z, visibility) values, 'hand' is a flat list of
                (x, y, z) values for the left hand followed by the right hand,
                and the two face entries are lists of [x, y, z] triples.
                A group that was not detected yields an empty list.
        """
        # MediaPipe expects RGB while OpenCV delivers BGR.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.holistic.process(rgb_frame)

        keypoints = {
            'pose': [],
            'mouth_half': [],
            'hand': [],
            'face_others_1_3': []
        }

        # Pose landmarks
        if results.pose_landmarks:
            for landmark in results.pose_landmarks.landmark:
                keypoints['pose'].extend([landmark.x, landmark.y, landmark.z, landmark.visibility])

        # Left and right hand landmarks; a missing hand contributes nothing.
        def process_hand(hand_landmarks):
            hand_points = []
            if hand_landmarks:
                for landmark in hand_landmarks.landmark:
                    hand_points.extend([landmark.x, landmark.y, landmark.z])
            return hand_points

        keypoints['hand'] = (process_hand(results.left_hand_landmarks)
                             + process_hand(results.right_hand_landmarks))

        # Face and mouth groups are both slices of the same face landmark list.
        # NOTE(review): the 0:68 / 0:20 index ranges are placeholders from the
        # original code ("customize this" / "adjust as needed"); confirm they
        # match the keypoint layout the model was trained on.
        if results.face_landmarks:
            face_points = results.face_landmarks.landmark
            keypoints['face_others_1_3'] = [
                [landmark.x, landmark.y, landmark.z] for landmark in face_points[0:68]
            ]
            keypoints['mouth_half'] = [
                [landmark.x, landmark.y, landmark.z] for landmark in face_points[0:20]
            ]

        return keypoints

    def preprocess_frame(self, frame):
        """
        Preprocess frame for model input

        Args:
            frame (np.ndarray): Input video frame

        Returns:
            dict: 'rgb' holds the resized frame as a float tensor scaled to
                [0, 1] with a leading batch dimension and channels first;
                'keypoint' maps each keypoint group to a float tensor with a
                leading batch dimension.
        """
        # Resize frame to the square size given by the transform config.
        side = self.transform_cfg['img_size']
        frame = cv2.resize(frame, (side, side))

        # Extract keypoints
        keypoints = self.extract_keypoints(frame)

        # HWC uint8 -> 1 x C x H x W float in [0, 1].
        # NOTE(review): the channels stay in OpenCV's BGR order here; confirm
        # whether the model expects RGB.
        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).float() / 255.0
        frame_tensor = frame_tensor.unsqueeze(0)  # Add batch dimension

        # Convert keypoints to tensor
        keypoint_tensors = {k: torch.tensor(v).float().unsqueeze(0) for k, v in keypoints.items()}

        return {
            'rgb': frame_tensor,
            'keypoint': keypoint_tensors
        }

    def _to_device(self, inputs):
        """Recursively move tensors inside (nested) dicts onto ``self.device``."""
        if torch.is_tensor(inputs):
            return inputs.to(self.device)
        if isinstance(inputs, dict):
            return {key: self._to_device(value) for key, value in inputs.items()}
        return inputs

    def recognize_sign(self, frame, top_k=5):
        """
        Recognize sign language from a frame

        Args:
            frame (np.ndarray): Input video frame
            top_k (int): Maximum number of glosses to return (default 5)

        Returns:
            list[str]: Up to ``top_k`` glosses ordered from most to least
                likely, or ``["No sign detected"]`` when the model output has
                no 'recognition_logits' entry.
        """
        with torch.no_grad():
            # Preprocess frame and place the inputs on the model's device.
            inputs = self._to_device(self.preprocess_frame(frame))

            # Run model
            output = self.model(is_train=False, **inputs)

            # Get top predictions
            if 'recognition_logits' in output:
                logits = output['recognition_logits']
                # torch.topk raises when k exceeds the class dimension, so clamp it.
                k = min(top_k, logits.shape[-1])
                _, top_k_indices = torch.topk(logits, k=k)
                return [self.id2gloss[idx.item()] for idx in top_k_indices[0]]

        return ["No sign detected"]

    def _run_capture_loop(self, source, mirror=False):
        """Read frames from ``source``, overlay predictions and show them until 'q' or end of stream."""
        cap = cv2.VideoCapture(source)
        try:
            if not cap.isOpened():
                print(f"Failed to open video source: {source}")
                return

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if mirror:
                    # Flip frame horizontally for natural view
                    frame = cv2.flip(frame, 1)

                # Recognize sign
                predictions = self.recognize_sign(frame)

                # Display predictions, one line of text per rank.
                for i, pred in enumerate(predictions):
                    cv2.putText(frame, f"{i+1}. {pred}",
                                (10, 30 + i*30),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                0.7, (0, 255, 0), 2)

                # Show frame
                cv2.imshow(self._WINDOW_NAME, frame)

                # Exit on 'q'
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the device and windows, even if inference raised;
            # otherwise the camera stays locked for the rest of the kernel session.
            cap.release()
            cv2.destroyAllWindows()

    def run_webcam_demo(self):
        """
        Run real-time sign language recognition demo using webcam
        """
        self._run_capture_loop(0, mirror=True)

    def recognize_video_file(self, video_path):
        """
        Recognize signs in a video file

        Args:
            video_path (str): Path to the input video file
        """
        self._run_capture_loop(video_path)

def main():
    """Build the demo from the bundled config and run the mode the user picks.

    The demo instance is constructed first, then a two-item menu is printed
    and the answer is read from standard input: '1' starts the webcam demo,
    '2' asks for a video path and runs the demo on that file, and anything
    else prints an error message.
    """
    demo = SignLanguageRecognitionDemo(
        "experiments/configs/TwoStream/phoenix-2014_keypoint.yaml"
    )

    # Menu
    for menu_line in (
        "Sign Language Recognition Demo",
        "1. Webcam Demo",
        "2. Video File Demo",
    ):
        print(menu_line)
    choice = input("Enter your choice (1/2): ")

    # Guard clause for unknown answers, then dispatch.
    if choice not in ('1', '2'):
        print("Invalid choice!")
        return

    if choice == '1':
        demo.run_webcam_demo()
        return

    video_path = input("Enter the path to your video file: ")
    demo.recognize_video_file(video_path)


if __name__ == "__main__":
    main()

2024-11-30 07:27:21.203752: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-30 07:27:21.220976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732919241.240953 3231851 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732919241.247011 3231851 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 07:27:21.269957: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Overwrite cfg.model.RecognitionNetwork.keypoint_s3d.in_channel -> 79


NameError: name 'logger' is not defined

In [11]:
import cv2

# Probe camera index 0: open it, try to grab a single frame, and report
# which step (open or read) failed, if any.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Failed to open camera")
else:
    ret, frame = cap.read()
    print("Camera is accessible" if ret else "Failed to capture frame")

# Release the handle whether or not the probe succeeded.
cap.release()

Camera is accessible


In [7]:
import cv2
# Probe camera index -1 and report whether the capture could be opened.
# (In the recorded run of this cell the open attempt failed.)
cap = cv2.VideoCapture(-1)
print("Camera is accessible" if cap.isOpened() else "Failed to open camera")
cap.release()

Failed to open camera
