In [1]:
!pip install googletrans==4.0.0-rc1



In [2]:
import cv2
import numpy as np
import math
import time
from cvzone.HandTrackingModule import HandDetector
from tensorflow.keras.models import load_model
from googletrans import Translator
from googletrans import LANGUAGES


In [3]:
label_map = {
    0: 'tur_N', 1: 'tur_O', 2: 'tur_P', 3: 'tur_R', 4: 'tur_S', 5: 'tur_T', 
    6: 'tur_U', 7: 'tur_V', 8: 'tur_Y', 9: 'tur_Z', 10: 'tur_D', 11: 'tur_E', 
    12: 'tur_F', 13: 'tur_G', 14: 'tur_H', 15: 'tur_I', 16: 'tur_J', 17: 'tur_K', 
    18: 'tur_L', 19: 'tur_M', 20: 'bis_Q', 21: 'bis_O', 22: 'bis_T', 23: 'bis_tur_C', 
    24: 'bis_D', 25: 'bis_U', 26: 'bis_M', 27: 'bis_K', 28: 'bis_B', 29: 'bis_Y', 
    30: 'bis_S', 31: 'bis_L', 32: 'bis_F', 33: 'bis_Z', 34: 'bis_E', 35: 'bis_G', 
    36: 'bis_P', 37: 'bis_A', 38: 'bis_X', 39: 'bis_V', 40: 'bis_R', 41: 'bis_W', 
    42: 'bis_N', 43: 'bis_I', 44: 'bis_H', 45: 'tur_A', 46: 'tur_B'
}


In [4]:
def preprocess_landmarks(hand_landmarks, bbox, img_size=224):
    # Create a completely black canvas
    black_canvas = np.zeros((img_size, img_size, 3), dtype=np.uint8)

    # Extract bounding box coordinates
    x_min, y_min, w, h = bbox
    max_width = w if w > 0 else 1
    max_height = h if h > 0 else 1

    for lm in hand_landmarks:
        # Only use the 2D x, y values from the landmarks (ignoring z-depth)
        x, y = lm[0], lm[1]
        
        # Scale landmarks relative to bounding box
        scaled_x = int(((x - x_min) / max_width) * img_size)
        scaled_y = int(((y - y_min) / max_height) * img_size)
        
        if 0 <= scaled_x < img_size and 0 <= scaled_y < img_size:
            cv2.circle(black_canvas, (scaled_x, scaled_y), 5, (255, 255, 255), -1)

    # Normalize pixel values
    normalized_img = black_canvas / 255.0
    return normalized_img



In [5]:
for lang_code, lang_name in LANGUAGES.items():
    print(f"{lang_code}: {lang_name}")

af: afrikaans
sq: albanian
am: amharic
ar: arabic
hy: armenian
az: azerbaijani
eu: basque
be: belarusian
bn: bengali
bs: bosnian
bg: bulgarian
ca: catalan
ceb: cebuano
ny: chichewa
zh-cn: chinese (simplified)
zh-tw: chinese (traditional)
co: corsican
hr: croatian
cs: czech
da: danish
nl: dutch
en: english
eo: esperanto
et: estonian
tl: filipino
fi: finnish
fr: french
fy: frisian
gl: galician
ka: georgian
de: german
el: greek
gu: gujarati
ht: haitian creole
ha: hausa
haw: hawaiian
iw: hebrew
he: hebrew
hi: hindi
hmn: hmong
hu: hungarian
is: icelandic
ig: igbo
id: indonesian
ga: irish
it: italian
ja: japanese
jw: javanese
kn: kannada
kk: kazakh
km: khmer
ko: korean
ku: kurdish (kurmanji)
ky: kyrgyz
lo: lao
la: latin
lv: latvian
lt: lithuanian
lb: luxembourgish
mk: macedonian
mg: malagasy
ms: malay
ml: malayalam
mt: maltese
mi: maori
mr: marathi
mn: mongolian
my: myanmar (burmese)
ne: nepali
no: norwegian
or: odia
ps: pashto
fa: persian
pl: polish
pt: portuguese
pa: punjabi
ro: romanian
r

In [6]:
language_code_map = {
    'bis': 'id',  # BISINDO maps to Indonesian ('id')
    'turk': 'tr'  # Turkish maps to Turkish ('tr')
}

In [7]:
# init translator
translator = Translator()

def translate_text(text, source_language='auto', target_language='en'):
    translation = translator.translate(text, src=source_language, dest=target_language)
    return translation.text

In [None]:



# Initialize video capture and hand detector
cap = cv2.VideoCapture(0)
detector = HandDetector(maxHands=2)

# Load the trained model
model = load_model('../models/slt_model_rev2.h5') # TODO: change model here

# Initialize parameters
offset = 45

recognized_labels = []  #list of alphabets
input_text = [] #input to the translator; this is what the user signs in written word
output_text = ""
last_capture_time = 0
pause_time = 5  # pause in between frames; NOTE: can remove if u want!
img_size = 224  # resize target for preprocessing
language_counts = {
    'bis': 0,
    'tur': 0
}


while True:
    success, img = cap.read()
    if not success:
        break

    current_time = time.time()-5

    # Detect hands in the frame
    hands, img = detector.findHands(img, draw=True) 

    #ad's code
    if hands and (current_time - last_capture_time > pause_time):
        # Initialize variables for the combined bounding box
        x_min, y_min = float('inf'), float('inf')
        x_max, y_max = float('-inf'), float('-inf')

        # Loop through detected hands to compute the combined bounding box
        for hand in hands:
            x, y, w, h = hand['bbox']
            x_min = min(x_min, x - offset)
            y_min = min(y_min, y - offset)
            x_max = max(x_max, x + w + offset)
            y_max = max(y_max, y + h + offset)

        # Ensure the bounding box is within image boundaries
        x_min = max(0, x_min)
        y_min = max(0, y_min)
        x_max = min(img.shape[1], x_max)
        y_max = min(img.shape[0], y_max)

        # Draw the single combined bounding box
        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (255, 0, 255), 2)

        img_just_hands = img[int(y_min):int(y_max), int(x_min):int(x_max)].copy()
        # Black background for landmarks only
        img_landmarks = np.zeros_like(img)
        # Draw landmarks for both hands
        for hand in hands:
            lm_list = hand['lmList']  # List of hand landmarks
            for lm in lm_list:
                cx, cy = lm[:2]
                cv2.circle(img, (cx, cy), 5, (0, 255, 0), cv2.FILLED)
                cv2.circle(img_landmarks, (cx, cy), 5, (0, 255, 0), cv2.FILLED)

        # Crop the image based on the combined bounding box
        img_crop = img[int(y_min):int(y_max), int(x_min):int(x_max)]
        img_landmarks_crop = img_landmarks[int(y_min):int(y_max), int(x_min):int(x_max)]

        # Resize and pad the cropped image to maintain aspect ratio
        aspect_ratio = (y_max - y_min) / (x_max - x_min)
        img_pad = np.zeros((img_size, img_size, 3), np.uint8)
        img_landmarks_pad = np.zeros((img_size, img_size, 3), np.uint8)

        if aspect_ratio > 1:
            # Height is greater; resize width
            k = img_size / (y_max - y_min)
            new_w = math.ceil(k * (x_max - x_min))
            img_resize = cv2.resize(img_crop, (new_w, img_size))
            img_landmarks_resize = cv2.resize(img_landmarks_crop, (new_w, img_size))
            w_offset = (img_size - new_w) // 2
            img_pad[:, w_offset:w_offset + new_w] = img_resize
            img_landmarks_pad[:, w_offset:w_offset + new_w] = img_landmarks_resize
        else:
            # Width is greater; resize height
            k = img_size / (x_max - x_min)
            new_h = math.ceil(k * (y_max - y_min))
            img_resize = cv2.resize(img_crop, (img_size, new_h))
            img_landmarks_resize = cv2.resize(img_landmarks_crop, (img_size, new_h))
            h_offset = (img_size - new_h) // 2
            img_pad[h_offset:h_offset + new_h, :] = img_resize
            img_landmarks_pad[h_offset:h_offset + new_h, :] = img_landmarks_resize
    
        img_landmarks_normalized = img_landmarks_pad / 255.0
        img_input = np.expand_dims(img_landmarks_normalized, axis=0)  # shape: (1, 224, 224, 3)

        #predicting the letter
        try:
            prediction = model.predict(img_input)
            predicted_index = np.argmax(prediction[0])
            print(predicted_index)
            predicted_label = label_map.get(predicted_index)

            if predicted_label != "Unknown":
                recognized_labels.append(predicted_label)
                input_text.append(predicted_label.split('_')[-1])  # Add only new letter

            # Update language counts
            language = predicted_label.split('_')[0]
            language_counts[language] += 1

            last_capture_time = current_time  # Reset timer
        except Exception as e:
            print(e)


    # Display the recognized sequence in the top-left corner
    if input_text:  # Display only if there's input
        cv2.putText(img, 'Input:', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.4, (0, 255, 0), 3)
        cv2.putText(img, ''.join(input_text), (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4)

    # Display the translated output text in the bottom-right corner
    if output_text:
        output_header = "Output:"
        text_size_header = cv2.getTextSize(output_header, cv2.FONT_HERSHEY_SIMPLEX, 1.4, 3)[0]
        text_size_output = cv2.getTextSize(output_text, cv2.FONT_HERSHEY_SIMPLEX, 2, 4)[0]

        x_header = img.shape[1] - text_size_header[0] - 20
        y_header = img.shape[0] - text_size_output[1] - 20

        x_output = img.shape[1] - text_size_output[0] - 20
        y_output = img.shape[0] - 20

        # Draw output header and text
        cv2.putText(img, output_header, (x_header, y_header - 20), cv2.FONT_HERSHEY_SIMPLEX, 1.4, (0, 255, 0), 3)  # Double size
        cv2.putText(img, output_text, (x_output, y_output), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 4)  # Double size
    
    # Display instructions in the bottom-left corner
    instructions = [
        "Press 'D' to delete input",
        "Press 'T' to translate",
        "Press 'Q' to quit"
    ]

    y_start = img.shape[0] - 80
    line_height = 30

    for i, text in enumerate(instructions):
        y_position = y_start + i * line_height
        cv2.putText(img, text, (10, y_position), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)



    # Display the live video feed
    cv2.imshow("Image", img)

    #key press
    key = cv2.waitKey(1) & 0xFF

    if key == ord('t'):  # Translate
        if input_text:
            input_text_str = ''.join(input_text)
            input_language = max(language_counts, key=language_counts.get) # bis or turk
            input_language_code = language_code_map.get(input_language, 'auto')  # maps to google language code, default to 'auto' if not found
            print(input_language_code)
            output_text = translate_text(input_text_str, source_language=input_language_code, target_language='en') #TODO: create input to change the target language

            print(output_text)

    elif key == ord('d'):  # Delete
        if (input_text):
            input_text.clear() 
        if (output_text):
            output_text = ""
        print("Input text cleared.")

    elif key == ord('q'):  # Quit
        break

# Cleanup
cap.release()
cv2.destroyAllWindows()


I0000 00:00:1734019925.618584 6818570 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
W0000 00:00:1734019925.679558 6824305 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734019925.696139 6824305 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
34
Input text cleared.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
41
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
37
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
1
id
Wao
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
21
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
21


: 