# How to begin:

The sign language detection software needs a set of reference hand signs to match against. The "matching set" folder contains everything that is required. To get started:

1. Download the matching set folder
2. Go to the right-hand side of this page and click on the folder icon
3. At the top of the Files menu, there is an upload button
4. Navigate **inside** the match folder, and upload all files
5. Run all of the "SVD & other setup" cells
6. Choose between a photo or live feed, and run the associated cells
7. Use your **right** hand, and enjoy!

At the bottom of the page is some debugging information that may help understand the program, so try those out too!

# SVD & other setup

## Install libraries

In [None]:
!pip install -q mediapipe

In [None]:
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

### convert to matrix

in format [x_coords, y_coords, z_coords]

In [None]:
# return the x, y, z coordinate lists of the first detected hand

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10  # pixels

def coord_matrix(rgb_image, detection_result):
  """Return the landmark coordinates of the first detected hand.

  Args:
    rgb_image: Not used by this function.
    detection_result: Object whose ``hand_landmarks`` attribute is a sequence
      with one entry per detected hand; each entry is a sequence of landmarks
      exposing ``x``, ``y`` and ``z`` attributes.

  Returns:
    ``[x_coords, y_coords, z_coords]`` (three lists) for the first hand, or
    None when no hand was detected.
  """
  detected_hands = detection_result.hand_landmarks

  # Nothing detected: fall through to None, like a loop that never runs.
  if not detected_hands:
    return None

  # Only the first hand is ever reported, even if more were detected.
  xs, ys, zs = [], [], []
  for point in detected_hands[0]:
    xs.append(point.x)
    ys.append(point.y)
    zs.append(point.z)
  return [xs, ys, zs]

## Singular Value Decomposition

In [None]:
#applying above "convert to matrix" to a function

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

MARGIN = 10  # pixels; not referenced by coord_matrix below

def coord_matrix(photo_name):
  """Detect a hand in an image file and return its landmark coordinates.

  A hand landmarker is created from the local ``hand_landmarker.task`` model
  on every call and is closed again before the function returns.

  Args:
    photo_name: Path of the image file to analyse.

  Returns:
    A NumPy array with three rows -- the x, y and z values of every landmark
    of the first detected hand -- or None when no hand is detected.
  """
  base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
  options = vision.HandLandmarkerOptions(base_options=base_options,
                                         num_hands=2)
  image = mp.Image.create_from_file(photo_name)

  # The context manager releases the landmarker instead of leaving one open
  # per call.
  with vision.HandLandmarker.create_from_options(options) as detector:
    detection_result = detector.detect(image)

  hand_landmarks_list = detection_result.hand_landmarks
  if not hand_landmarks_list:
    return None

  # Up to two hands may be reported, but only the first one is used.
  hand_landmarks = hand_landmarks_list[0]
  return np.array([
      [landmark.x for landmark in hand_landmarks],
      [landmark.y for landmark in hand_landmarks],
      [landmark.z for landmark in hand_landmarks],
  ])

In [None]:
# make the set of photos to match to
import cv2
import zipfile
from google.colab.patches import cv2_imshow
import numpy as np

# Names of the reference photos; SignA.jpg always comes first.
reference_photos = ["SignA.jpg"]
with zipfile.ZipFile("match.zip") as match_zip:
    # The archive only supplies the list of file names; the photos themselves
    # are opened by coord_matrix using those names as paths.
    reference_photos += [
        name for name in match_zip.namelist()
        if name != "SignA.jpg" and not name.endswith("/")  # skip directories
    ]

# Collect one landmark matrix per photo and stack them once at the end
# (instead of re-allocating the whole array with np.append on every photo).
gesture_matrices = []
for photo in reference_photos:
    landmarks = coord_matrix(photo)
    if landmarks is None:
        # Fail with a clear message rather than an obscure shape error later.
        raise ValueError(f"No hand detected in reference photo {photo!r}")
    gesture_matrices.append(landmarks)

# gestures[i] is the landmark matrix of the i-th reference photo.
gestures = np.stack(gesture_matrices, axis=0)




In [None]:
# Build the data matrix A from the reference set `gestures`:
# column i of A is gestures[i] flattened row by row into one vector
# (all x values, then all y values, then all z values).
# gestures[0] is the SignA photo.
gesture_stack = np.asarray(gestures)

# Use every loaded gesture instead of a hard-coded count, so the cell keeps
# working when the number of reference signs changes.
num_gestures = gesture_stack.shape[0]
A = gesture_stack.reshape(num_gestures, -1).T.copy()

In [None]:

# Centre the data: subtract the mean column of A (the average gesture vector)
# from every column.
A_bar = A - A.mean(axis=1, keepdims=True)
U, sigma, V_T = np.linalg.svd(A_bar)

k = 63 # keep the first k columns of U ------------------------
# Never ask for more components than there are singular values; otherwise
# B_hat and W_hat end up with incompatible sizes for small reference sets.
k = min(k, sigma.shape[0])

B_hat = U[:, :k]              # basis: first k left singular vectors
sigmat = np.diag(sigma[:k])   # k x k diagonal matrix of singular values
Vk = V_T.T[:, :k]             # first k right singular vectors (as columns)
W_hat = sigmat @ Vk.T         # column i: coordinates of gesture i in basis B_hat
print(np.shape(W_hat))

(63, 72)


## photo match function

In [None]:
#photo match as a function

def nearest_neighbor(w_i):
  """Return the index of the column of W_hat that lies closest to w_i.

  Args:
    w_i: Column vector to compare against every column of the global W_hat.

  Returns:
    Index of the column with the smallest norm of (column - w_i); the first
    such index when several columns are equally close.
  """
  distances = [
      np.linalg.norm(column.reshape(-1, 1) - w_i)  # compare as a column vector
      for column in W_hat.T
  ]
  return np.argmin(distances)

def guess_image(jpg_name): #jpg_name is string
  """Print the letter whose reference gesture best matches the hand in an image.

  The landmarks found by coord_matrix are flattened, centred with the mean
  column of A, projected onto B_hat and compared with the columns of W_hat.

  Args:
    jpg_name: Path of the image file to classify.

  Returns:
    None. The guessed letter is printed; when coord_matrix finds no hand,
    "No hand detected" is printed instead.
  """
  # 24 labels: the alphabet without J and Z.
  abcArr = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y"]

  landmarks = coord_matrix(jpg_name)
  if landmarks is None:
    # No hand in the picture: report it instead of crashing in np.reshape.
    print("No hand detected")
    return

  z_1 = np.reshape(landmarks, (A.shape[0], 1))  # vectorize the landmark matrix
  z_1_bar = z_1 - A.mean(axis=1, keepdims=True)  # center with the mean column of A
  w_1 = B_hat.T @ z_1_bar  # coordinates of the image in the basis B_hat

  # The column index is folded onto the label list.
  # NOTE(review): this assumes the reference set repeats the labels in
  # abcArr order -- confirm against the matching set.
  ind_1 = nearest_neighbor(w_1) % len(abcArr)
  print(abcArr[ind_1]) # A = 0, B = 1, ... , G = 6

# Use of software

two options:
1. take photo, one capture at a time
2. live feed

## take photo

In [None]:
#part of take photo, below
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
  """Capture one webcam frame in the browser and save it as a JPEG file.

  The embedded JavaScript shows a "Capture" button above a live video
  element, waits for the button to be clicked, draws the current video frame
  onto a canvas, stops the first video track, removes the UI and returns the
  frame as a JPEG data URL.

  Args:
    filename: Path the decoded image bytes are written to.
    quality: Value passed to ``canvas.toDataURL('image/jpeg', quality)``.

  Returns:
    The ``filename`` that was written.
  """
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  # Inject the JavaScript into the cell output, then call it.
  display(js)
  # The result is treated as a data URL of the form "<header>,<base64 data>".
  data = eval_js('takePhoto({})'.format(quality))
  # Keep only the part after the comma and decode it to raw bytes.
  binary = b64decode(data.split(',')[1])
  with open(filename, 'wb') as f:
    f.write(binary)
  return filename

In [None]:
#takes photos
from IPython.display import Image
# Capture a single photo and report where it was stored.
filename = take_photo()
print(f'Saved to {filename}')

# Classify the file that was actually written (rather than a hard-coded
# name), then show the image which was just taken.
guess_image(filename)
display(Image(filename))

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

## live feed

Due to Google's attempts to prevent any live video capabilities, the live feed has some odd running behavior, but it still works!

Simply run the live feed, then *stop* the cell. This will start the program taking photos and interpreting the results. To actually stop the program, use the new option that appears below the run button (a little box with an arrow): click it and clear the output.

In [None]:
#live feed, with SVD processing
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode
from google.colab import output

def take_photo_continuous(interval=1, quality=0.8):
    """Start a browser-side loop that keeps sending webcam frames to Python.

    The embedded JavaScript opens the webcam, shows the live video and then,
    in a ``while (true)`` loop, draws the current frame onto a canvas, encodes
    it as a JPEG data URL, passes it to the callback registered as
    'notebook.take_photo', invokes 'notebook.print_message' with
    'Captured photo', and waits ``interval`` seconds before the next frame.

    The JavaScript loop has no exit condition and the video stream is never
    stopped by this code.

    Args:
        interval: Seconds to wait between two captures.
        quality: Value passed to ``canvas.toDataURL('image/jpeg', quality)``.
    """
    js = Javascript('''
      async function takePhotoContinuous(interval, quality) {
        const div = document.createElement('div');
        const video = document.createElement('video');
        video.style.display = 'block';
        const stream = await navigator.mediaDevices.getUserMedia({video: true});

        document.body.appendChild(div);
        div.appendChild(video);
        video.srcObject = stream;
        await video.play();

        // Resize the output to fit the video element.
        google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

        while (true) {
          const canvas = document.createElement('canvas');
          canvas.width = video.videoWidth;
          canvas.height = video.videoHeight;
          canvas.getContext('2d').drawImage(video, 0, 0);
          const data = canvas.toDataURL('image/jpeg', quality);
          // Return data to Python kernel
          google.colab.kernel.invokeFunction('notebook.take_photo', [data], {});
          // Print message
          google.colab.kernel.invokeFunction('notebook.print_message', ['Captured photo'], {});
          // Wait for interval
          await new Promise(resolve => setTimeout(resolve, interval * 1000));
        }
      }
    ''')
    # Inject the JavaScript into the cell output.
    display(js)
    # Call the JavaScript function to start capturing photos continuously
    # NOTE(review): the JS function never returns, so this call presumably
    # does not finish on its own -- confirm against the Colab eval_js docs.
    eval_js('takePhotoContinuous({}, {})'.format(interval, quality))

# Callback that receives each captured frame from the JavaScript side
def handle_photo(data):
    """Save a captured frame to photo.jpg and classify it.

    Args:
        data: String whose part after the first comma is the base64-encoded
            image (the JavaScript side sends a JPEG data URL).
    """
    target = 'photo.jpg'
    encoded_image = data.split(',')[1]  # drop the header before the comma
    image_bytes = b64decode(encoded_image)
    with open(target, 'wb') as photo_file:
        photo_file.write(image_bytes)
    print('Photo captured and saved as photo.jpg')
    # Run the sign recognition on the frame that was just written.
    guess_image(target)


# Define a function to print a message
def print_message(message):
    """Print ``message``; registered below as the 'notebook.print_message' callback."""
    print(message)

# Expose the Python handlers under the names the JavaScript side invokes.
for callback_name, handler in (
    ('notebook.take_photo', handle_photo),
    ('notebook.print_message', print_message),
):
    output.register_callback(callback_name, handler)

# Kick off the capture loop: one frame per second at JPEG quality 0.8.
take_photo_continuous(interval=1, quality=0.8)


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
L
Captured photo
Photo captured and saved as photo.jpg
L
Captured photo
Photo captured and saved as photo.jpg
L
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
A
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
Captured photo
Photo captured and saved as photo.jpg
L
Captured photo
Photo captured and saved as photo.jpg
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
N
Captured photo
Photo captured and saved as photo.jpg
P
Captured photo
Photo captured

# Testing


### image with landmark points

In [None]:
#function to print image with anchor points

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result):
  """Return a copy of the image annotated with every detected hand.

  For each hand the landmarks and their connections are drawn, the handedness
  category name is written MARGIN pixels above the top-left corner of the
  hand's bounding box, and some debugging output is printed (the still-empty
  landmark proto, then the lengths of the x, y and z coordinate lists).

  Args:
    rgb_image: Image array; it is copied, not modified.
    detection_result: Object exposing ``hand_landmarks`` and ``handedness``,
      indexed per detected hand.

  Returns:
    The annotated copy of ``rgb_image``.
  """
  annotated_image = np.copy(rgb_image)
  all_hands = detection_result.hand_landmarks
  all_handedness = detection_result.handedness

  for idx, hand_landmarks in enumerate(all_hands):
    handedness = all_handedness[idx]

    # Convert the landmarks into the proto type expected by drawing_utils.
    proto = landmark_pb2.NormalizedLandmarkList()
    print(proto)  # debug output: printed before any landmark is added
    normalized_points = [
      landmark_pb2.NormalizedLandmark(x=point.x, y=point.y, z=point.z)
      for point in hand_landmarks
    ]
    proto.landmark.extend(normalized_points)

    drawing = solutions.drawing_utils
    styles = solutions.drawing_styles
    drawing.draw_landmarks(
      annotated_image,
      proto,
      solutions.hands.HAND_CONNECTIONS,
      styles.get_default_hand_landmarks_style(),
      styles.get_default_hand_connections_style())

    # Landmark coordinates are scaled by the image size to get pixel positions.
    height, width, _ = annotated_image.shape
    xs = [point.x for point in hand_landmarks]
    ys = [point.y for point in hand_landmarks]
    zs = [point.z for point in hand_landmarks]

    # Top-left corner of the hand's bounding box, moved up by MARGIN.
    label_origin = (int(min(xs) * width), int(min(ys) * height) - MARGIN)

    # Debug output: number of x, y and z values.
    for coords in (xs, ys, zs):
      print(len(coords))

    # Write the handedness (left or right hand) next to the hand.
    label = f"{handedness[0].category_name}"
    cv2.putText(annotated_image, label, label_origin,
                cv2.FONT_HERSHEY_DUPLEX, FONT_SIZE, HANDEDNESS_TEXT_COLOR,
                FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

In [None]:
# STEP 1: Import the necessary modules.
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# STEP 2: Configure the HandLandmarker.
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)

# STEP 3: Load the input image.
photo_path = "photo.jpg"  # single place to change the file being inspected
image = mp.Image.create_from_file(photo_path)

# STEP 4: Detect hand landmarks from the input image.
# The context manager closes the landmarker once detection is done instead of
# leaving it open.
with vision.HandLandmarker.create_from_options(options) as detector:
  detection_result = detector.detect(image)

# STEP 5: Process the classification result. In this case, visualize it.
annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)
cv2_imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))
guess_image(photo_path)
