In [None]:
import numpy as np
from tensorflow import keras
from keras.applications import ResNet50
import cv2
import numpy as np
import mediapipe as mp
from ultralytics import YOLO

In [None]:
# Weights of the YOLO classification model trained by the notebook authors.
MODEL_WEIGHTS_PATH = '/Users/mahikanair/.pyenv/runs/classify/train3/weights/best.pt'
model = YOLO(MODEL_WEIGHTS_PATH)

In [None]:
# MediaPipe's hands solution: locates hands in an image through their landmarks.
mp_hands = mp.solutions.hands

# static_image_mode=True  -> every input is handled as a stand-alone still image.
# min_detection_confidence -> minimum confidence that a detected object is a hand.
# NOTE(review): the frames fed in below come from a live video stream; check
# whether static_image_mode=False would be a better fit - confirm before changing.
hands = mp_hands.Hands(
    static_image_mode=True,
    min_detection_confidence=0.3,
)

In [None]:
def predict_from_video(img):
    """Classify one cropped hand image and return the winning class index.

    Parameters
    ----------
    img : numpy.ndarray
        Hand crop in RGB channel order (the capture loop in this notebook
        converts every frame to RGB before cropping).

    Returns
    -------
    int
        Index of the class with the highest probability (``probs.top1`` of
        the first result returned by the model).

    Raises
    ------
    ValueError
        If ``img`` is ``None`` or contains no pixels (e.g. an empty crop).
    """
    if img is None or img.size == 0:
        raise ValueError('predict_from_video received an empty image')

    # Ultralytics reads numpy inputs as BGR (the OpenCV convention), so the RGB
    # crop is swapped back to BGR and handed over in memory. This avoids
    # writing and re-reading a JPEG file on disk for every single frame.
    bgr_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    results = model(bgr_img)  # list with one result per input image
    return results[0].probs.top1


def box_maker(mhl, h, w, padding=20):
    """Compute a tight box and a padded square crop box around hand landmarks.

    Parameters
    ----------
    mhl : iterable
        Hand objects, each exposing ``.landmark`` - a sequence of points with
        normalised ``.x`` / ``.y`` coordinates. All landmarks of all hands in
        ``mhl`` are enclosed by the returned boxes; callers normally pass a
        single-hand list.
    h, w : int
        Height and width of the image in pixels, used to turn the normalised
        coordinates into pixel coordinates.
    padding : int, optional
        Margin in pixels added on every side of the square crop box so the
        fingers are not cut off. Defaults to 20.

    Returns
    -------
    tuple of int
        ``(x1, y1, x2, y2, new_x1, new_y1, new_x2, new_y2)`` where the first
        four values are the tight bounding box (for display) and the last four
        are the padded square box (for cropping), clamped to the image.

    Raises
    ------
    ValueError
        If ``mhl`` contains no landmarks.
    """
    x_land = [point.x for hand in mhl for point in hand.landmark]
    y_land = [point.y for hand in mhl for point in hand.landmark]
    if not x_land:
        raise ValueError('box_maker needs at least one hand landmark')

    # Landmarks are normalised, so scale by the image size to get pixels.
    x1 = int(min(x_land) * w)
    y1 = int(min(y_land) * h)
    x2 = int(max(x_land) * w)
    y2 = int(max(y_land) * h)

    # A rectangle is fine for display, but the classifier crop is made square
    # (side = longer edge, centred on the hand) to avoid distortion.
    width = x2 - x1
    height = y2 - y1
    longer_side = max(width, height)

    # Pad the square by `padding` on every side: the start moves back by one
    # padding, so the end must be start + side + 2 * padding.
    new_x1 = x1 + (width - longer_side) // 2 - padding
    new_y1 = y1 + (height - longer_side) // 2 - padding
    new_x2 = new_x1 + longer_side + 2 * padding
    new_y2 = new_y1 + longer_side + 2 * padding

    # Clamp to the image. Taking abs() of a negative coordinate would mirror
    # it back into the picture and shift (or even invert) the crop window.
    new_x1 = min(max(new_x1, 0), w)
    new_x2 = min(max(new_x2, 0), w)
    new_y1 = min(max(new_y1, 0), h)
    new_y2 = min(max(new_y2, 0), h)

    return x1, y1, x2, y2, new_x1, new_y1, new_x2, new_y2


# Video capture device at index 1 is the live video source.
CAMERA_INDEX = 1
capture = cv2.VideoCapture(CAMERA_INDEX)

# Label mappings: predicted class index -> mudra name.
# NOTE(review): the names are listed alphabetically but the indices are not
# consecutive in that order - confirm this mapping against the class names
# stored in the loaded model before relying on it.
labels_dict = {
    0: 'alapadma',
    1: 'arala',
    12: 'ardhachandra',
    23: 'ardhapataka',
    24: 'bhramhara',
    25: 'chandrakala',
    26: 'chatura',
    27: 'hamsapaksha',
    28: 'hamsasya',
    29: 'kangula',
    2: 'kapitha',
    3: 'kartarimukha',
    4: 'katakamukha-1',
    5: 'katakamukha-2',
    6: 'katakamukha-3',
    7: 'mayura',
    8: 'mrighashisha',
    9: 'mukula',
    10: 'mushthi',
    11: 'padmakosha',
    13: 'pataka',
    14: 'santamsha',
    15: 'sarpashisha',
    16: 'shikhara',
    17: 'shukatunda',
    18: 'singhamukha',
    19: 'suchi',
    20: 'tamrachuda',
    21: 'tripataka',
    22: 'trishula',
}

# Main capture loop: read a frame, find the hands, classify each hand crop,
# draw the result and show the frame until Esc is pressed.
try:
    if not capture.isOpened():
        raise RuntimeError('could not open the video capture device')

    while True:
        ret, frame = capture.read()
        if not ret:
            # No frame delivered (camera unplugged / stream ended): stop cleanly
            # instead of crashing on a None frame.
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # MediaPipe is fed RGB
        h, w, _ = frame.shape
        results = hands.process(frame)  # calculates the hand landmarks

        # Pass 1: crop and classify every detected hand while the frame is
        # still clean, so drawings for one hand never end up in another crop.
        detections = []
        for hand_landmarks in results.multi_hand_landmarks or []:
            x1, y1, x2, y2, new_x1, new_y1, new_x2, new_y2 = box_maker([hand_landmarks], h, w)
            # cropping frame with square coordinates
            cropped_frame = frame[new_y1:new_y2, new_x1:new_x2]
            if cropped_frame.size == 0:
                # Degenerate box (hand mostly outside the frame): nothing to classify.
                continue
            prediction = predict_from_video(cropped_frame)
            # finding the name of the predicted mudra
            detections.append((x1, y1, x2, y2, labels_dict[prediction]))

        # Pass 2: draw the box and the mudra name for every classified hand.
        for x1, y1, x2, y2, predicted_character in detections:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
            cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_DUPLEX, 2, (0, 0, 0), 3)

        # Show every frame, with or without hands, so the preview never freezes.
        cv2.imshow('frame', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        # Poll the keyboard on every iteration (every 25 ms) so Esc always works,
        # even while no hand is visible.
        key = cv2.waitKey(25) & 0xFF
        if key == 27:  # Esc
            break
finally:
    # Always free the camera and close the window, even after an error.
    capture.release()
    cv2.destroyAllWindows()