In [None]:
import cv2
import mediapipe as mp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from utils.db_utils import *
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from utils.landmarks_utils import *
from utils.training_utils import *
import numpy as np

In [2]:
class MLP(nn.Module):
    """Fully connected classifier: a stack of ReLU-activated linear layers
    followed by a linear output layer that produces raw class logits.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of every hidden layer.
        output_size: Number of output classes (logits).
        hiddenLayerCount: Number of hidden layers. The first hidden layer is
            always created, so values below 1 still yield one hidden layer.
        temperature: Divisor applied to the logits in
            ``forward_with_temperature`` before the softmax.
    """

    def __init__(self, input_size, hidden_size=4,output_size=3, hiddenLayerCount=1, temperature=1.0):
        super().__init__()
        # Layers are created in the order input -> hidden... -> output so the
        # parameter initialisation order is stable for a given RNG seed.
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(input_size, hidden_size))
        for _ in range(hiddenLayerCount - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.temperature = temperature
        self.hiddenLayerCount = hiddenLayerCount
        # Normalise over the class axis (the last one). For a single 1-D
        # sample this equals dim=0, and for a (batch, classes) input it keeps
        # every row a proper distribution instead of mixing samples.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the raw (un-normalised) logits for ``x``."""
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
        return self.output_layer(x)

    def forward_with_temperature(self, x):
        """Return class probabilities: softmax of ``logits / temperature``
        taken over the last (class) dimension."""
        scaled_logits = self.forward(x) / self.temperature
        return self.softmax(scaled_logits)


In [3]:
class UnitVectorDataset(Dataset):
    """Map-style dataset pairing each unit-vector entry with its class label.

    Indexing yields a dict with two tensors: ``'unitvectors'`` (float32) and
    ``'label'`` (long). When a truthy ``transform`` is supplied, its return
    value for that dict is what the dataset yields instead.
    """

    def __init__(self, unitvectors, labels, transform=None):
        self.unitvectors, self.labels = unitvectors, labels
        self.transform = transform

    def __len__(self):
        # Dataset size follows the feature container, not the label container.
        return len(self.unitvectors)

    def __getitem__(self, idx):
        features = torch.tensor(self.unitvectors[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        item = {'unitvectors': features, 'label': target}
        return self.transform(item) if self.transform else item

In [None]:

# Gesture name -> class index used as the training target.
labels_map = {'middlefinger': 0, 'devilhorns': 1, 'fingerscrossed':2, 'nothing': 3}

conn = lanternConnect(DATABASE_URL)
vectors = getData(conn,'handunitvectors')
# Each row is indexed as row[0] = gesture name, row[1] = unit vectors.
handunitvectors = [vector[1] for vector in vectors]
labels = [labels_map[vector[0]] for vector in vectors]
hand_gesture_dataset = UnitVectorDataset(handunitvectors, labels)
gesture_dataloader = DataLoader(hand_gesture_dataset, batch_size=32, shuffle=True)
model = MLP(input_size=108, hidden_size=8, output_size=len(labels_map), hiddenLayerCount=3,temperature=0.7)
# model.apply(custom_weight_init)
criterion = nn.CrossEntropyLoss()  # For classification tasks
optimizer = optim.Adam(model.parameters(), lr=0.00007)
# One entry per epoch; each value is forwarded to applyRandomRotations.
epoch_randomizations = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
# Number of synthetic samples trained as the 'nothing' class in every epoch.
syntheticVectorsCount = 300
# Target for synthetic samples, looked up by name instead of a bare index.
nothing_target = torch.tensor(labels_map['nothing'])


def _gesture_train_step(features, target):
    """Run one single-sample optimisation step and return the loss as a float."""
    outputs = model(torch.tensor(features, dtype=torch.float32))
    loss = criterion(outputs, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


# Make the mode explicit so re-running this cell after evaluation still trains
# in training mode.
model.train()
for epoch_index, randomizations in enumerate(epoch_randomizations):
    epoch_loss_sum = 0.0
    epoch_steps = 0
    for batch in gesture_dataloader:
        gestureUnitVectors = batch['unitvectors']
        gesture = batch['label']
        # Samples are augmented and stepped one at a time because
        # applyRandomRotations is applied per sample.
        for j in range(len(gestureUnitVectors)):
            unitVectors = applyRandomRotations(gestureUnitVectors[j],randomizations)
            epoch_loss_sum += _gesture_train_step(unitVectors.flatten(), gesture[j])
            epoch_steps += 1
    for _ in range(syntheticVectorsCount):
        syntheticUnitVectors = generateSyntheticUnitVectors(conn)
        epoch_loss_sum += _gesture_train_step(syntheticUnitVectors.flatten(), nothing_target)
        epoch_steps += 1
    # Report the mean loss over every step of the epoch; the loss of the final
    # synthetic sample alone says nothing about the real gestures.
    mean_loss = epoch_loss_sum / epoch_steps if epoch_steps else float('nan')
    print(f'Epoch [{epoch_index+1}/{len(epoch_randomizations)}], Loss: {mean_loss:.4f}')

In [None]:
def evaluate(model, vectors, synthetic_count=1000):
    """Print and return the model's accuracy on stored and synthetic samples.

    Every row in ``vectors`` is scored against the label looked up from
    ``labels_map`` using ``row[0]``; ``row[1].flatten()`` is the model input.
    Then ``synthetic_count`` vectors from ``generateSyntheticUnitVectors`` are
    scored against the 'nothing' class. Both groups share one accuracy figure.

    Relies on the notebook-level names ``labels_map`` and ``conn``.

    Args:
        model: Module mapping a flat float32 feature tensor to class logits.
        vectors: Iterable of rows as described above.
        synthetic_count: Number of synthetic 'nothing' samples to score.

    Returns:
        Accuracy as a percentage (float), or None when nothing was evaluated.
    """
    nothing_label = labels_map['nothing']
    correct = 0
    total = 0
    # Remember the current mode so evaluation does not silently leave the
    # caller's model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for vector in vectors:
                gesture = labels_map[vector[0]]
                hand_vector = vector[1].flatten()
                output = model(torch.tensor(hand_vector, dtype=torch.float32))
                correct += int(torch.argmax(output).item() == gesture)
                total += 1
            for _ in range(synthetic_count):
                syntheticUnitVectors = generateSyntheticUnitVectors(conn).flatten()
                output = model(torch.tensor(syntheticUnitVectors, dtype=torch.float32))
                correct += int(torch.argmax(output).item() == nothing_label)
                total += 1
    finally:
        model.train(was_training)
    if total == 0:
        print('Accuracy: n/a (no samples evaluated)')
        return None
    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy


# Assigned so the notebook does not echo the returned value as cell output.
evaluation_accuracy = evaluate(model, vectors)


In [None]:
# Index -> gesture name, ordered by the class index stored in labels_map
# rather than by dict insertion order.
gesture_classes = [name for name, _ in sorted(labels_map.items(), key=lambda item: item[1])]
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
cap = cv2.VideoCapture(0)  # 0 is the default camera
try:
    # The context manager closes the MediaPipe graph even if the loop raises.
    with mp_hands.Hands(static_image_mode=False,
                        max_num_hands=2,
                        min_detection_confidence=0.5,
                        min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue
            # MediaPipe expects RGB, but OpenCV captures and displays BGR.
            # Convert only the copy handed to the detector so the frame shown
            # by imshow keeps its correct colours.
            results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

            if results.multi_hand_landmarks:
                for hand_index, hand_landmarks in enumerate(results.multi_hand_landmarks):
                    # Draw the hand annotations on the image.
                    mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    # Flatten the per-landmark arrays into one feature list.
                    features = []
                    for np_array in unitVectorNormalization(hand_landmarks.landmark):
                        features.extend(np_array.tolist())
                    with torch.no_grad():
                        output = model(torch.tensor(features, dtype=torch.float32))
                    predicted = torch.argmax(output).item()

                    cv2.putText(image,
                                gesture_classes[predicted],     # Text
                                (50, 50 + 40 * hand_index),     # One line per detected hand
                                cv2.FONT_HERSHEY_SIMPLEX,       # Font type
                                1,                              # Font scale (size)
                                (0, 255, 0),                    # Font color (BGR): Green
                                2,                              # Thickness of the text
                                cv2.LINE_AA)                    # Line type for better anti-aliasing

            # Display the resulting frame
            cv2.imshow('MediaPipe Hands', image)
            if cv2.pollKey() == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()
