In [1]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os
import time

In [2]:
import torch

In [3]:
import torch.utils as utils

In [4]:
utils.data.Dataset()

<torch.utils.data.dataset.Dataset at 0x17d6fb0d0>

In [5]:
# Short aliases for the MediaPipe solution modules.
mp_drawing = mp.solutions.drawing_utils  # NOTE(review): not referenced by any later cell visible here
mp_hands = mp.solutions.hands  # provides the Hands detector used throughout the notebook

In [6]:
# Load one sample image to prototype the landmark-extraction steps.
sample_path = "../training/down.0.jpg"
image = cv2.imread(sample_path)
if image is None:
    # cv2.imread reports failure by returning None instead of raising; without
    # this check the problem only shows up later as an opaque cv2.cvtColor error.
    raise FileNotFoundError(f"could not read image: {sample_path}")

In [7]:
# This detector is used on independent still images (the sample image and the
# training/validation files), not on consecutive video frames. With
# static_image_mode=True MediaPipe runs detection on every image instead of
# trying to track a hand from one unrelated image to the next.
hands = mp_hands.Hands(
    min_detection_confidence=0.8,
    min_tracking_confidence=0.5,
    static_image_mode=True,
    max_num_hands=1,
)

In [8]:
# Run hand-landmark detection on the sample image and leave `image` in BGR order again.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # OpenCV loads images as BGR, but MediaPipe expects RGB
image.flags.writeable = False # array is read-only while MediaPipe processes it
results = hands.process(image) # runs the actual hand detection
image.flags.writeable = True # make the array writable again
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # back to OpenCV's BGR channel order

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [13]:
# Collect the [x, y] coordinates of every landmark of the first detected hand.
# The list stays empty when MediaPipe reported no hand.
detected_hands = results.multi_hand_landmarks
landmarks = (
    [[point.x, point.y] for point in detected_hands[0].landmark]
    if detected_hands
    else []
)

In [642]:
# Rebinds `landmarks` from the list of [x, y] pairs to a NumPy array.
landmarks = np.array(landmarks)

In [643]:
# float32 tensor copy of the landmark array; for the sample image its shape is (21, 2), as the next cell shows.
landmarks_tensor = torch.tensor(landmarks, dtype=torch.float32)

In [644]:
landmarks_tensor.shape

torch.Size([21, 2])

In [645]:
landmarks_tensor

tensor([[0.2628, 0.3843],
        [0.3132, 0.4501],
        [0.3475, 0.5395],
        [0.3496, 0.6470],
        [0.3377, 0.7126],
        [0.3652, 0.4491],
        [0.3603, 0.5908],
        [0.3541, 0.6657],
        [0.3461, 0.7193],
        [0.3246, 0.4260],
        [0.3129, 0.5904],
        [0.3077, 0.5611],
        [0.3129, 0.5146],
        [0.2826, 0.4209],
        [0.2775, 0.5821],
        [0.2794, 0.5434],
        [0.2843, 0.4964],
        [0.2459, 0.4240],
        [0.2426, 0.5528],
        [0.2482, 0.5272],
        [0.2535, 0.4854]])

In [646]:
# Build the training set: one list of [x, y] landmark pairs per image in which
# MediaPipe finds a hand. Images without a detection are skipped, so the final
# sample count can be smaller than the number of files read.
TRAIN_DIR = "../training"
TRAIN_IMAGES_PER_GESTURE = 50
# Position in this tuple is the integer class label: down=0, up=1, thumb=2.
TRAIN_GESTURES = ("down", "up", "thumb")

data = []
y_vals = []
for label, gesture in enumerate(TRAIN_GESTURES):
    for i in range(TRAIN_IMAGES_PER_GESTURE):
        image_path = f"{TRAIN_DIR}/{gesture}.{i}.jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for a missing/unreadable file; fail with
            # the offending path instead of an opaque cv2.cvtColor error.
            raise FileNotFoundError(f"could not read training image: {image_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, MediaPipe expects RGB
        image.flags.writeable = False
        results = hands.process(image)  # runs the actual detection

        if not results.multi_hand_landmarks:
            continue  # no hand found in this image: leave it out of the dataset
        landmarks = [
            [landmark.x, landmark.y]
            for landmark in results.multi_hand_landmarks[0].landmark
        ]
        data.append(landmarks)
        y_vals.append(label)


In [24]:
# Display names indexed by class label. The order must match the labels assigned
# when the datasets are built: 0 for down.* images, 1 for up.*, 2 for thumb.*.
classes = ("down", "up", "thumbs up")

In [648]:
# Convert the collected training lists to tensors. torch.as_tensor returns an
# existing tensor of the right dtype unchanged, so re-running this cell neither
# re-copies the data nor triggers torch.tensor's copy-construct warning.
data = torch.as_tensor(data, dtype=torch.float32)   # landmark coordinates
y_vals = torch.as_tensor(y_vals, dtype=torch.long)  # integer class labels

In [649]:
len(y_vals)

125

In [650]:
y_vals

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2])

In [651]:
class LandmarksDataset(utils.data.Dataset):
    """Map-style dataset pairing landmark samples with their class labels.

    Args:
        X: Indexable collection of samples (for example a tensor whose first
            dimension is the sample index).
        y: Indexable collection of labels, one per sample in ``X``.
        transform: Optional callable applied to each sample (not to the label)
            when it is fetched. ``None`` returns samples unchanged.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y, transform=None):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.len = len(y)
        self.transform = transform

    def __len__(self):
        """Return the number of (sample, label) pairs."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair at ``idx``, transformed if requested."""
        sample = self.X[idx]
        if self.transform is not None:
            # Previously the transform was stored but silently ignored.
            sample = self.transform(sample)
        return sample, self.y[idx]

In [652]:
# Wrap the training tensors built above so a DataLoader can index them.
training_set = LandmarksDataset(data, y_vals)

In [653]:
# Mini-batches of 4 training samples, with shuffling enabled.
training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True)

In [654]:
# Build the validation set the same way as the training set. This rebinds
# `data` and `y_vals`; the training tensors stay alive through `training_set`.
# Images in which MediaPipe finds no hand are skipped.
VAL_DIR = "../validation"
VAL_IMAGES_PER_GESTURE = 10
# Position in this tuple is the integer class label: down=0, up=1, thumb=2.
VAL_GESTURES = ("down", "up", "thumb")

data = []
y_vals = []
for label, gesture in enumerate(VAL_GESTURES):
    for i in range(VAL_IMAGES_PER_GESTURE):
        image_path = f"{VAL_DIR}/{gesture}.{i}.jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for a missing/unreadable file; fail with
            # the offending path instead of an opaque cv2.cvtColor error.
            raise FileNotFoundError(f"could not read validation image: {image_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, MediaPipe expects RGB
        image.flags.writeable = False
        results = hands.process(image)  # runs the actual detection

        if not results.multi_hand_landmarks:
            continue  # no hand found in this image: leave it out of the dataset
        landmarks = [
            [landmark.x, landmark.y]
            for landmark in results.multi_hand_landmarks[0].landmark
        ]
        data.append(landmarks)
        y_vals.append(label)


In [655]:
# Convert the collected validation lists to tensors. torch.as_tensor returns an
# existing tensor of the right dtype unchanged, so re-running this cell neither
# re-copies the data nor triggers torch.tensor's copy-construct warning.
data = torch.as_tensor(data, dtype=torch.float32)   # landmark coordinates
y_vals = torch.as_tensor(y_vals, dtype=torch.long)  # integer class labels

In [656]:
y_vals

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2])

In [657]:
# Validation counterpart of the training set/loader, built from the tensors of
# the preceding cell; shuffle=False, so batches are not shuffled.
validation_set = LandmarksDataset(data, y_vals)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False)

In [14]:
import torch.nn as nn
import torch.nn.functional as F

In [19]:
class HandNetwork(nn.Module):
    """Fully connected classifier over flattened hand landmarks.

    Layer sizes are 42 -> 120 -> 100 -> 3, with a ReLU after each of the two
    hidden layers. The three output scores are returned without a final
    activation.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=42, out_features=120)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=120, out_features=100)
        self.fc3 = nn.Linear(in_features=100, out_features=3)

    def forward(self, x):
        """Flatten each sample in the batch and return its three class scores."""
        hidden = self.relu(self.fc1(self.flatten(x)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [660]:
# Fresh network plus the loss and optimizer that train_one_epoch() reads as globals.
model = HandNetwork()
loss_fn = torch.nn.CrossEntropyLoss()
# NOTE(review): the optimizer is built from the parameters of *this* model
# instance. If `model` is rebound later (for example by loading a pickle),
# the optimizer must be recreated as well or training will not update it.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.1)

In [661]:
landmarks_tensor.shape

torch.Size([21, 2])

In [662]:
model(landmarks_tensor.view(-1, 21, 2))

tensor([[ 0.0393,  0.0584, -0.0740]], grad_fn=<AddmmBackward0>)

In [663]:
def train_one_epoch(log_every=10):
    """Run one optimisation pass over the module-level ``training_loader``.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        log_every: Number of batches per reporting window. The mean loss of
            every completed window is printed.

    Returns:
        The mean per-batch loss of the last completed window. If the loader
        yields fewer than ``log_every`` batches, so that no window completes,
        the mean over the batches that were processed is returned instead of
        a misleading 0. An empty loader returns 0.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError(f"log_every must be >= 1, got {log_every}")

    running_loss = 0
    last_loss = 0
    batches_in_window = 0
    windows_reported = 0

    for i, (inputs, labels) in enumerate(training_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()  # calculate the gradients
        optimizer.step()  # update the params

        running_loss += loss.item()
        batches_in_window += 1
        if batches_in_window == log_every:
            last_loss = running_loss / log_every  # loss per batch
            print(f'  batch {i+1} loss: {last_loss}')
            running_loss = 0
            batches_in_window = 0
            windows_reported += 1

    if windows_reported == 0 and batches_in_window > 0:
        # Small datasets may never fill a window; report what was seen.
        last_loss = running_loss / batches_in_window

    return last_loss

In [15]:
from datetime import datetime

In [682]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Guard against hidden notebook state: if `model` was rebound after the
# optimizer was created, optimizer.step() updates a stale set of parameters and
# the loop below runs without ever changing `model`.
optimized_param_ids = {
    id(param) for group in optimizer.param_groups for param in group['params']
}
if any(id(param) not in optimized_param_ids for param in model.parameters()):
    raise RuntimeError(
        "optimizer does not hold the parameters of the current `model`; "
        "re-run the cell that creates model/loss_fn/optimizer before training"
    )

epoch_number = 0

EPOCHS = 10

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    model.train(True)

    avg_loss = train_one_epoch()

    # Evaluation mode only switches layer behaviour; gradients are disabled
    # separately with torch.no_grad() below.
    model.train(False)

    running_vloss = 0.0
    num_vbatches = 0
    with torch.no_grad():
        for vinputs, vlabels in validation_loader:
            voutputs = model(vinputs)
            # .item() keeps a plain float rather than accumulating tensors.
            running_vloss += loss_fn(voutputs, vlabels).item()
            num_vbatches += 1

    # An empty validation loader yields NaN instead of a NameError/ZeroDivisionError.
    avg_vloss = running_vloss / num_vbatches if num_vbatches else float('nan')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Track best performance. Saving is currently disabled: model_path is
    # computed but nothing is written to disk.
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        #torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 10 loss: 0.03257557181641459
  batch 20 loss: 0.05546635733917356
  batch 30 loss: 0.019798521883785725
LOSS train 0.019798521883785725 valid 0.02325417287647724
EPOCH 2:
  batch 10 loss: 0.014236991945654153
  batch 20 loss: 0.041099155927076934
  batch 30 loss: 0.052653033100068566
LOSS train 0.052653033100068566 valid 0.02325417287647724
EPOCH 3:
  batch 10 loss: 0.022774085216224194
  batch 20 loss: 0.036964822374284265
  batch 30 loss: 0.04879928370937705
LOSS train 0.04879928370937705 valid 0.02325417287647724
EPOCH 4:
  batch 10 loss: 0.019898624811321496
  batch 20 loss: 0.06049192948266864
  batch 30 loss: 0.029163545928895473
LOSS train 0.029163545928895473 valid 0.02325417287647724
EPOCH 5:
  batch 10 loss: 0.05918706860393286
  batch 20 loss: 0.03175804233178496
  batch 30 loss: 0.017756244633346796
LOSS train 0.017756244633346796 valid 0.02325417287647724
EPOCH 6:
  batch 10 loss: 0.06509706024080515
  batch 20 loss: 0.02771271257661283
  batch 30 loss: 0.

In [20]:
import pickle

In [21]:
# Rebinds `model` to the object stored in ../models/model.pkl.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load model files you created yourself.
# NOTE(review): unpickling presumably requires the HandNetwork class to be
# defined in this session first, and any optimizer created for a previous
# `model` will not update this one. Confirm on a fresh kernel.
with open("../models/model.pkl", "rb") as file:
    model = pickle.load(file)

In [25]:
# Live demo: show the mirrored webcam feed with an FPS overlay.
# Press 'p' to classify the hand in the current frame, 'q' to quit.
model.eval()

cap = cv2.VideoCapture(0)
frameCounter = 0
prevTime = time.time()
try:
    # A separate name keeps the module-level `hands` detector (used by the
    # dataset cells) from being replaced by one that is closed when this
    # `with` block exits.
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, static_image_mode=False, max_num_hands=1) as live_hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame delivered (camera unplugged, busy or stream ended).
                break

            # Detections
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, MediaPipe expects RGB
            image.flags.writeable = False
            results = live_hands.process(image)  # runs the actual detection
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            # FPS overlay
            currTime = time.time()
            elapsed = currTime - prevTime
            fps = 1 / elapsed if elapsed > 0 else 0.0
            prevTime = currTime
            image = cv2.flip(image, 1)
            cv2.putText(image, f"FPS: {fps}", (20,70), cv2.FONT_HERSHEY_PLAIN, 3, (0, 196, 255), 2)

            cv2.imshow("Hand Tracking", image)

            # Poll the keyboard once per frame. Two separate waitKey calls would
            # compete for the same key press, so 'p'/'q' could be swallowed by
            # the call that was not looking for them.
            key = cv2.waitKey(10) & 0xFF
            if key == ord('q'):
                break
            if key == ord('p') and results.multi_hand_landmarks:
                landmarks = torch.tensor(
                    [[landmark.x, landmark.y] for landmark in results.multi_hand_landmarks[0].landmark]
                )
                with torch.no_grad():  # inference only, no autograd graph needed
                    out = torch.argmax(model(landmarks.view(-1,21,2))).item()
                print(f"Prediction: {classes[out]}")
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: down
Prediction: down
Prediction: down
Prediction: down
Prediction: down
Prediction: down
Prediction: down
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Prediction: thumbs up
Predictio

In [670]:
model

HandNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=42, out_features=120, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=120, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)

In [671]:
import pickle

In [674]:
# Persist the whole model object with pickle.
# NOTE(review): this ties the file to the HandNetwork class definition and to
# pickle's security caveats. Saving model.state_dict() with torch.save is the
# approach PyTorch documents; switching would require changing the cell that
# loads ../models/model.pkl as well.
with open("../models/model.pkl", "wb") as file:
    pickle.dump(model, file)

In [678]:
# Round-trip check: reload the model file written by the previous cell and
# display its architecture, which should match `model`. The original cell was
# a dangling attribute access on a name that no cell defines.
with open("../models/model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
loaded_model

HandNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=42, out_features=120, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=120, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)

In [679]:
model

HandNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=42, out_features=120, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=120, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)