In [156]:
import mediapipe as mp
import cv2
import time
import numpy as np
import torch
import torch.nn as nn
import torch.utils as utils
from datetime import datetime
import pickle

In [6]:
# Short aliases for the MediaPipe hands solution and its drawing utilities.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

In [16]:
# Gesture names. The position of a name in this tuple is used as its integer
# class label, so the order must not change once a model has been trained.
classes = (
    "down",
    "up",
    "thumb",
)

In [18]:
# Hand detector configured for static images with at most one hand per image.
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.8,
    min_tracking_confidence=0.5,
)

In [91]:
# Build the training set: one list of 21 [x, y] landmark pairs per image in
# which a hand was detected, plus the matching integer class label.
train_data = []
train_labels = []

# The position of a gesture in `classes` doubles as its integer label.
for class_index, gesture_class in enumerate(classes):
    for i in range(50):
        image_path = f"../training/{gesture_class}.{i}.jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail here with the offending path instead of letting cvtColor
            # raise an opaque assertion error.
            raise FileNotFoundError(f"Could not read training image: {image_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 loads BGR, MediaPipe requires RGB
        image.flags.writeable = False
        results = hands.process(image)  # this makes the actual detections

        # Images without a detected hand are skipped, so a class can end up
        # with fewer than 50 samples.
        if results.multi_hand_landmarks:
            # Keep only the (x, y) coordinates of the first detected hand.
            landmarks = [
                [landmark.x, landmark.y]
                for landmark in results.multi_hand_landmarks[0].landmark
            ]
            train_data.append(landmarks)
            train_labels.append(class_index)

In [92]:
# Convert the collected Python lists to tensors.
# as_tensor (instead of torch.tensor) keeps this cell re-runnable: when the
# names already hold tensors of the requested dtype they are reused rather than
# copy-constructed, which would emit a UserWarning.
# The dtypes are pinned explicitly so they do not depend on inference.
train_data = torch.as_tensor(train_data, dtype=torch.float32)  # landmark coordinates
train_labels = torch.as_tensor(train_labels, dtype=torch.long)  # integer class indices

In [93]:
class LandmarksDataset(utils.data.Dataset):
    """Map-style dataset pairing landmark samples with their class labels.

    Args:
        X: Indexable, sized collection of samples (e.g. a tensor whose first
            dimension is the sample index).
        y: Indexable, sized collection of labels aligned with ``X``.
        transform: Optional callable applied to a sample every time it is
            fetched. ``None`` (the default) returns samples unchanged.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y, transform=None):
        # A length mismatch would silently pair samples with wrong labels
        # (or fail much later with an IndexError), so reject it up front.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.len = len(y)
        self.transform = transform

    def __len__(self):
        """Return the number of (sample, label) pairs."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(sample, label)`` at ``idx``, applying ``transform`` if set."""
        sample = self.X[idx]
        if self.transform is not None:
            sample = self.transform(sample)
        return sample, self.y[idx]

In [94]:
# Wrap the training tensors in a dataset and serve them in shuffled mini-batches of 4.
training_set = LandmarksDataset(X=train_data, y=train_labels)
training_loader = utils.data.DataLoader(
    dataset=training_set,
    batch_size=4,
    shuffle=True,
)

In [95]:
# Build the validation set the same way as the training set: one list of
# 21 [x, y] landmark pairs per image in which a hand was detected.
val_data = []
val_labels = []

# The position of a gesture in `classes` doubles as its integer label.
for class_index, gesture_class in enumerate(classes):
    for i in range(10):
        image_path = f"../validation/{gesture_class}.{i}.jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail here with the offending path instead of letting cvtColor
            # raise an opaque assertion error.
            raise FileNotFoundError(f"Could not read validation image: {image_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 loads BGR, MediaPipe requires RGB
        image.flags.writeable = False
        results = hands.process(image)  # this makes the actual detections

        # Images without a detected hand are skipped, so a class can end up
        # with fewer than 10 samples.
        if results.multi_hand_landmarks:
            # Keep only the (x, y) coordinates of the first detected hand.
            landmarks = [
                [landmark.x, landmark.y]
                for landmark in results.multi_hand_landmarks[0].landmark
            ]
            val_data.append(landmarks)
            val_labels.append(class_index)

In [96]:
# Convert the collected Python lists to tensors.
# as_tensor (instead of torch.tensor) keeps this cell re-runnable: when the
# names already hold tensors of the requested dtype they are reused rather than
# copy-constructed, which would emit a UserWarning.
# The dtypes are pinned explicitly so they do not depend on inference.
val_data = torch.as_tensor(val_data, dtype=torch.float32)  # landmark coordinates
val_labels = torch.as_tensor(val_labels, dtype=torch.long)  # integer class indices

In [97]:
# Validation batches keep their original order (no shuffling), 4 samples each.
validation_set = LandmarksDataset(X=val_data, y=val_labels)
validation_loader = utils.data.DataLoader(
    dataset=validation_set,
    batch_size=4,
    shuffle=False,
)

In [98]:
class HandNetwork(nn.Module):
    """Fully connected classifier over flattened hand-landmark coordinates.

    Architecture: Flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.
    ``forward`` returns the raw output of the last linear layer (no softmax).

    Args:
        in_features: Number of values per sample after flattening. The default
            of 42 matches 21 landmarks with 2 coordinates each.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, in_features=42, hidden1=120, hidden2=100, num_classes=3):
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names as before, so state_dict keys and seeded initialisation of the
        # default configuration are unchanged.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Map a batch of samples to a ``(batch, num_classes)`` score tensor."""
        x = self.flatten(x)  # collapse every dimension after the batch dimension
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [99]:
# Fresh network, cross-entropy loss on its output scores, and SGD with
# learning rate 0.1 and momentum 0.1.
model = HandNetwork()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1, momentum=0.1)

In [100]:
def train_one_epoch(report_every=10):
    """Run one optimisation pass over ``training_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``training_loader``. After every ``report_every`` batches the mean loss of
    that group of batches is printed.

    Args:
        report_every: Number of batches per printed loss report (default 10).

    Returns:
        float: Mean per-batch loss of the most recently completed report group.
        If the loader yields fewer than ``report_every`` batches, the mean over
        all batches that were processed (previously this case returned 0).
        ``0.0`` if the loader yields no batches at all.

    Raises:
        ValueError: If ``report_every`` is smaller than 1.
    """
    if report_every < 1:
        raise ValueError("report_every must be a positive integer")

    running_loss = 0.0
    last_loss = 0.0
    pending = 0        # batches accumulated since the last report
    reported = False   # whether at least one full report group completed

    for i, data in enumerate(training_loader):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()  # calculate the gradients
        optimizer.step()  # update the params

        running_loss += loss.item()
        pending += 1
        if pending == report_every:
            last_loss = running_loss / report_every  # loss per batch
            print(f'  batch {i+1} loss: {last_loss}')
            running_loss = 0.0
            pending = 0
            reported = True

    # With fewer batches than one report group nothing was reported above;
    # fall back to the mean over what was seen instead of returning 0.
    if not reported and pending:
        last_loss = running_loss / pending
        print(f'  batch {pending} loss: {last_loss}')

    return last_loss

In [101]:
# Timestamp used to give the checkpoints of this run unique file names.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Zero-based index of the current epoch; used in the log line and checkpoint name.
epoch_number = 0

EPOCHS = 10

# Lowest average validation loss seen so far; starts high so the first epoch wins.
best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    model.train(True)

    avg_loss = train_one_epoch()

    # Switch to evaluation mode and actually disable gradient tracking for the
    # validation pass; accumulating plain floats keeps no autograd graph alive.
    model.train(False)

    running_vloss = 0.0
    num_vbatches = 0
    with torch.no_grad():
        for vinputs, vlabels in validation_loader:
            voutputs = model(vinputs)
            running_vloss += loss_fn(voutputs, vlabels).item()
            num_vbatches += 1

    # An empty validation loader yields NaN, which never compares as "better"
    # below, so no checkpoint is written for it.
    avg_vloss = running_vloss / num_vbatches if num_vbatches else float('nan')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        # Writes the checkpoint into the current working directory.
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 10 loss: 1.113238763809204
  batch 20 loss: 1.0558824300765992
  batch 30 loss: 1.057293999195099
LOSS train 1.057293999195099 valid 0.9899702072143555
EPOCH 2:
  batch 10 loss: 1.0305298686027526
  batch 20 loss: 0.9487264335155488
  batch 30 loss: 0.9574961841106415
LOSS train 0.9574961841106415 valid 0.8579675555229187
EPOCH 3:
  batch 10 loss: 0.7739121735095977
  batch 20 loss: 0.6138310134410858
  batch 30 loss: 0.5274294495582581
LOSS train 0.5274294495582581 valid 0.9383283257484436
EPOCH 4:
  batch 10 loss: 0.3090458258986473
  batch 20 loss: 0.3609661117196083
  batch 30 loss: 0.45305762737989425
LOSS train 0.45305762737989425 valid 0.21307110786437988
EPOCH 5:
  batch 10 loss: 0.16431054323911667
  batch 20 loss: 0.3225617468357086
  batch 30 loss: 0.08997641056776047
LOSS train 0.08997641056776047 valid 0.08713100105524063
EPOCH 6:
  batch 10 loss: 0.08480656733736396
  batch 20 loss: 0.07881109770387411
  batch 30 loss: 0.03611126132309437
LOSS train 0.036

In [102]:
# Count how many validation samples the network classifies correctly:
# the predicted class is the index of the largest output score.
model.eval()
total = len(validation_set)
with torch.no_grad():
    correct_count = sum(
        (model(batch_inputs).argmax(dim=1) == batch_labels).sum().item()
        for batch_inputs, batch_labels in validation_loader
    )

In [103]:
# Validation accuracy of the network: correctly classified samples / all validation samples.
correct_count/total

1.0

In [165]:
from sklearn.neighbors import KNeighborsClassifier

In [166]:
# Second model to compare against the network: a k-nearest-neighbours classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)

In [167]:
# Inspect the dimensions of the training tensor before flattening it for KNN.
train_data.shape

torch.Size([135, 21, 2])

In [168]:
# Preview the training data flattened to one row of 21*2 values per sample.
# -1 lets PyTorch infer the sample count instead of hard-coding it, so the cell
# keeps working when the number of images with a detected hand changes.
train_data.view(-1, 21*2)

tensor([[0.2628, 0.3843, 0.3132,  ..., 0.5272, 0.2535, 0.4854],
        [0.2578, 0.3948, 0.3096,  ..., 0.5330, 0.2556, 0.4921],
        [0.2476, 0.3998, 0.2953,  ..., 0.5155, 0.2336, 0.4802],
        ...,
        [0.3773, 0.6548, 0.3923,  ..., 0.6877, 0.4692, 0.6789],
        [0.3763, 0.6591, 0.3913,  ..., 0.6898, 0.4686, 0.6789],
        [0.3564, 0.6563, 0.3718,  ..., 0.6929, 0.4502, 0.6840]])

In [169]:
# view() returned a new tensor without being assigned; train_data itself keeps its shape.
train_data.shape

torch.Size([135, 21, 2])

In [170]:
# Fit KNN on one flat row of 21*2 landmark coordinates per training sample.
# -1 lets PyTorch infer the sample count; the previously hard-coded 135 broke
# as soon as the number of images with a detected hand changed.
knn.fit(train_data.view(-1, 21*2), train_labels)

In [171]:
from sklearn.metrics import accuracy_score

In [172]:
# Flatten each validation sample to a row of 42 values, then classify all rows with KNN.
val_features = val_data.view(-1, 42)
y_pred = knn.predict(val_features)

In [186]:
# Preview the first validation sample as a single row of 42 values (a batch of one).
val_data.view(-1,21*2)[0].view(1,42)

tensor([[0.3923, 0.1972, 0.4316, 0.2444, 0.4503, 0.3450, 0.4482, 0.4423, 0.4481,
         0.5086, 0.4760, 0.3307, 0.4768, 0.4531, 0.4709, 0.5305, 0.4674, 0.5880,
         0.4438, 0.3205, 0.4314, 0.4712, 0.4205, 0.4463, 0.4221, 0.3985, 0.4062,
         0.3144, 0.3921, 0.4440, 0.3861, 0.4099, 0.3909, 0.3660, 0.3692, 0.3091,
         0.3662, 0.4117, 0.3645, 0.3888, 0.3692, 0.3518]])

In [190]:
# Classify just the first validation sample, passed to KNN as a batch of one row.
first_val_sample = val_data.view(-1, 21 * 2)[0].view(1, 42)
out = knn.predict(first_val_sample)
out

array([0])

In [192]:
# Unwrap the one-element prediction array into a plain Python scalar.
out.item()

0

In [178]:
# Show the KNN predictions for the whole validation set.
y_pred

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2])

In [173]:
# Show the true validation labels as a NumPy array, for comparison with y_pred.
val_labels.numpy()

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2])

In [177]:
# Fraction of validation samples for which the KNN prediction matches the true label.
accuracy_score(y_true=val_labels.numpy(), y_pred=y_pred)

1.0

In [175]:
# Display the fitted KNN estimator.
knn

In [176]:
# Persist the fitted KNN classifier as a pickle file.
with open("../models/model2.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(knn))