# Demo for Self-Trained Model

## Instructions
Run the cells below in order to initialise the models and parameters.
Then run the 'Actual Program Code' cell (labelled below) to start the demo.

## Requirements
Ensure that torch, torchvision, OpenCV (`opencv-python`), numpy and Pillow are installed; all of them can be installed with pip.
Also clone YOLOv5 (`git clone https://github.com/ultralytics/yolov5`) into the same folder as this notebook.

In [1]:
import torch
import cv2
import numpy as np

from PIL import Image
from torchvision import transforms


In [2]:
import sys

# Add the parent directory and the pose-estimation folder to the import search
# path (both relative to the current working directory) so the
# Hand_Pose_Estimation_2D imports below can be resolved.
sys.path.extend(['../', '../Hand_Pose_Estimation_2D'])
print(sys.path)


from Hand_Pose_Estimation_2D.Utils.model import ShallowUNet
from Hand_Pose_Estimation_2D.Utils.utils import (
    COLORMAP,
    heatmaps_to_coordinates,
    N_KEYPOINTS,
    RAW_IMG_SIZE,
    MODEL_IMG_SIZE,
    show_batch_predictions,
    DATASET_MEANS,
    DATASET_STDS,
)

['c:\\Users\\junha\\OneDrive\\Documents\\SUTD\\Term 7\\50.035 - Computer Vision\\50.035CV-C01-Team10-Visual-Interactive-Game\\Hand_Detection_YOLOv5', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39', '', 'C:\\Users\\junha\\AppData\\Roaming\\Python\\Python39\\site-packages', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\win32', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\win32\\lib', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\Pythonwin', 'C:\\Users\\junha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\junha\\.ipy

## Loading Trained Model

In [3]:
import os

# Path to the trained YOLOv5 hand-detector weights, relative to the working
# directory. Built with os.path.join so the separator matches the host OS: on
# Windows this is still '.\\weights\\best.pt', on POSIX it becomes
# './weights/best.pt' instead of a file name containing literal backslashes.
BEST_WEIGHT_PATH = os.path.join('.', 'weights', 'best.pt')

# Load the 'custom' YOLOv5 entry point from the ultralytics/yolov5 hub repo
# with the weights above.
model = torch.hub.load('ultralytics/yolov5', 'custom', BEST_WEIGHT_PATH)

Using cache found in C:\Users\junha/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-4-16 torch 1.11.0+cu113 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)

Fusing layers... 
Model summary: 213 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


In [4]:
import os

# Path to the trained keypoint-network state dict, relative to the working
# directory. os.path.join keeps the value identical on Windows
# ('..\\Hand_Pose_Estimation_2D\\model_final') and valid on POSIX systems.
UNET_PATH = os.path.join('..', 'Hand_Pose_Estimation_2D', 'model_final')

# The UNet is pinned to the CPU. pre_process() builds its input tensor without
# moving it to any device, so switching to CUDA here also requires moving the
# inputs. GPU alternative:
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cpu")

# NOTE(review): 3 and 21 are presumably input channels and keypoint count
# (cf. the imported N_KEYPOINTS) -- confirm against ShallowUNet's signature.
unet_model = ShallowUNet(3, 21)
unet_model.load_state_dict(
    # map_location remaps the stored tensors onto DEVICE while loading.
    torch.load(UNET_PATH, map_location=DEVICE)
)
unet_model.to(DEVICE)
# Inference mode: disables training-only layer behaviour such as dropout.
unet_model.eval()
print("Model loaded")

Model loaded


## Config Variables

In [5]:
# Skeleton definition used when drawing a hand: each inner list is a chain of
# keypoint indices, and draw_hands() draws a line between every pair of
# consecutive indices in a chain.
# NOTE(review): indices presumably follow the common 21-keypoint hand layout
# (0 = wrist, then four points per finger) -- confirm against the dataset.
JOINT_LIST = [
    [0, 1, 2, 3, 4],
    [0, 5, 6, 7, 8],
    # These two chains do not start at 0; they are linked to the rest of the
    # skeleton only through the last chain ([5, 9, 13, 17]).
    [9, 10, 11, 12],
    [13, 14, 15, 16],
    [0, 17, 18, 19, 20],
    [5, 9, 13, 17]
]

## WebCam Testing

In [6]:
def plot_boxes(labels, cord, frame):
    """
    Draw a bounding box and a confidence label for every confident detection.

    :param labels: per-detection labels; only the number of entries is used.
    :param cord: per-detection rows ``[x1, y1, x2, y2, confidence]`` whose
        corner coordinates are fractions of the frame width/height.
    :param frame: image array to draw on; it is modified in place.
    :return: The same ``frame`` object, with boxes and labels drawn on it.
    """
    if len(labels) == 0:
        return frame

    x_shape, y_shape = frame.shape[1], frame.shape[0]
    # Pure green, so it looks the same whether the frame is RGB or BGR.
    box_colour = (0, 255, 0)

    for _, row in zip(labels, cord):
        confidence = row[4]
        if confidence >= 0.5:
            # Scale the normalised corners to pixel coordinates.
            x1, y1 = int(row[0] * x_shape), int(row[1] * y_shape)
            x2, y2 = int(row[2] * x_shape), int(row[3] * y_shape)
            cv2.rectangle(frame, (x1, y1), (x2, y2), box_colour, 2)
            # The confidence is a 0-1 fraction: convert it to a real percentage
            # rather than printing the raw float followed by '%'.
            cv2.putText(frame, f'Hand: {confidence * 100:.0f}%', (x1, y1),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, box_colour, 2)

    return frame

In [7]:
def crop_image(cord, frame):
    """
    Cut the region of one detection out of a frame.

    :param cord: detection row ``[x1, y1, x2, y2, confidence]`` with corner
        coordinates given as fractions of the frame width/height.
    :param frame: image array indexed as ``frame[row, column]``.
    :return: The slice of ``frame`` covered by the box when the confidence is
        at least 0.5; otherwise ``frame`` itself, untouched.
    """
    frame_w = frame.shape[1]
    frame_h = frame.shape[0]

    # Detections below the confidence threshold are not cropped.
    if not (cord[4] >= 0.5):
        return frame

    # Scale the normalised corners to pixel indices (truncating towards zero).
    left, right = int(cord[0] * frame_w), int(cord[2] * frame_w)
    top, bottom = int(cord[1] * frame_h), int(cord[3] * frame_h)
    return frame[top:bottom, left:right]

In [8]:
def pre_process(image):
    """
    Turn an image array into a normalised, batched tensor for the keypoint model.

    :param image: image array; ``shape[0]``/``shape[1]`` are read as height/width
        and the array is converted with ``Image.fromarray``.
    :return: The transformed image tensor with a leading batch dimension added.
    """
    longest_side = max(image.shape[1], image.shape[0])

    steps = [
        # Crop size equals the longer side, so the result is square.
        # NOTE(review): torchvision documents that CenterCrop zero-pads edges
        # shorter than the crop size -- confirm for the installed version.
        transforms.CenterCrop(longest_side),
        transforms.Resize(MODEL_IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=DATASET_MEANS, std=DATASET_STDS),
    ]
    to_model_input = transforms.Compose(steps)

    pil_image = Image.fromarray(image)
    # unsqueeze(0) adds the batch dimension in front.
    return to_model_input(pil_image).unsqueeze(0)

In [9]:
def draw_hands(preds, crop_image, frame):
    """
    Draw predicted hand keypoints and skeleton lines onto a cropped hand image.

    A later cell defines another ``draw_hands`` with a different signature;
    when the cells are run in order, that definition replaces this one.

    :param preds: model output tensor; it is detached, moved to the CPU and
        passed to ``heatmaps_to_coordinates``.
    :param crop_image: cropped image array to draw on (shadows the
        ``crop_image`` function inside this body).
    :param frame: unused.
    :return: The cropped image with keypoints and joint lines drawn on it.
    """
    width, height = crop_image.shape[1], crop_image.shape[0]

    heatmaps = preds.detach().cpu().numpy()
    # NOTE(review): keypoints are presumably normalised (x, y) pairs -- confirm
    # against heatmaps_to_coordinates.
    normalised_points = heatmaps_to_coordinates(heatmaps).squeeze(0)

    # Keypoints are scaled by the longer side, then shifted back by half the
    # difference between the sides along the shorter axis.
    if width > height:
        side = width
        padding = [0, (width - height) // 2]
    else:
        side = height
        padding = [(height - width) // 2, 0]

    keypoint_tuples = []
    for point in normalised_points:
        scaled = np.multiply(point, side) - padding
        keypoint_tuples.append(tuple(scaled.astype(int)))

    for centre in keypoint_tuples:
        cv2.circle(crop_image, centre, 2, (255, 0, 0), 2)

    # Join consecutive keypoints of every chain in JOINT_LIST.
    for segment in JOINT_LIST:
        for start, end in zip(segment, segment[1:]):
            crop_image = cv2.line(crop_image, keypoint_tuples[start], keypoint_tuples[end], (0, 128, 0), 1, cv2.LINE_AA)
    return crop_image

In [9]:
def draw_hands(crop_coords, image, model):
    """
    Estimate hand keypoints inside one detection box and draw them on the frame.

    :param crop_coords: detection row ``[x1, y1, x2, y2, confidence]`` with
        corner coordinates given as fractions of the frame width/height.
    :param image: full frame; keypoints and joint lines are drawn on it in place.
    :param model: keypoint network called on the pre-processed crop; its output
        is passed to ``heatmaps_to_coordinates``.
    :return: The same ``image`` object. It is returned unchanged when the
        detection confidence is below 0.5 or the box covers no pixels.
    """
    # crop_image() only crops when the confidence is >= 0.5 and otherwise hands
    # back the whole frame, while the scaling/offset below always assumes a
    # box-sized crop. Skip weak detections so no misplaced skeleton is drawn.
    if not (crop_coords[4] >= 0.5):
        return image

    image_cropped = crop_image(crop_coords, image)
    x_shape, y_shape = image_cropped.shape[1], image_cropped.shape[0]
    if x_shape == 0 or y_shape == 0:
        # Degenerate box: there are no pixels to run the model on.
        return image

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        preds = model(pre_process(image_cropped))

    # Top-left corner of the box in frame pixels; maps crop coordinates back
    # onto the full frame.
    x1 = int(crop_coords[0] * image.shape[1])
    y1 = int(crop_coords[1] * image.shape[0])

    pred_keypoints = heatmaps_to_coordinates(preds.detach().cpu().numpy()).squeeze(0)

    # pre_process() centre-crops to a square whose side is the longer crop
    # side, so scale by that side and remove the half-difference along the
    # shorter axis.
    if x_shape > y_shape:
        side, pad = x_shape, [0, (x_shape - y_shape) / 2]
    else:
        side, pad = y_shape, [(y_shape - x_shape) / 2, 0]

    points = np.multiply(pred_keypoints, side)
    points -= pad
    points += [x1, y1]
    # tolist() yields plain Python ints for the OpenCV drawing calls.
    keypoint_tuples = [tuple(point) for point in points.astype(int).tolist()]

    for centre in keypoint_tuples:
        cv2.circle(image, centre, 2, (255, 0, 0), 2)

    # Join consecutive keypoints of every chain in JOINT_LIST.
    for segment in JOINT_LIST:
        for start, end in zip(segment, segment[1:]):
            cv2.line(image, keypoint_tuples[start], keypoint_tuples[end], (0, 128, 0), 1, cv2.LINE_AA)

    return image

## Actual Program Code:
Ensure you have run all of the cells above before running the cell below, and that your webcam is not being used by any other application. Press 'q' on the keyboard (with the video window focused) to exit.

In [10]:
# NOTE: these two names are not referenced anywhere in this cell; they are kept
# in case other cells rely on them.
Object_colors = list(np.random.rand(80,3)*255)
Object_classes = ['Hand']

# Open the default camera (device index 0).
cap = cv2.VideoCapture(0)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered (camera busy, unplugged or stream ended):
            # stop cleanly instead of crashing on a None frame in cvtColor.
            break

        # Change colour format from BGR to RGB
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Flip image along y axis (mirror view)
        image = cv2.flip(image, 1)

        # Mark the frame read-only only while the detector runs, then make it
        # writeable again: the drawing calls below modify the frame in place,
        # and recent OpenCV builds reject read-only arrays as output arguments.
        image.flags.writeable = False
        objs = model(image)
        image.flags.writeable = True

        # Rows of normalised detections for the single input image: the last
        # column is used as the label, the rest as box corners + confidence.
        detections = objs.xyxyn[0].cpu().numpy()
        labels, cord_thres = detections[:, -1], detections[:, :-1]

        image = plot_boxes(labels, cord_thres, image)

        for crop_coords in cord_thres:
            image = draw_hands(
                crop_coords=crop_coords,
                image=image,
                model=unet_model,
            )

        # Change colour format from RGB to BGR for display
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        cv2.imshow('Hand Tracking', image)

        # Poll the keyboard for 10 ms; lowercase 'q' ends the demo.
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Always free the webcam and close the window, even if a frame fails.
    cap.release()
    cv2.destroyAllWindows()



torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])
torch.Size([1, 3, 128, 128])


If the demo crashes, run the cell below to release the webcam so that other applications can use it.

In [14]:
# Manual cleanup: release the capture device opened by the demo cell and close
# any OpenCV windows that are still open.
cap.release()
cv2.destroyAllWindows()