# Using Keypoint by mediapipe

In [1]:
import os
import random
import pandas as pd
import numpy as np

import cv2
import mediapipe as mp

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

### Hyperparameter Setting

In [2]:
CFG = {
    'FPS':30,
    'IMG_SIZE':128,
    'EPOCHS':30,
    'LEARNING_RATE':2e-4,
    'BATCH_SIZE':16,
    'SEED':41
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

### Data Load

In [4]:
data_root_path = r"D:\AI_Data\TV_HMCR"

In [5]:
df_train = pd.read_csv(os.path.join(data_root_path, "train.csv"))
df_train["path"] = df_train["path"].apply(lambda x: os.path.join(data_root_path, os.path.join(x.split("/")[1], x.split("/")[2])))
display(df_train.head(3))

df_test = pd.read_csv(os.path.join(data_root_path, "test.csv"))
df_test["path"] = df_test["path"].apply(lambda x: os.path.join(data_root_path, os.path.join(x.split("/")[1], x.split("/")[2])))
display(df_test.head(3))

df_train.shape, df_test.shape

Unnamed: 0,id,path,label
0,TRAIN_000,D:\AI_Data\TV_HMCR\train\TRAIN_000.mp4,3
1,TRAIN_001,D:\AI_Data\TV_HMCR\train\TRAIN_001.mp4,0
2,TRAIN_002,D:\AI_Data\TV_HMCR\train\TRAIN_002.mp4,1


Unnamed: 0,id,path
0,TEST_000,D:\AI_Data\TV_HMCR\test\TEST_000.mp4
1,TEST_001,D:\AI_Data\TV_HMCR\test\TEST_001.mp4
2,TEST_002,D:\AI_Data\TV_HMCR\test\TEST_002.mp4


((610, 3), (153, 2))

### Resize

In [6]:
def get_video(path):
    frames = []
    cap = cv2.VideoCapture(path)
    for _ in range(CFG["FPS"]):
        _, img = cap.read()
        img = cv2.resize(img, (CFG["IMG_SIZE"], CFG["IMG_SIZE"]))
        img = img / 255.
        frames.append(img)
    return frames

### Train / Validation Split

In [7]:
train, val, _, _ = train_test_split(df_train, df_train["label"], test_size=0.2, random_state=CFG["SEED"])

train.shape, val.shape

((488, 3), (122, 3))

### Keypoint Extract

In [8]:
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

In [19]:
from copy import deepcopy
from sklearn.metrics.pairwise import itertools

class Detecter():
    def __init__(self, mode=False, maxHands=2, detection_conf=1, track_conf=0.5):
        self.mode = mode
        self.maxHands = maxHands
        self.detection_conf = detection_conf
        self.track_conf = track_conf
        self.mpHands = mp.solutions.hands
        self.hands = self.mpHands.Hands(self.mode, self.maxHands, self.detection_conf, self.track_conf)
        self.mpDraw = mp.solutions.drawing_utils
        
    def find_hands(self, img, draw=False):
        rgb_img = cv2.flip(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), 1)
        self.results = self.hands.process(rgb_img)
        if self.results.multi_hand_landmarks:
            for hand_mk, hand_ness in zip(self.results.multi_hand_landmarks, self.results.multi_handedness):
                if draw: self.mpDraw.draw_landmarks(img, hand_mk, self.mpHands.HAND_CONNECTIONS)
        return img
    
    def find_position(self, img, hand_no=0, draw=False):
        mk = []
        if self.results.multi_hand_landmarks:
            hand = self.results.multi_hand_landmarks[hand_no]
            for _, mk_part in enumerate(hand.landmark):
                h, w, c = img.shape
                # cx, cy = min(int(mk_part.x*w), w-1), min(int(mk_part.y*h), h-1)
                # mk.append([cx, cy])
                cx, cy, cz = mk_part.x, mk_part.y, mk_part.z
                mk.append(torch.FloatTensor([cx, cy, cz]))
                if draw: cv2.circle(img, (cx, cy), 15, (255, 0, 255), cv2.FILLED)
        return mk

In [39]:
detecter = Detecter()

train_lmk = {}

for v in range(df_train.shape[0])[:3]:
    vPath = df_train.iloc[v]["path"]
    print(v)
    cap = cv2.VideoCapture(vPath)
    while True:
        ret, img = cap.read()
        if not ret: break
        img = detecter.find_hands(img)
        lmk = detecter.find_position(img)
        
        train_lmk[v] = lmk
        
for v in train_lmk.keys():
    train_lmk[v] = torch.stack(train_lmk[0])


0
1
2


TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [40]:
train_lmk.keys()

dict_keys([0, 1, 2])