<a href="https://colab.research.google.com/github/minjae0501/yolo_block/blob/master/lstm_yolo_detecting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
# !git clone https://github.com/minjae0501/yolo_block.git

Cloning into 'yolo_block'...
remote: Enumerating objects: 3012, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 3012 (delta 3), reused 13 (delta 1), pack-reused 2993[K
Receiving objects: 100% (3012/3012), 117.88 MiB | 50.70 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [1]:
# Dependencies (uncomment on a fresh environment such as Colab):
# %pip install ultralytics
# %pip install mediapipe
import ultralytics
# Print the Ultralytics environment summary (library/Python/torch versions, device, resources).
ultralytics.checks()

Ultralytics YOLOv8.0.227 🚀 Python-3.9.13 torch-1.12.1+cu116 CUDA:0 (NVIDIA GeForce RTX 2070, 8192MiB)
Setup complete ✅ (8 CPUs, 15.9 GB RAM, 210.8/232.3 GB disk)


In [2]:
import cv2
import mediapipe as mp
from ultralytics import YOLO
import numpy as np
import torch.nn as nn

In [3]:
import torch

# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

cuda:0


In [4]:
# ver 9
class hand_LSTM(nn.Module):
    """Three stacked bidirectional LSTM stages, attention pooling, and a 2-class head.

    Every LSTM stage is followed by LayerNorm and Dropout(0.1).  Layer
    normalization is used instead of batch normalization: batch normalization
    may not work well in recurrent networks because of their dependence on
    time, whereas layer normalization is not affected by it and therefore
    suits RNNs/LSTMs better.

    All LSTMs are bidirectional (the sequence is read forwards and backwards),
    so each stage outputs twice its hidden size per time step:
    67 -> 256 -> 128 -> 64.

    Args:
        num_layers: number of stacked layers inside each nn.LSTM (default 1).
    """

    def __init__(self, num_layers=1):
        super().__init__()

        # Stage 1: 67 input features -> 2 * 128 = 256
        self.lstm1 = nn.LSTM(input_size=67, hidden_size=128, num_layers=num_layers,
                             batch_first=True, bidirectional=True)
        self.layer_norm1 = nn.LayerNorm(256)
        self.dropout1 = nn.Dropout(p=0.1)

        # Stage 2: 256 -> 2 * 64 = 128
        self.lstm2 = nn.LSTM(input_size=256, hidden_size=64, num_layers=num_layers,
                             batch_first=True, bidirectional=True)
        self.layer_norm2 = nn.LayerNorm(128)
        self.dropout2 = nn.Dropout(p=0.1)

        # Stage 3: 128 -> 2 * 32 = 64
        self.lstm3 = nn.LSTM(input_size=128, hidden_size=32, num_layers=num_layers,
                             batch_first=True, bidirectional=True)
        self.layer_norm3 = nn.LayerNorm(64)
        self.dropout3 = nn.Dropout(p=0.1)

        # One attention score per time step, then the final 2-way classifier.
        self.attention = nn.Linear(64, 1)
        self.fc = nn.Linear(64, 2)

    def forward(self, x):
        """Return the 2 class scores for each sequence in the batch.

        Args:
            x: batch-first input whose last dimension is 67
               (batch_first=True, so dim 1 is the time axis).

        Returns:
            Tensor with one row of 2 scores per sequence.
        """
        stages = (
            (self.lstm1, self.layer_norm1, self.dropout1),
            (self.lstm2, self.layer_norm2, self.dropout2),
            (self.lstm3, self.layer_norm3, self.dropout3),
        )
        for recurrent, normalize, drop in stages:
            x, _ = recurrent(x)
            x = drop(normalize(x))

        # Attention mechanism: softmax over the time axis, then a weighted sum
        # collapses the sequence into a single 64-dimensional vector.
        weights = torch.softmax(self.attention(x), dim=1)
        pooled = (weights * x).sum(dim=1)

        return self.fc(pooled)

In [5]:
# Load the trained LSTM weights and put the network into inference mode.
model_path = './lstm_pth/more_data_lstm_model_ver9.pth'

lstm_model = hand_LSTM()
lstm_model = lstm_model.to(device)
# map_location remaps the stored tensors onto `device` while loading.
lstm_model.load_state_dict(torch.load(model_path, map_location=device))
lstm_model.eval()
print("모델이 성공적으로 불러와졌습니다.")

모델이 성공적으로 불러와졌습니다.


In [6]:
# Initialize the YOLO object-detection model
# `best_model` is the path of the weights file handed to the YOLO constructor.
best_model = './block_model/block_best_02.pt'
# best_model = 'best.pt'
yolo_model = YOLO(best_model)

In [7]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Dataset over records of the form {'key': label, 'value': sequence}.

    Labels are kept in ``self.y`` and sequences in ``self.X``; items are
    converted to tensors lazily in ``__getitem__``.
    """

    def __init__(self, seq_list):
        # Read every record once (label first, then data) and split the
        # pairs into two parallel lists.
        records = [(entry['key'], entry['value']) for entry in seq_list]
        self.y = [label for label, _ in records]
        self.X = [sequence for _, sequence in records]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        """Return (sequence tensor, label tensor) for the record at ``index``."""
        sequence = self.X[index]
        target = self.y[index]
        features = torch.Tensor(np.array(sequence))
        label_tensor = torch.tensor(np.array(int(target)))
        return features, label_tensor

In [13]:
# --- Read the input video and sample frames for analysis ---
length = 20       # number of per-frame feature rows in one LSTM input window
interval = 1      # keep one frame every `interval` frames
detect_cls = 1    # NOTE(review): the analysis cell reassigns this to 0 before reading it

video_path = './data/video_data/val/test.mp4'

cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise when the file is missing or unreadable; without
# this check the notebook would silently continue with zero frames.
if not cap.isOpened():
    cap.release()
    raise IOError('Could not open video: {}'.format(video_path))

img_list = []
try:
    # NOTE: the counter starts at 0 and is compared *before* being incremented,
    # so the first `interval` frames are never kept (with interval=1 only the
    # very first frame is dropped). Left unchanged so frame indices stay the same.
    cnt = 0
    while True:
        ret, img = cap.read()
        if not ret:
            break
        if cnt == interval:
            # Resize only the frames that are kept; every stored frame is 640x640.
            img_list.append(cv2.resize(img, (640, 640)))
            cnt = 0
        cnt += 1
finally:
    # Always free the capture handle, even if decoding/resizing raised.
    cap.release()

print('저장된 frame의 개수 {}'.format(len(img_list)))

저장된 frame의 개수 673


In [14]:
from tqdm import tqdm

# --- Per-frame analysis ---
# For every sampled frame: MediaPipe hand landmarks (x, y, z each) plus four
# offsets between two YOLO boxes form one feature row. Every `length` rows are
# fed to the LSTM, whose prediction ('Release' / 'Grab') is drawn on the frames.
lstm_model.eval()
out_img_list = []      # annotated frames, consumed by the video-writer cell
status = 'None'        # overlay text until the first full window is classified

detect_cls = 0         # YOLO class id whose box is drawn in green and paired with the hand box
hand_cls = 6           # YOLO class id used as the hand box (drawn in red)

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

print('시퀀스 데이터 분석 중..')
xyz_list_list = []     # feature rows of the window currently being filled

# Using Hands as a context manager releases the MediaPipe resources when the
# loop finishes or raises (the object was previously never closed).
with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
    for img in tqdm(img_list):
        # OpenCV frames are BGR; convert to RGB before passing them to MediaPipe.
        results = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # Frames without a detected hand are skipped entirely: they add no
        # feature row and are not part of the output video.
        if not results.multi_hand_landmarks:
            continue

        xyz_list = []
        for hand_landmarks in results.multi_hand_landmarks:
            for landmark in hand_landmarks.landmark:
                xyz_list.extend((landmark.x, landmark.y, landmark.z))
            mp_drawing.draw_landmarks(img, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # YOLO boxes (predicted on the frame after the landmarks were drawn onto it)
        box_results = yolo_model.predict(img, conf=0.6, verbose=False, show=False)[0].boxes
        boxes = box_results.xyxy.cpu().tolist()
        box_class = box_results.cls.cpu().tolist()

        # Coordinates stay 0 when a class is not detected; if a class is
        # detected more than once, the last matching box wins.
        x1, y1, x2, y2 = 0, 0, 0, 0
        hx1, hy1, hx2, hy2 = 0, 0, 0, 0
        for idx, cls in enumerate(box_class):
            if int(cls) == detect_cls:
                x1, y1, x2, y2 = (int(v) for v in boxes[idx])
                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
            elif int(cls) == hand_cls:
                hx1, hy1, hx2, hy2 = (int(v) for v in boxes[idx])
                cv2.rectangle(img, (hx1, hy1), (hx2, hy2), (0, 0, 255), 2)

        # Absolute corner offsets between the two boxes, scaled by the 640-px frame side.
        xyz_list.append(abs(x1 - hx1) / 640)
        xyz_list.append(abs(x2 - hx2) / 640)
        xyz_list.append(abs(y1 - hy1) / 640)
        xyz_list.append(abs(y2 - hy2) / 640)

        xyz_list_list.append(xyz_list)

        if len(xyz_list_list) == length:
            # Batch of one window: float32 tensor of shape (1, length, features).
            window = torch.tensor(np.array(xyz_list_list), dtype=torch.float32).unsqueeze(0).to(device)
            xyz_list_list = []   # windows do not overlap; start collecting the next one
            with torch.no_grad():
                result = lstm_model(window)
                _, out = torch.max(result, 1)
            # Class index 0 -> 'Release', anything else -> 'Grab'.
            status = 'Release' if out.item() == 0 else 'Grab'

        # The most recent prediction stays on screen until the next window completes.
        cv2.putText(img, status, (0, 50), cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 225), 2)
        out_img_list.append(img)

시퀀스 데이터 분석 중..


100%|██████████| 673/673 [00:22<00:00, 30.39it/s]


In [15]:
# --- Write the annotated frames to a video file ---
filename = './data/video_data/output/video_out.mp4'
fourcc = cv2.VideoWriter_fourcc(*'DIVX')
fps = 30
frameSize = (640, 640)   # same size the frames were resized to when the video was read
isColor = True
out = cv2.VideoWriter(filename, fourcc, fps, frameSize, isColor)
# VideoWriter does not raise when it cannot open the target (e.g. the output
# directory does not exist or the codec is unavailable); without this check
# the cell would finish "successfully" and produce no usable file.
if not out.isOpened():
    out.release()
    raise IOError('Could not open video writer for: {}'.format(filename))
try:
    for out_img in out_img_list:
        out.write(out_img)
finally:
    # Finalize the file even if writing a frame failed.
    out.release()