In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os
import random
import gc
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

from IPython.display import Image, clear_output
from torch.cuda import memory_allocated, empty_cache
from torch.optim import Adam
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
from glob import glob
from tqdm import tqdm
# from google.colab.patches import cv2_imshow

%matplotlib inline

In [2]:
# torch_ver Yolov5
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s',
                            device='cuda:0' if torch.cuda.is_available() else 'cpu')  # 예측 모델
yolo_model.classes = [0]  # 예측 클래스 (0 : 사람)

Using cache found in /Users/kimsungwook/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-4-17 Python-3.11.7 torch-2.2.2 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [3]:
BATCH_SIZE = 6
EPOCH = 700
NUM_LAYERS = 1      # LSTM model: num_layers
start_dot = 11      # mp.solutions.pose 시작 포인트 (0: 얼굴부터 발목까지, 11: 어깨부터 발목까지)
n_CONFIDENCE = 0.3    # MediaPipe Min Detectin confidence check
y_CONFIDENCE = 0.3    # Yolv5 Min Detectin confidence check

mp_pose = mp.solutions.pose
attention_dot = [n for n in range(start_dot, 29)]

# 라인 그리기
if start_dot == 11:
    """몸 부분만"""
    draw_line = [[11, 13], [13, 15], [15, 21], [15, 19], [15, 17], [17, 19], \
                [12, 14], [14, 16], [16, 22], [16, 20], [16, 18], [18, 20], \
                [23, 25], [25, 27], [24, 26], [26, 28], [11, 12], [11, 23], \
                [23, 24], [12, 24]]
    print('Pose : Only Body')

else:
    """얼굴 포함"""
    draw_line = [[11, 13], [13, 15], [15, 21], [15, 19], [15, 17], [17, 19], \
                [12, 14], [14, 16], [16, 22], [16, 20], [16, 18], [18, 20], \
                [23, 25], [25, 27], [24, 26], [26, 28], [11, 12], [11, 23], \
                [23, 24], [12, 24], [9, 10], [0, 5], [0, 2], [5, 8], [2, 7]]
    print('Pose : Face + Body')

Pose : Only Body


In [4]:
video_path = '../zerobase_DL_project/datas/dataset' # dataset 경로
video_path

'../zerobase_DL_project/datas/dataset'

In [83]:
raw_data = []

In [6]:
os.listdir(video_path)

['abnormal', 'normal']

In [7]:
os.listdir(video_path + '/' + 'normal')

['04_normal_30.mp4',
 '07_normal_30.mp4',
 '10_normal_30.mp4',
 '03_normal_30.mp4',
 '09_normal_30.mp4',
 '08_normal_30.mp4',
 '02_normal_30.mp4',
 '06_normal_30.mp4',
 '01_normal_30.mp4',
 '05_normal_30.mp4']

In [8]:
int('01_normal_30.mp4'.split('_')[2][:2]) >= 30

True

In [9]:
'01_normal_30.mp4'.split('_')[1] == 'normal'

True

In [10]:
label = 0

In [11]:
# skel_data_n, skel_data_f = get_skeleton('{}/{}'.format(video_path + '/' + fold, video_name), attention_dot, draw_line)

In [12]:
a = '{}/{}'.format('../zerobase_DL_project/datas/dataset' + '/' + 'normal', '01_normal_30.mp4')

In [13]:
attention_dot

[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]

In [14]:
draw_line

[[11, 13],
 [13, 15],
 [15, 21],
 [15, 19],
 [15, 17],
 [17, 19],
 [12, 14],
 [14, 16],
 [16, 22],
 [16, 20],
 [16, 18],
 [18, 20],
 [23, 25],
 [25, 27],
 [24, 26],
 [26, 28],
 [11, 12],
 [11, 23],
 [23, 24],
 [12, 24]]

In [15]:
# Yolov4 바운딩 box 안에서 media pipe 데이터 전처리 함수

In [16]:
frame_length = 30 # LSTM 모델에 넣을 frame 수

In [17]:
xy_list_list, xy_list_list_flip = [], []

In [18]:
cv2.destroyAllWindows()

In [19]:
pose = mp_pose.Pose(static_image_mode = True, model_complexity = 1, \
                        enable_segmentation = False, min_detection_confidence = n_CONFIDENCE)

I0000 00:00:1713770130.419851 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [20]:
cap = cv2.VideoCapture(a)

In [21]:
cap.isOpened()

True

In [22]:
ret, img = cap.read()

In [23]:
ret

True

In [24]:
img

array([[[ 35,  34,  18],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [175, 160,  96],
        [178, 162, 101],
        [179, 164, 102]],

       [[ 35,  34,  18],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [127, 112,  48],
        [129, 113,  52],
        [130, 115,  53]],

       [[ 35,  34,  18],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [131, 117,  51],
        [130, 115,  51],
        [129, 113,  49]],

       ...,

       [[ 42,  40,  47],
        [ 42,  40,  47],
        [ 44,  41,  48],
        ...,
        [151, 130, 131],
        [148, 127, 129],
        [148, 127, 129]],

       [[ 40,  35,  42],
        [ 42,  38,  45],
        [ 46,  41,  48],
        ...,
        [151, 130, 131],
        [148, 127, 129],
        [148, 127, 129]],

       [[ 23,  18,  25],
        [ 25,  20,  27],
        [ 31,  26,  33],
        ...,
        [151, 130, 131],
        [148, 127, 129],
        [148, 127, 129]]

In [25]:
img = cv2.resize(img, (640, 640))

In [26]:
img

array([[[ 35,  34,  18],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [147, 135,  65],
        [153, 139,  73],
        [161, 145,  84]],

       [[ 35,  34,  18],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [151, 138,  67],
        [137, 124,  54],
        [130, 115,  51]],

       [[ 34,  33,  17],
        [ 35,  34,  18],
        [ 35,  34,  18],
        ...,
        [147, 134,  63],
        [146, 133,  62],
        [144, 130,  64]],

       ...,

       [[ 33,  32,  38],
        [ 28,  27,  34],
        [ 21,  20,  26],
        ...,
        [153, 132, 133],
        [153, 132, 133],
        [149, 128, 130]],

       [[ 42,  40,  47],
        [ 40,  38,  45],
        [ 28,  26,  33],
        ...,
        [153, 132, 133],
        [153, 132, 133],
        [148, 127, 129]],

       [[ 31,  26,  33],
        [ 43,  38,  45],
        [ 46,  42,  49],
        ...,
        [153, 132, 133],
        [153, 132, 133],
        [148, 127, 129]]

In [27]:
res = yolo_model(img)

In [28]:
res

YOLOv5 <class 'models.common.Detections'> instance
image 1/1: 640x640 1 person
Speed: 5.6ms pre-process, 130.2ms inference, 0.9ms NMS per image at shape (1, 3, 640, 640)

In [29]:
res_refine = res.pandas().xyxy[0].values

In [30]:
res_refine

array([[348.2632751464844, 37.53910827636719, 404.1838684082031, 373.28717041015625, 0.8492674231529236, 0, 'person']], dtype=object)

In [31]:
len(res_refine)

1

In [32]:
nms_human = len(res_refine)

In [33]:
bbox = [348.2632751464844, 37.53910827636719, 404.1838684082031, 373.28717041015625, 0.8492674231529236, 0, 'person']

In [34]:
xx1, yy1, xx2, yy2 = int(bbox[0])-10, int(bbox[1]), int(bbox[2])+10, int(bbox[3])

In [35]:
xx1

338

In [36]:
yy1

37

In [37]:
xx2

414

In [38]:
yy2

373

In [39]:
if xx1 < 0:
    xx1 = 0
elif xx2 > 639:
    xx2 = 639
if yy1 < 0:
    yy1 = 0
elif yy2 > 639:
    yy2 = 639

In [40]:
start_point = (xx1, yy1)
print(start_point)

(338, 37)


In [41]:
end_point = (xx2, yy2)
print(end_point)

(414, 373)


In [42]:
bbox[4] > y_CONFIDENCE

True

In [43]:
bbox[4]

0.8492674231529236

In [44]:
c_img = img[yy1:yy2, xx1:xx2]

In [45]:
c_img

array([[[168, 195, 185],
        [170, 193, 184],
        [168, 192, 182],
        ...,
        [ 75,  96, 109],
        [ 75,  96, 110],
        [ 70, 100, 108]],

       [[166, 193, 182],
        [166, 193, 182],
        [167, 194, 183],
        ...,
        [ 80,  97, 105],
        [ 74,  96, 105],
        [ 77,  97, 107]],

       [[166, 195, 185],
        [166, 195, 185],
        [166, 195, 185],
        ...,
        [ 85,  98, 107],
        [ 79,  96, 104],
        [ 79,  95, 105]],

       ...,

       [[182, 179, 176],
        [181, 178, 175],
        [181, 178, 175],
        ...,
        [101, 116, 126],
        [102, 117, 127],
        [102, 117, 128]],

       [[182, 179, 176],
        [181, 178, 175],
        [181, 178, 175],
        ...,
        [ 99, 114, 124],
        [ 97, 113, 123],
        [ 99, 114, 124]],

       [[182, 179, 176],
        [181, 178, 175],
        [181, 178, 175],
        ...,
        [ 97, 112, 123],
        [ 96, 111, 122],
        [ 97, 112, 123]]

In [46]:
results = pose.process(cv2.cvtColor(c_img, cv2.COLOR_BGR2RGB))

In [47]:
results

mediapipe.python.solution_base.SolutionOutputs

In [48]:
results.pose_landmarks

landmark {
  x: 0.7647152543067932
  y: 0.15660327672958374
  z: -1.738620400428772
  visibility: 0.9999854564666748
}
landmark {
  x: 0.759145975112915
  y: 0.13608139753341675
  z: -1.5799812078475952
  visibility: 0.9999537467956543
}
landmark {
  x: 0.7699788212776184
  y: 0.1356266438961029
  z: -1.580010175704956
  visibility: 0.9999368190765381
}
landmark {
  x: 0.7794783115386963
  y: 0.13507285714149475
  z: -1.5803334712982178
  visibility: 0.9999105930328369
}
landmark {
  x: 0.7030701637268066
  y: 0.13584226369857788
  z: -1.6820127964019775
  visibility: 0.999975323677063
}
landmark {
  x: 0.6740812063217163
  y: 0.1354929804801941
  z: -1.6822608709335327
  visibility: 0.9999821186065674
}
landmark {
  x: 0.6453526616096497
  y: 0.13525858521461487
  z: -1.6820390224456787
  visibility: 0.9999833106994629
}
landmark {
  x: 0.7177807092666626
  y: 0.14224877953529358
  z: -0.7812544703483582
  visibility: 0.9998499155044556
}
landmark {
  x: 0.5535531044006348
  y: 0.1436

In [49]:
bool(results.pose_landmarks)

True

In [50]:
ret

True

In [51]:
xy_list_list_flip

[]

In [52]:
xy_list_list

[]

In [53]:
idx = 0

In [54]:
draw_line_dic = {}

In [55]:
xy_list, xy_list_flip = [], []

In [56]:
len(results.pose_landmarks.landmark)

33

In [57]:
results.pose_landmarks.landmark[0]

x: 0.7647152543067932
y: 0.15660327672958374
z: -1.738620400428772
visibility: 0.9999854564666748

In [58]:
#cv2.imshow('img', c_img)

In [59]:
cv2.waitKey(5000)

-1

In [60]:
cv2.destroyAllWindows()

In [61]:
cv2.waitKey(1)

-1

In [62]:
def get_skeleton(video_path, attention_dot, draw_line):
    frame_length = 30 # LSTM 모델에 넣을 frame 수

    xy_list_list, xy_list_list_flip = [], []
    cv2.destroyAllWindows()
    pose = mp_pose.Pose(static_image_mode = True, model_complexity = 1, \
                        enable_segmentation = False, min_detection_confidence = n_CONFIDENCE)
    cap = cv2.VideoCapture(video_path)

    if cap.isOpened():

        while True:
            ret, img = cap.read()

            if ret == True:

                """ Yolo 바운딩 박스 및 좌표 추출"""
                img = cv2.resize(img, (640, 640))
                res = yolo_model(img)
                res_refine = res.pandas().xyxy[0].values
                nms_human = len(res_refine)
                if nms_human > 0:
                    for bbox in res_refine:
                        """바운딩 박스 상하좌우 크기 조절"""
                        xx1, yy1, xx2, yy2 = int(bbox[0])-10, int(bbox[1]), int(bbox[2])+10, int(bbox[3])
                        if xx1 < 0:
                            xx1 = 0
                        elif xx2 > 639:
                            xx2 = 639
                        if yy1 < 0:
                            yy1 = 0
                        elif yy2 > 639:
                            yy2 = 639

                        start_point = (xx1, yy1)
                        end_point = (xx2, yy2)

                        """ Yolov5 바운딩 박스 좌표 안에서 mediapipe Pose 추출"""
                        if bbox[4] > y_CONFIDENCE: # bbox[4] : confidence 데이터
                            # img = cv2.rectangle(img, start_point, end_point, (0, 0, 255), 2) # 바운딩 박스 그리기 : 데이터 추출 확인용
                            c_img = img[yy1:yy2, xx1:xx2] # 바운딩 박스 좌표
                            results = pose.process(cv2.cvtColor(c_img, cv2.COLOR_BGR2RGB)) # Yolov5 바운딩 박스 좌표 안에서 'mp_pose' 좌표

                            if not results.pose_landmarks: continue
                            idx = 0
                            draw_line_dic = {}
                            xy_list, xy_list_flip = [], []
                            # 33 반복문 진행 : 33개 중 18개의 dot
                            for x_and_y in results.pose_landmarks.landmark:
                                if idx in attention_dot:
                                    xy_list.append(x_and_y.x)
                                    xy_list.append(x_and_y.y)
                                    xy_list_flip.append(1 - x_and_y.x)
                                    xy_list_flip.append(x_and_y.y)

                                    x, y = int(x_and_y.x*(xx2-xx1)), int(x_and_y.y*(yy2-yy1))
                                    draw_line_dic[idx] = [x, y]
                                idx += 1

                            if len(xy_list) != len(attention_dot) * 2:
                                print('Error : attention_dot 데이터 오류')

                            xy_list_list.append(xy_list)
                            xy_list_list_flip.append(xy_list_flip)

                            """mediapipe line 그리기 부분 : 데이터 추출(dot) 확인용"""
                            # for line in draw_line:
                            #     x1, y1 = draw_line_dic[line[0]][0], draw_line_dic[line[0]][1]
                            #     x2, y2 = draw_line_dic[line[1]][0], draw_line_dic[line[1]][1]
                            #     c_img = cv2.line(c_img, (x1, y1), (x2, y2), (0, 255, 0), 4)
                            # # cv2.imshow('Landmark Image', img)
                            # cv2_imshow(img)
                            # cv2.waitKey(1)

            elif ret == False: break


        # 부족한 프레임 수 맞추기
        if len(xy_list_list_flip) < 15:
            return False, False
        elif len(xy_list_list_flip) < frame_length:
            f_ln = frame_length - len(xy_list_list_flip)
            for _ in range(f_ln):
                xy_list_list.append(xy_list_list[-1])
                xy_list_list_flip.append(xy_list_list_flip[-1])

    cap.release()
    cv2.destroyAllWindows()


    return xy_list_list, xy_list_list_flip

In [84]:
# 영상 데이터에서 mp pose landmark dot 데이터 추출 부분
raw_data = []

for fold in os.listdir(video_path):
    for video_name in os.listdir(video_path + '/' + fold):
        if int(video_name.split('_')[2][:2]) >= 30: # video name 참조
            if video_name.split('_')[1] == 'normal': label = 0
            else: label = 1
            skel_data_n, skel_data_f = get_skeleton('{}/{}'.format(video_path + '/' + fold, video_name), attention_dot, draw_line)
            if skel_data_n != False:

                seq_list_n = skel_data_n[:30]
                seq_list_f = skel_data_f[:30]
                raw_data.append({'key':label, 'value':seq_list_n})
                raw_data.append({'key':label, 'value':seq_list_f})
random.shuffle(raw_data)

I0000 00:00:1713772963.105791 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772966.575101 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772970.100734 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772973.624299 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772977.832882 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772981.550991 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772985.345221 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
I0000 00:00:1713772988.862360 1211075 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M2
Context leak detected, msgtracer returned -1
I0000 00:00:1713772992.346504 1211075 gl_context.cc:357] GL version: 2.1 (2

In [79]:
len(raw_data)

30

In [85]:
skel_data_n

[[0.7680401802062988,
  0.266559898853302,
  0.28077077865600586,
  0.2654447555541992,
  0.807593047618866,
  0.42858704924583435,
  0.18394699692726135,
  0.43980464339256287,
  0.7671583890914917,
  0.5652652382850647,
  0.1661100685596466,
  0.5884326100349426,
  0.771583080291748,
  0.6041407585144043,
  0.13859006762504578,
  0.6324483156204224,
  0.7429617643356323,
  0.6064115762710571,
  0.17934122681617737,
  0.6317155361175537,
  0.7287959456443787,
  0.5947543978691101,
  0.20339563488960266,
  0.6184134483337402,
  0.6092131733894348,
  0.5415561199188232,
  0.3394407331943512,
  0.5381936430931091,
  0.597990870475769,
  0.7580123543739319,
  0.3373066782951355,
  0.7567845582962036,
  0.5374031066894531,
  0.9171151518821716,
  0.38127923011779785,
  0.9137616157531738],
 [0.8158023953437805,
  0.2537412941455841,
  0.3025131821632385,
  0.2524203956127167,
  0.8160037994384766,
  0.40624114871025085,
  0.19701522588729858,
  0.4119282364845276,
  0.7645604014396667,
  0

In [78]:
# dataset 길이 출력

nd = 0
ad = 0
for i in range(len(raw_data)):
    if raw_data[i]['key'] == 0:
        nd += 1
    else:
        ad += 1
print('normal data:', nd, '| abnormal data:', ad)

normal data: 14 | abnormal data: 16


In [82]:
len(seq_list_n)

30