In [None]:
!pip install opencv-python mediapipe scikit-learn matplotlib

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model): 
    #image = feed frame
    #model = Holistic model
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR -> RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB -> BGR
    return image, results

In [4]:
#connection_drawing_spec 커스텀 (optional)
def draw_styled_landmarks(image, results):
    #Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [5]:
def draw_custom_pose_landmarks(image, results):
    
    pose_landmark_subset = landmark_pb2.NormalizedLandmarkList(
      landmark = [
          #eye keypoints
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_EYE],#0 
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_EYE],#1
          #right arm keypoints
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_WRIST],#2
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_ELBOW],#3
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_SHOULDER],#4 
          #left arm keypoints
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_WRIST],#5 
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_ELBOW],#6
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_SHOULDER],#7
          #body keypoints
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_HIP],#8
          results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_HIP]])#9
    
    #connection = [pose_landmark_subset index1, pose_landmark_subset index2]
    pose_landmark_subset_connections = [
        [2, 3], [3, 4], #right arm
        [5, 6], [6, 7], #left arm
        [4, 7], [4, 9], [8, 9], [7, 8]] #body

    #Draw pose connections
    mp_drawing.draw_landmarks(image, 
                             landmark_list = pose_landmark_subset, 
                             connections = pose_landmark_subset_connections,
                             landmark_drawing_spec = mp_drawing.DrawingSpec(color=(98,129,205), thickness=2, circle_radius=2), 
                             connection_drawing_spec = mp_drawing.DrawingSpec(color=(122,160,255), thickness=2, circle_radius=2)
                             ) 

In [6]:
def draw_custom_hand_landmarks(image, results):
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(149,140,205), thickness=2, circle_radius=2), 
                             mp_drawing.DrawingSpec(color=(185,174,255), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(205,182,141), thickness=2, circle_radius=2), 
                             mp_drawing.DrawingSpec(color=(255,226,176), thickness=2, circle_radius=2)
                             ) 

In [7]:
def draw_custom_face_landmarks(image, results):
    #face landmark subset
    face_landmark_subset = landmark_pb2.NormalizedLandmarkList(
          landmark = [
              #right eyebrow keypoints
              results.face_landmarks.landmark[46],#0 
              results.face_landmarks.landmark[53],#1
              results.face_landmarks.landmark[52],#2
              results.face_landmarks.landmark[65],#3
              results.face_landmarks.landmark[55],#4

              #left eyebrow keypoints
              results.face_landmarks.landmark[285],#5 
              results.face_landmarks.landmark[295],#6
              results.face_landmarks.landmark[282],#7
              results.face_landmarks.landmark[283],#8
              results.face_landmarks.landmark[276],#9

              #lip keypoints
              results.face_landmarks.landmark[61],#10 
              results.face_landmarks.landmark[81],#11
              results.face_landmarks.landmark[13],#12 
              results.face_landmarks.landmark[311],#13
              results.face_landmarks.landmark[291],#14
              results.face_landmarks.landmark[402],#15 
              results.face_landmarks.landmark[14],#16
              results.face_landmarks.landmark[178]])#17
    
    #connection = {face_landmark_subset index1, face_landmark_subset index2}
    face_landmark_subset_connections = [
        [0, 1], [1, 2], [2, 3], [3, 4], #right eybrow
        [5, 6], [6, 7], [7, 8], [8, 9], #left eyebrow
        [10, 11],[11, 12], [12, 13], [13, 14], [14, 15], [15, 16], [16, 17], [17, 10]] #lip
    
    mp_drawing.draw_landmarks(image, 
                             face_landmark_subset, 
                             face_landmark_subset_connections,
                             mp_drawing.DrawingSpec(color=(112,190,205), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(139,236,255), thickness=2, circle_radius=2)
                             ) 

In [8]:
def extract_keypoints_258(results):
    # result의 landmarks의 모든 key point values -> 하나의 numpy array 로 flatten
    #if landmarks has no value, fill numpy array with zero
    
    if results.pose_landmarks: #pose landmarks
        pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() 
    else:
        pose = np.zeros(132) #33*4

    if results.left_hand_landmarks: #left hand landmarks
        lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() 
    else:
        lh = np.zeros(63) #21*3

    if results.right_hand_landmarks: #right hand landmarks
        rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() 
    else:
        rh = np.zeros(63) #21*3
    
    
    #return np.concatenate([pose, lh, rh])
    return np.concatenate([pose, lh, rh])

In [9]:
#pandas
!pip install xlrd
!pip install openpyxl
!pip install pandas

^C








^C


In [9]:
import pandas as pd

In [10]:
# 파일명
file_name = '45.xlsx'

# Daraframe형식으로 엑셀 파일 읽기
df = pd.read_excel(file_name, sheet_name='Sheet1')

list_res = df.loc[:,'단어'].to_list()
print(list_res)

['가슴', '개', '귀', '내일', '누나', '다리', '동생', '뒤', '딸', '머리', '목', '물', '발', '배', '불', '뼈', '선생님', '손', '아기', '아내', '아들', '아빠', '앞', '어제', '엄마', '옆쪽', '오늘', '오른쪽', '왼쪽', '위', '친구', '코', '팔', '학교', 1, 2, 3, 4, 5, 6, 7, 8, 9, 0]


In [10]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('datas') 

#num of videos
no_sequences = 60

#actions = np.array(['가스', '가슴', '갇히다'])
#actions = np.array(list_res)
actions = np.array(['손흥민', '최고', '고양이', '걸어다닙니다', '볼하트'])

# 1 Video = 50 frames
sequence_length = 30

len(actions)

5

In [11]:
##extract keypoints from video
def frame_capture(action, sequence, video):
    cap = cv2.VideoCapture(video)
    
    # Set mediapipe model 
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        frame_num = 0
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if ret == False:
                break
                
            # Make detections
            image, results = mediapipe_detection(frame, holistic)  
            # Draw landmarks
            #draw_styled_landmarks(image, results)
            draw_custom_pose_landmarks(image, results)
            draw_custom_hand_landmarks(image, results)
            draw_custom_face_landmarks(image, results)

            # Show to screen
            cv2.imshow('OpenCV Video', image)

            # export keypoints
            keypoints = extract_keypoints(results) #results of frame -> numpy array
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints) #save numpy array in directory
            
            #increase frame_num
            frame_num = frame_num + 1
                
            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
                
        cap.release()
        cv2.destroyAllWindows()

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard 

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Add, Input, Embedding, GlobalAveragePooling1D, Conv1D, ReLU, MaxPooling1D, LSTM, Dense, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential, Model

#Residual Block 구현
def residual_block(x, filters, kernel_size=3, stride=1):
    # Shortcut Connection
    shortcut = x
    
    # Main Path
    x = Conv1D(filters, kernel_size=kernel_size, strides=stride, padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    x = Conv1D(filters, kernel_size=kernel_size, strides=stride, padding='same')(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    x = Conv1D(filters, kernel_size=kernel_size, strides=stride, padding='same')(x)
    x = BatchNormalization()(x)
    
    # Shortcut Connection 추가
    x = Add()([x, shortcut])
    x = ReLU()(x)
    
    return x


input_layer = Input(shape=(30,258))

x = Conv1D(64, kernel_size=3, activation='relu')(input_layer)
x = BatchNormalization()(x)
x = ReLU()(x)
x = MaxPooling1D(pool_size=2)(x)

# --------------------------------------------------------
x = residual_block(x, filters=64)
x = MaxPooling1D(pool_size=2, padding='same')(x)

x = residual_block(x, filters=64)
x = MaxPooling1D(pool_size=2, padding='same')(x)

x = residual_block(x, filters=64)
x = MaxPooling1D(pool_size=2)(x)

# --------------------------------------------------------

x = Bidirectional(LSTM(64, return_sequences=True, activation='tanh'))(x)
x = Dropout(0.3)(x)
x = Bidirectional(LSTM(128, return_sequences=True, activation='tanh'))(x)
x = Dropout(0.3)(x)
x = Bidirectional(LSTM(64, return_sequences=True, activation='tanh'))(x)
x = Dropout(0.3)(x)
x = Bidirectional(LSTM(64, return_sequences=True, activation='tanh'))(x)
x = Dropout(0.3)(x)
x = LSTM(64, return_sequences=False, activation='relu')(x)


output_layer = Dense(actions.shape[0], activation='softmax')(x)
model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

model.summary()

In [14]:
model.load_weights('ssafy_model_258.h5')

In [15]:
#한글 텍스트 출력
from PIL import ImageFont, ImageDraw, Image

def putKoreanText(src, text, pos, font_size, font_color):
    img_pil = Image.fromarray(src)
    draw = ImageDraw.Draw(img_pil)
    font = ImageFont.truetype('C:/Users/User/ActionDetectionforSignLanguage/fonts/gulim.ttc', font_size)
    draw.text(pos, text, font=font, fill= font_color)
    return np.array(img_pil)

In [16]:
# prediction using video
def custom_video_prediction(video):
    
    # 1. detection variables
    sequence = [] #collect 60 frames to make a sequence(=video)
    sentence = [] #concatenate history of predictions together
    threshold = 0.995

    cap = cv2.VideoCapture(video)
    
    #clear subtitles when {clear_cycle} frames passed without new subtitle added
    frames_without_new_subtitle = 0
    clear_cycle = 300
    
    word = ''
    cnt = 0
    frame_num = 0
    
    # Set mediapipe model 
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read video
            ret, frame = cap.read()
            if ret == False:
                break
                
            frame_num = frame_num+1
            
            if frame_num%2 == 0:
                continue

            
            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            print(results)
            # Draw landmarks
            draw_styled_landmarks(image, results)
            #draw_custom_pose_landmarks(image, results)
            #draw_custom_hand_landmarks(image, results)
            #draw_custom_face_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints_258(results)
            sequence.append(keypoints)
            sequence = sequence[-30:] #generate sequence with last 30 frames
            
            #frames_without_new_subtitle += 1

            if len(sequence) == 30:
                #sequence.shape = (60, 1662)
                #the input shape model expects = (number of sequences, 60, 1662)
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                print(actions[np.argmax(res)])
                
                if word == actions[np.argmax(res)]:
                    cnt = cnt + 1
                else :
                    word = actions[np.argmax(res)]
                    cnt = 1
                    

                #3. Rendering logic
                #ex. res = [0.1, 0.2, 0.7]
                #np.argmax(res) = 2, res[np.argmax(res)] = 0.7
                if res[np.argmax(res)] > threshold and cnt >= 10: 
                    #new subtitle added
                    #frames_without_new_subtitle = 0

                    cur_action_korean = actions[np.argmax(res)]

                    if len(sentence) > 0: 
                        #sentence에 저장된 prediction이 있는 경우 
                        #새로운 prediction인 경우에만 sentence에 추가
                        if cur_action_korean != sentence[-1]:
                            sentence.append(cur_action_korean)
                    else: 
                        #sentence에 저장된 prediction 없는 경우 바로 sentence에 추가
                        sentence.append(cur_action_korean)

                #sentence가 너무 길어지지 않도록 마지막 5개의 prediction만 유지
                if len(sentence) > 5: 
                    sentence = sentence[-5:]
                
                #Clear subtitles if needed
#                if frames_without_new_subtitle >= clear_cycle:
#                    sentence.clear()
                
                #Render subtitles
                cv2.rectangle(image, (0,0), (640, 40), (0, 0, 0), -1) 
#                if target == 'ko':
#                    #putKoreanText(src, text, pos, font_size, font_color
#                    image = putKoreanText(image, ' '.join(sentence), (3,10), 20, (255, 255, 255))
#                else:
#                    cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
                image = putKoreanText(image, ' '.join(sentence), (3,10),(20),(255,255,255))
                # Show to screen
                cv2.imshow('Video Prediction', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
                
    cap.release()
    cv2.destroyAllWindows()

In [25]:
#curDir = "C:/Users/wjdgu/JupyterMain/ActionDetectionforSignLanguage/졸프시연단어"
curDir = "C:/Users/SSAFY/Desktop/tensorflow/ActionDetectionforSignLanguage/validVideos"
for file in os.listdir(curDir):
    if(file.endswith(".mp4")):
        path = os.path.join(curDir, file)
        custom_video_prediction(path)

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

KeyboardInterrupt: 

In [19]:
import cv2
import numpy as np
import mediapipe as mp
import time

# Initialize Mediapipe holistic model
mp_holistic = mp.solutions.holistic

# Function to perform mediapipe detection
def mediapipe_detection_gpt(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

# Function to draw landmarks
def draw_styled_landmarks_gpt(image, results):
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles
    # Draw pose, face, and hand landmarks
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(
            image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
            mp_drawing_styles.get_default_pose_landmarks_style())
    if results.face_landmarks:
        mp_drawing.draw_landmarks(
            image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
            mp_drawing_styles.get_default_face_mesh_tesselation_style(),
            mp_drawing_styles.get_default_face_mesh_contours_style())
    if results.left_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    if results.right_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

# Function to extract keypoints
def extract_keypoints_258_gpt(results):
    keypoints = []
    if results.pose_landmarks:
        keypoints.extend(
            [res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark)
    if results.face_landmarks:
        keypoints.extend(
            [res.x, res.y, res.z] for res in results.face_landmarks.landmark)
    if results.left_hand_landmarks:
        keypoints.extend(
            [res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark)
    if results.right_hand_landmarks:
        keypoints.extend(
            [res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark)
    return np.array(keypoints).flatten()

# Function to detect if hands are covering eyes
def hands_covering_eyes_two(results):
    if results.face_landmarks and results.left_hand_landmarks and results.right_hand_landmarks:
        left_hand = results.left_hand_landmarks.landmark
        right_hand = results.right_hand_landmarks.landmark
        face = results.face_landmarks.landmark
        left_eye = face[159]  # Left eye index in Mediapipe face landmarks
        right_eye = face[386] # Right eye index in Mediapipe face landmarks

        left_hand_near_left_eye = any(
            np.linalg.norm(np.array([hand.x, hand.y]) - np.array([left_eye.x, left_eye.y])) < 0.05
            for hand in left_hand)
        right_hand_near_right_eye = any(
            np.linalg.norm(np.array([hand.x, hand.y]) - np.array([right_eye.x, right_eye.y])) < 0.05
            for hand in right_hand)

        return left_hand_near_left_eye and right_hand_near_right_eye
    return False

def hand_covering_eyes(results):
    if results.face_landmarks:
        face = results.face_landmarks.landmark
        left_eye = face[159]  # Left eye index in Mediapipe face landmarks
        right_eye = face[386] # Right eye index in Mediapipe face landmarks

        if results.left_hand_landmarks:
            left_hand = results.left_hand_landmarks.landmark
            left_hand_near_eyes = any(
                np.linalg.norm(np.array([hand.x, hand.y]) - np.array([left_eye.x, left_eye.y])) < 0.05 or
                np.linalg.norm(np.array([hand.x, hand.y]) - np.array([right_eye.x, right_eye.y])) < 0.05
                for hand in left_hand)
            if left_hand_near_eyes:
                return True

        if results.right_hand_landmarks:
            right_hand = results.right_hand_landmarks.landmark
            right_hand_near_eyes = any(
                np.linalg.norm(np.array([hand.x, hand.y]) - np.array([left_eye.x, left_eye.y])) < 0.05 or
                np.linalg.norm(np.array([hand.x, hand.y]) - np.array([right_eye.x, right_eye.y])) < 0.05
                for hand in right_hand)
            if right_hand_near_eyes:
                return True

    return False

def custom_video_prediction_gpt():
    # 1. detection variables
    sequence = []  # collect 60 frames to make a sequence(=video)
    sentence = []  # concatenate history of predictions together
    threshold = 0.995

    word = ''
    cnt = 0

    cap = cv2.VideoCapture(0)

    # Time tracking variables for toggling landmarks
    start_time = None
    show_landmarks = True
    landmark_toggle_duration = 2  # in seconds

    # Set mediapipe model 
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break

            # 좌우 반전
            frame = cv2.flip(frame, 1)
            
            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            print(results)

            # Check if hands are covering eyes
            if hands_covering_eyes(results):
                if start_time is None:
                    start_time = time.time()
                elif time.time() - start_time >= landmark_toggle_duration:
                    show_landmarks = not show_landmarks
                    start_time = None
            else:
                start_time = None

            # Draw landmarks
            if show_landmarks:
                draw_styled_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints_258(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]  # generate sequence with last 30 frames

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                print(actions[np.argmax(res)])

                if word == actions[np.argmax(res)]:
                    cnt += 1
                else:
                    word = actions[np.argmax(res)]
                    cnt = 1

                # 3. Rendering logic
                if res[np.argmax(res)] > threshold and cnt >= 10:
                    cur_action_korean = actions[np.argmax(res)]

                    if len(sentence) > 0:
                        if cur_action_korean != sentence[-1]:
                            sentence.append(cur_action_korean)
                    else:
                        sentence.append(cur_action_korean)

                if len(sentence) > 1:
                    sentence = sentence[-1:]

                # Render subtitles
                cv2.rectangle(image, (0, 0), (640, 80), (0, 0, 0), -1)
                image = putKoreanText(image, ' '.join(sentence), (3, 10), 50, (255, 255, 255))
                cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

    cap.release()
    cv2.destroyAllWindows()


In [20]:
custom_video_prediction_gpt()

<class 'mediapipe.python.solution_base.SolutionOutputs'>


NameError: name 'hands_covering_eyes' is not defined

# TEST in Realtime

In [21]:
# 1. detection variables
sequence = [] #collect 60 frames to make a sequence(=video)
sentence = [] #concatenate history of predictions together
threshold = 0.995

word = ''
cnt = 0

cap = cv2.VideoCapture(0)

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
        
        frame = cv2.flip(frame, 1)
        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints_258(results)
        sequence.append(keypoints)
        sequence = sequence[-30:] #generate sequence with last 30 frames
        
        if len(sequence) == 30:
            #sequence.shape = (60, 1662)
            #the input shape model expects = (number of sequences, 60, 1662)
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])

            if word == actions[np.argmax(res)]:
                cnt = cnt + 1
            else :
                word = actions[np.argmax(res)]
                cnt = 1


            #3. Rendering logic
            #ex. res = [0.1, 0.2, 0.7]
            #np.argmax(res) = 2, res[np.argmax(res)] = 0.7
            if res[np.argmax(res)] > threshold and cnt >= 10: 
                #new subtitle added
                #frames_without_new_subtitle = 0

                cur_action_korean = actions[np.argmax(res)]

                if len(sentence) > 0: 
                    #sentence에 저장된 prediction이 있는 경우 
                    #새로운 prediction인 경우에만 sentence에 추가
                    if cur_action_korean != sentence[-1]:
                        sentence.append(cur_action_korean)
                else: 
                    #sentence에 저장된 prediction 없는 경우 바로 sentence에 추가
                    sentence.append(cur_action_korean)

            #sentence가 너무 길어지지 않도록 마지막 5개의 prediction만 유지
            if len(sentence) > 1: 
                sentence = sentence[-1:]
                #print(sentence)

            #Clear subtitles if needed
#                if frames_without_new_subtitle >= clear_cycle:
#                    sentence.clear()

            #Render subtitles
            cv2.rectangle(image, (0,0), (640, 80), (0, 0, 0), -1) 
#                if target == 'ko':
#                    #putKoreanText(src, text, pos, font_size, font_color
#                    image = putKoreanText(image, ' '.join(sentence), (3,10), 20, (255, 255, 255))
#                else:
#                    cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
            image = putKoreanText(image, ' '.join(sentence), (3,10),(50),(255,255,255))
            # Show to screen
            cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
            
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti