# Configurações Gerais

## Instalação das dependências necessárias

In [1]:
!pip install -q mediapipe tensorflow matplotlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Importação de um modelo de reconhecimento das mãos

In [2]:
!wget -q https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task

In [134]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from google.colab.patches import cv2_imshow
import cv2
import os
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# UTILS

## Delete folder

In [36]:
import shutil

def delete_folder(folder_path):
    """
    Exclui uma pasta e todo o seu conteúdo.

    :param folder_path: O caminho para a pasta que você deseja excluir.
    """
    try:
        shutil.rmtree(folder_path)
        print("Pasta excluída com sucesso!")
    except Exception as e:
        print(f"Ocorreu um erro ao excluir a pasta: {e}")



## Unzip files

In [45]:
import zipfile
import os

def unzip_file(zip_path, extract_to):
    """
    Extrai um arquivo zip para uma pasta específica.

    :param zip_path: O caminho para o arquivo zip que você deseja extrair.
    :param extract_to: O caminho da pasta onde deseja extrair o conteúdo do arquivo zip.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("Arquivo zip extraído com sucesso!")


## Zip Files

In [29]:
import zipfile
import os

def zip_folder(folder_path, output_path):
    """
    Comprime uma pasta para um arquivo zip.

    :param folder_path: O caminho para a pasta que você deseja comprimir.
    :param output_path: O caminho onde o arquivo zip será salvo.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))




## Desenhar landmarks

In [6]:
def draw_landmarks(image, landmarks, color):

  for landmark in landmarks:
    height, width, _ = image.shape
    cx, cy = int(landmark.x * width), int(landmark.y * height)
    cv2.circle(image, (cx, cy), 5, color, cv2.FILLED)

## Formatação dos keyframes


In [68]:
def format_landmarks(landmarks):
  formatted_landmarks = []
  for hands_landmarks in landmarks:
    hand_landmarks = []
    for landmark in hands_landmarks:
      hand_landmarks.append([landmark.x, landmark.y, landmark.z])
  formatted_landmarks.append(np.concatenate(hand_landmarks))

  return np.concatenate(formatted_landmarks)


# Salvar dados (array)

## Importação dos vídeos do drive

In [3]:
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
!unzip './drive/MyDrive/UFMG/Lab4/videos.zip'

## Processamento de vídeo

In [None]:
def show_frame(frame, hands_landmarks):
  colors = [(0, 255, 0), (0, 0, 255)]


  for idx, hand_landmark in enumerate(hands_landmarks):
    draw_landmarks(frame, hand_landmark, colors[idx])

  cv2_imshow(frame)
  cv2.waitKey(0)
  cv2.destroyAllWindows()

In [25]:
def process_video(video_path, label, sequence):
  base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
  options = vision.GestureRecognizerOptions(base_options=base_options, num_hands=2)
  recognizer = vision.GestureRecognizer.create_from_options(options)

  cap = cv2.VideoCapture(video_path)

  video_frames = []
  num_frames = 30
  video_num_frames = 0

  keyframes = []

  while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)

        recognition_result = recognizer.recognize(image)
        npy_path = os.path.join('./np', label, str(sequence), str((video_num_frames)))
        np.save(npy_path, recognition_result.hand_landmarks)
        video_num_frames += 1


        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

  cap.release()
  cv2.destroyAllWindows()
  return keyframes

## Montar base de dados



In [148]:
directory_path = './videos'
directory_files = os.listdir(directory_path)
labels = np.array(['acontecer', 'aluno', 'amarelo', 'america', 'aproveitar']) # directory_files
label_map = {label:num for num, label in enumerate(labels)}
label_map


{'acontecer': 0, 'aluno': 1, 'amarelo': 2, 'america': 3, 'aproveitar': 4}

In [153]:
# Recria a pasta np
folder_name = './np'
delete_folder(folder_name)
os.mkdir(folder_name)

Pasta excluída com sucesso!


In [154]:
for label in labels:
    for sequence in range(5): #trocar isso por uma variável que simboliza o número de vídeos
        try:
            os.makedirs(os.path.join(folder_name, label, str(sequence)))
        except:
            pass

In [155]:
# Popula array do numpy
for idx, folder_path in enumerate(labels):
  base_path = directory_path + '/'+folder_path
  files = os.listdir(base_path)
  for file_index in range(len(files)):
    file_name = files[file_index]
    video_frames = process_video(base_path + '/'+ file_name, folder_path, file_index)

In [156]:
folder_to_zip = './np'
output_zip_path = './np.zip'
zip_folder(folder_to_zip, output_zip_path)
print("Pasta comprimida com sucesso!")

Pasta comprimida com sucesso!


In [157]:
delete_folder(folder_to_zip)

Pasta excluída com sucesso!


# Processar dados salvos (array)

## Unzip pasta com os dados salvos


In [158]:
folder_name = './np'
os.mkdir(folder_name)
unzip_file('./np.zip', folder_name)

Arquivo zip extraído com sucesso!


## Carrega os dados

In [159]:
actions = os.listdir(folder_name)
label_map = {label:num for num, label in enumerate(actions)}
label_map

{'amarelo': 0, 'aproveitar': 1, 'acontecer': 2, 'aluno': 3, 'america': 4}

In [160]:
sequences, labels = [], []
for action in actions:
    actionFrames = []
    no_sequences = len(os.listdir(folder_name + '/'+ action))
    for sequence in range(no_sequences):
        window = []
        sequence_length = len(os.listdir(folder_name + '/'+ action + '/' + str(sequence)))
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(folder_name, action, str(sequence), "{}.npy".format(frame_num)), allow_pickle=True)
            window.append(res)
        actionFrames.append(window)
        labels.append(label_map[action])
    sequences.append(actionFrames)


In [161]:
'''
Nesse momento, sequence é um array com uma posição para cada action (label), e dentro disso todos os vídeos
Ou seja, imagine que temos actions = ["oi", "bom dia"]
E cada action tem 2 vídeos


sequences = [
  [
    [
      [frame1],
      [frame2]
    ],  //video 1 label 1
    [
      [frame1],
      [frame2]
    ],  //video 2 label 1
  ],
  [
    [
      [frame1],
      [frame2]
    ],  //video 1 label 2
    [
      [frame1],
      [frame2] label 2
    ],
  ]
]

'''

'\nNesse momento, sequence é um array com uma posição para cada action (label), e dentro disso todos os vídeos\nOu seja, imagine que temos actions = ["oi", "bom dia"]\nE cada action tem 2 vídeos\n\n\nsequences = [ \n  [ \n    [\n      [frame1],\n      [frame2]\n    ],  //video 1 label 1\n    [\n      [frame1],\n      [frame2]\n    ],  //video 2 label 1\n  ], \n  [\n    [\n      [frame1],\n      [frame2]\n    ],  //video 1 label 2\n    [\n      [frame1],\n      [frame2] label 2\n    ],  \n  ]\n]\n\n'

# LSTM

## Trata os dados para deixar homogêneo

In [162]:
### Rever esta lógica, estou definindo que todos os vídeos terão 30 frames
formatted_sequences = []
num_frames = 30

for sequence in sequences:
  videos = []
  for video in sequence:
    frames = []
    for frame in video:
      if(len(frame) > 0 and len(frames) < num_frames):
        frames.append(format_landmarks(frame))
    videos.append(frames)
  formatted_sequences.append(videos)


In [163]:
formatted_sequences = np.array(formatted_sequences)
actions = np.array(actions)

## Separa os dados de treinamento e validação

In [164]:
videos = np.concatenate(formatted_sequences)
labels = np.array(labels)
y = to_categorical(labels).astype(int)
x_train, x_test, y_train, y_test = train_test_split(videos, y, test_size=0.3)


## Treinamento do modelo

In [169]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(
    x_train.shape[1], #num_frames
    x_train.shape[2]  #num_landmarks em cada frame
)))

model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))
# model.add(LSTM(64))
# model.add(Dense(7, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=300, batch_size=32, validation_data=(x_test, y_test))


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.src.callbacks.History at 0x7e16cb795090>

# Análise dos resultados

## Geração da matriz de confusão

Essa matriz serve para analisar o número de verdadeiros/falsos positivos/negativos

In [172]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

yhat = model.predict(x_test)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)




array([[[5, 0],
        [0, 3]],

       [[6, 0],
        [0, 2]],

       [[7, 0],
        [0, 1]],

       [[7, 0],
        [0, 1]],

       [[7, 0],
        [0, 1]]])

## Cálculo da precisão do modelo

In [173]:
accuracy_score(ytrue, yhat)

1.0

# Salvar o modelo gerado

In [None]:
model.save('libras.h5')

# Testando o modelo

In [135]:
def process_test_video(video_path):
  base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
  options = vision.GestureRecognizerOptions(base_options=base_options, num_hands=2)
  recognizer = vision.GestureRecognizer.create_from_options(options)

  cap = cv2.VideoCapture(video_path)

  video_frames = []
  num_frames = 30

  while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)

        recognition_result = recognizer.recognize(image)
        # show_frame(frame, recognition_result.hand_landmarks)
        if(len(recognition_result.hand_landmarks) > 0 and len(video_frames) < num_frames):
          keypoints = format_landmarks(recognition_result.hand_landmarks)
          video_frames.append(keypoints)


        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

  cap.release()
  cv2.destroyAllWindows()
  return video_frames

In [146]:
new_video = np.array(process_test_video('videos/acontecer/01AcontecerSinalizador07-1.mp4'))
print(np.array([new_video]).shape)

(1, 30, 63)


In [168]:
result = model.predict(np.array([new_video]))

category = actions[np.argmax(result[0])]
print(category)

acontecer
