In [1]:
# Importando o dataframe
# Primeira análise apenas com extroversão
data_path = "../chalearn_dataset"

In [2]:
import pandas as pd

data_training = pd.read_csv(f"{data_path}/train/extraversion_data.csv")
df_training = pd.DataFrame.from_dict(data_training)

df_training.head(2000)

Unnamed: 0.1,Unnamed: 0,extraversion
0,zEyRyTnIw5I.005.mp4,0
1,nskJh7v6v1U.004.mp4,0
2,eHcRre1YsNA.000.mp4,0
3,VuadgOz6T7s.000.mp4,0
4,7nhJXn9PI0I.001.mp4,0
...,...,...
1995,9yZEb6bdxNY.004.mp4,1
1996,dNXqs5HNijI.004.mp4,1
1997,rG8D-A2F8xg.004.mp4,1
1998,F-Dy1EFm_Mw.005.mp4,1


## Implementando um primeiro modelo

EfficientNet B0 -> Transformers -> Classification

In [3]:
IMG_SIZE = 224

In [6]:
# Camadas iniciais do modelo:

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB0

inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)) # Por enquanto o input é apenas uma imagem
# Usa EfficientNet B0 como extratora de características da imagem que iremos processar
features_extraction = EfficientNetB0(include_top=False, weights='imagenet')
features_extraction.trainable = False
features_extraction = features_extraction(inputs)

model = tf.keras.Model(inputs, features_extraction)
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 efficientnetb0 (Functional)  (None, None, None, 1280)  4049571  
                                                                 
Total params: 4,049,571
Trainable params: 0
Non-trainable params: 4,049,571
_________________________________________________________________


In [5]:
# Transformers - Camada final

In [23]:
#Importações necessárias para o funcionamento da Classe
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras import layers

# Extrator de Frames
class FramesExtractor(layers.Layer):
    '''
    data_path = caminho do diretório no qual os vídeos estão inseridos
    number_of_frames = número de frames a ser extraído a cada iteração
    dim = dimensões da imagem no formato (altura, largura)
    '''
    def __init__(self, data_path, number_of_frames=10, dim=(224, 224)):
        super(FramesExtractor, self).__init__()
        self.data_path = data_path
        self.number_of_frames = number_of_frames
        self.dim= dim
        

    def call(self, video_name):
        if video_name != None:
            video_path = self.data_path + str(video_name)
            cap = cv2.VideoCapture(video_path)
            frames = []
            length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            jump_size = int(length // self.number_of_frames)
            for _ in range(self.number_of_frames):
                ret, frame = cap.read()
                frames.append(cv2.cvtColor(cv2.resize(frame, self.dim), cv2.COLOR_BGR2RGB))
                for __ in range(jump_size - 1):
                  cap.grab()
            return np.array(frames)
        else:
            return np.array()
    
#     def show(self, video_name):
#         frames = self.call(video_name)
#         print(f"Número de frames: {frames.shape[0]}\nFormato dos frames: {frames.shape[1:]}")

#         # Plotando o resultado:
#         print("Frames extraídos:")
#         fig = plt.figure(figsize=(10, 10))
#         rows = 4
#         columns = 4
#         for _ in range(self.number_of_frames):
#             fig.add_subplot(rows, columns, _ + 1)
#             plt.imshow(frames[_])

In [110]:
#Importações necessárias para o funcionamento da Classe
import tensorflow as tf
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras import layers

# Extrator de Frames
class FramesExtractor(layers.Layer):
    '''
    data_path = caminho do diretório no qual os vídeos estão inseridos
    number_of_frames = número de frames a ser extraído a cada iteração
    dim = dimensões da imagem no formato (altura, largura)
    '''
    def __init__(self, data_path, number_of_frames=10, dim=(224, 224)):
        super(FramesExtractor, self).__init__()
        self.data_path = data_path
        self.number_of_frames = number_of_frames
        self.dim= dim
        

    '''
    videos_names = vetor com nomes de vídeos a serem precessados
    
    retorna um array com os frames extrapidos
    '''
    def call(self, videos_names):
        batch_size = videos_names.shape[0]
        frames = []
        print(f"BATCH SIZE = {batch_size}")
        for video_name in videos_names:
            video_path = self.data_path + str(video_name)
            cap = cv2.VideoCapture(video_path)
            length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            jump_size = int(length // self.number_of_frames)
            for _ in range(self.number_of_frames):
                ret, frame = cap.read()
                if frame == None:
                    frames.append(np.empty(self.dim, self.dim, 3))
                else:
                    frames.append(cv2.cvtColor(cv2.resize(frame, self.dim), cv2.COLOR_BGR2RGB))
                for __ in range(jump_size - 1):
                  cap.grab()
        return tf.reshape(np.array(frames), [batch_size, self.number_of_frames, self.dim[0], self.dim[1], 3])
    
    def show(self, videos_names):
        frames = self.call(videos_names)
        batch_size = frames.shape[0]
        print(f"Batch Size = {batch_size}")
        print(f"Número de frames: {frames.shape[1]}\nFormato dos frames: {frames.shape[2:]}")

        # Plotando o resultado:
        print("Frames extraídos:")
        for b in range(batch_size):
            fig = plt.figure(figsize=(10, 10))
            rows = 4
            columns = 4
            for _ in range(self.number_of_frames):
                fig.add_subplot(rows, columns, _ + 1)
                plt.imshow(frames[b][_])

In [32]:
class Patches(layers.Layer):
    def __init__(self, patch_dim):
        super(Patches, self).__init__()
        self.patch_dim = patch_dim

    def call(self, tensors):
        batch_size = tf.shape(tensors)[0]
        patches = tf.reshape(tensors, [batch_size, -1, patch_dim])
        print("Patches shape = ", patches.shape)
        return patches

In [11]:
# Já temos um patch projetado linearmente, só precisamos agora fazer o embedding
class Encoder(layers.Layer):
    def __init__(self, num_patches, patch_dim):
        super(Encoder, self).__init__()
        self.num_patches = num_patches
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=patch_dim
        )

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = patch + self.position_embedding(positions)
        return encoded

In [12]:
# def create_classifier():
#     inputs = layers.Input(shape=input_shape)
#     # Augment data.
#     augmented = data_augmentation(inputs)
#     # Create patches.
#     patches = Patches(patch_size)(augmented)
#     # Encode patches.
#     encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

#     # Create multiple layers of the Transformer block.
#     for _ in range(transformer_layers):
#         # Layer normalization 1.
#         x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
#         # Create a multi-head attention layer.
#         attention_output = layers.MultiHeadAttention(
#             num_heads=num_heads, key_dim=projection_dim, dropout=0.1
#         )(x1, x1)
#         # Skip connection 1.
#         x2 = layers.Add()([attention_output, encoded_patches])
#         # Layer normalization 2.
#         x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
#         # MLP.
#         x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
#         # Skip connection 2.
#         encoded_patches = layers.Add()([x3, x2])

#     # Create a [batch_size, projection_dim] tensor.
#     representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
#     representation = layers.Flatten()(representation)
#     representation = layers.Dropout(0.5)(representation)
#     # Add MLP.
#     features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
#     # Classify outputs.
#     logits = layers.Dense(num_classes)(features)
#     # Create the Keras model.
#     model = keras.Model(inputs=inputs, outputs=logits)
#     return model

In [45]:
# Camadas iniciais do modelo:
patch_dim = 64
num_patches = 1280 // patch_dim

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB0

inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)) # Por enquanto o input é apenas uma imagem
# Usa EfficientNet B0 como extratora de características da imagem que iremos processar
features_extraction = EfficientNetB0(include_top=False, weights='imagenet')
features_extraction.trainable = False
features_extraction = features_extraction(inputs)
# Patches
patches = Patches(patch_dim)(features_extraction)

# Encodding
encoded_patches = Encoder(num_patches, patch_dim)(patches)


model = tf.keras.Model(inputs, encoded_patches)
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

Patches shape =  (None, None, 64)


NameError: name 'Encoder' is not defined

In [111]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB0

# Camada de input
input_shape = ()
inputs = layers.Input(shape=input_shape)

# Extrator de frames
video_data_path =f"{data_path}/train/"
frames_extractor = FramesExtractor(video_data_path, number_of_frames=10, dim=(IMG_SIZE, IMG_SIZE))(inputs)
# frames_extractor(inputs)

# Aplica EfficientNetB0 aos frames como extratora de features
# efficientnet = EfficientNetB0(include_top=False, weights='imagenet')
# efficientnet.trainable = False
# features_extractor = layers.TimeDistributed( efficientnet, input_shape = (10, 224, 224, 3))
# features_extractor = features_extractor(frames_extractor)
model = tf.keras.Model(inputs, frames_extractor)
model.summary()

BATCH SIZE = None


TypeError: Exception encountered when calling layer "frames_extractor_23" (type FramesExtractor).

in user code:

    File "/tmp/ipykernel_682342/1876474392.py", line 39, in call  *
        frames.append(np.empty(self.dim, self.dim, 3))

    TypeError: Cannot interpret '224' as a data type


Call arguments received:
  • videos_names=tf.Tensor(shape=(None,), dtype=float32)

In [101]:
teste = np.empty((10, 224, 224, 3))
teste = tf.reshape(teste, (-1, 10, 224, 224, 3))

print(teste.shape)

(1, 10, 224, 224, 3)
