## Importando módulos

In [1]:
import torch
from torch import nn

## Carga de datos
#### Depende de los scripts de run_training, config_parser, data_handler y batcher en la carpeta de utils

In [2]:
from run_training import get_train_elements

In [3]:
config, batcher = get_train_elements()

Checkpoint dir already exists ./training
Use existing config ./training/config.pkl
Loading data
Loading face vertices
Loading templates
Loading raw audio
Process audio
Loading index maps
Initialize data splits
Initialize training, validation, and test indices
sequence data missing FaceTalk_170811_03274_TA - sentence01
sequence data missing FaceTalk_170811_03274_TA - sentence02
sequence data missing FaceTalk_170811_03274_TA - sentence24
sequence data missing FaceTalk_170913_03279_TA - sentence12
sequence data missing FaceTalk_170913_03279_TA - sentence38
sequence data missing FaceTalk_170912_03278_TA - sentence11
sequence data missing FaceTalk_170809_00138_TA - sentence32


## Conversión de datos a tensores
#### Se convierte los tensores a tipo float32 (originalmente float64 o double para pytorch)

In [4]:
import numpy as np

### Data de entrenamiento

In [32]:
processed_audio, face_vertices, face_templates, subject_idx = batcher.get_training_batch(config['batch_size'])

processed_audio = np.expand_dims(processed_audio, -1)
face_vertices = np.expand_dims(face_vertices, -1)
face_templates = np.expand_dims(face_templates, -1)

processed_audio = torch.from_numpy(processed_audio).type(torch.float32)
face_vertices = torch.from_numpy(face_vertices).type(torch.float32)
face_templates = torch.from_numpy(face_templates).type(torch.float32)
subject_idx = torch.from_numpy(subject_idx)

print("processed audio: ", processed_audio.shape, processed_audio.dtype)
print("face vertices: ", face_vertices.shape, face_vertices.dtype)
print("face templates: ", face_templates.shape, face_templates.dtype)
print("subject index: ", subject_idx.shape, subject_idx.dtype)

processed audio:  torch.Size([128, 16, 29, 1]) torch.float32
face vertices:  torch.Size([128, 5023, 3, 1]) torch.float32
face templates:  torch.Size([128, 5023, 3, 1]) torch.float32
subject index:  torch.Size([128]) torch.int64


### Data de validación

In [33]:
num_training_subjects = batcher.get_num_training_subjects()
val_processed_audio, val_face_vertices, val_face_templates, _ = batcher.get_validation_batch(config['batch_size'])

val_processed_audio = np.expand_dims(np.tile(val_processed_audio, (num_training_subjects, 1, 1)), -1)
val_face_vertices = np.expand_dims(np.tile(val_face_vertices, (num_training_subjects, 1, 1)), -1)
val_face_templates = np.expand_dims(np.tile(val_face_templates, (num_training_subjects, 1, 1)), -1)

val_processed_audio = torch.from_numpy(val_processed_audio).type(torch.float32)
val_face_vertices = torch.from_numpy(val_face_vertices).type(torch.float32)
val_face_templates = torch.from_numpy(val_face_templates).type(torch.float32)

print("processed audio: ", val_processed_audio.shape, val_processed_audio.dtype)
print("face vertices: ", val_face_vertices.shape, val_face_vertices.dtype)
print("face templates: ", val_face_templates.shape, val_face_templates.dtype)

processed audio:  torch.Size([1024, 16, 29, 1]) torch.float32
face vertices:  torch.Size([1024, 5023, 3, 1]) torch.float32
face templates:  torch.Size([1024, 5023, 3, 1]) torch.float32


#### Convertir las condiciones de sujetos a representaciones One Hot

In [7]:
condition = nn.functional.one_hot(subject_idx, batcher.get_num_training_subjects())
print(condition.shape)

torch.Size([128, 8])


In [8]:
val_condition = np.reshape(np.repeat(np.arange(num_training_subjects)[:,np.newaxis],
                repeats=config['num_consecutive_frames']*config['batch_size'], axis=-1), [-1,])
val_condition = torch.from_numpy(val_condition)
val_condition = nn.functional.one_hot(val_condition, batcher.get_num_training_subjects())
print(val_condition.shape)

torch.Size([1024, 8])


## Speech Encoder

### Inicialización de pesos y bias
#### Debido a que Pytorch no posee una implementación para inicializar los pesos con un muestreo truncado de la distribución normal, se ha utilizado la implementación extraída del foro en este enlace: https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15

In [9]:
from utils.init_trunc_norm import truncated_normal_

#### Se creó una clase personalizada para las capas convolucionales y fully connected para implementar la inicialización de pesos por capa.

In [10]:
class FCLayer(nn.Module):
    def __init__(self, in_units, out_units, init_weights=None, weightini=0.1, bias=0.0):
        super().__init__()
        self.layer = nn.Linear(in_units, out_units)

        # inicialización de pesos
        if init_weights is not None:
            self.layer.weight.data = init_weights
        elif weightini == 0.0:
            nn.init.constant_(self.layer.weight, weightini)
        else:
            #nn.init.normal_(self.layer.weight, std=weightini)
            self.layer.weight.data = truncated_normal_(self.layer.weight.data, std=weightini)
        
        # inicialización de bias
        nn.init.constant_(self.layer.bias, bias)
    
    def forward(self, x):
        return self.layer(x)

In [11]:
class CustomConv2d(nn.Module):
    def __init__(self, in_ch, out_ch, k_size, stride=(0,0), padding=(0,0), std_dev=0.02):
        super().__init__()
        self.conv_layer = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=k_size, stride=stride, padding=padding)

        # inicialización de pesos y bias
        #nn.init.normal_(self.conv_layer.weight, std=std_dev)
        self.conv_layer.weight.data = truncated_normal_(self.conv_layer.weight.data, std=std_dev)
        #nn.init.normal_(self.conv_layer.bias, std=std_dev)
        self.conv_layer.bias.data = truncated_normal_(self.conv_layer.bias.data, std=std_dev)
    
    def forward(self, x):
        return self.conv_layer(x)

#### En este bloque de código se utiliza una capa de Batch Normalization. En la versión en Tensorflow, los autores introducen el tensor de dimensiones $[N,16,29,1]$, siendo que Tensorflow espera uno de la forma $[N,H,W,C]$. Por esto se crea la capa respectiva con "num_features = 1".
#### Sin embargo, se probó que no hay diferencia entre utilizar el tensor de dimensiones $[N,16,1,29]$ con "num_features = 29", teniendo que intercambiar las dimensiones respectivas.
#### También es necesario mencionar que para migrar la capa de Batch Normalization de Tensorflow a Pytorch, es necesario calcular el momentum (Pytorch) a partir del decay (Tensorflow), siendo la fórmula $momentum = 1 - decay$ según el foro: https://discuss.pytorch.org/t/convering-a-batch-normalization-layer-from-tf-to-pytorch/20407
#### Se utiliza padding de $(1,0)$ ya que a diferencia de Tensorflow donde el padding se calcula si se utiliza "SAME", en Pytorch se debe especificar.

In [28]:
speech_encoding_dim = config['expression_dim']
condition_speech_features = config['condition_speech_features']
speech_encoder_size_factor = config['speech_encoder_size_factor']

batch_norm = nn.BatchNorm2d(num_features=29, eps=1e-5, momentum=0.1)

time_convs = nn.Sequential(
            CustomConv2d(in_ch=37, out_ch=32, k_size=(3,1), stride=(2,1), padding=(1,0)),
            nn.ReLU(), # [128, 32, 8, 1]
            CustomConv2d(in_ch=32, out_ch=32, k_size=(3,1), stride=(2,1), padding=(1,0)),
            nn.ReLU(), # [128, 32, 4, 1]
            CustomConv2d(in_ch=32, out_ch=64, k_size=(3,1), stride=(2,1), padding=(1,0)),
            nn.ReLU(), # [128, 64, 2, 1]
            CustomConv2d(in_ch=64, out_ch=64, k_size=(3,1), stride=(2,1), padding=(1,0)),
            nn.ReLU() # [128, 64, 1, 1]
        )

flatten = nn.Flatten()

fc_layers = nn.Sequential(
            FCLayer(72, 128),
            nn.Tanh(),
            FCLayer(128, speech_encoding_dim)
        )

#### Debido a que Batch Normalization en Pytorch espera un tensor de la forma $[N, C, H, W]$ es necesario cambiar las dimensiones del tensor original de $[N, H, W, C]$

#### Si se usa num_features = 1 en la capa de BatchNorm, utilizar:

In [13]:
processed_audio = processed_audio.permute(0,3,1,2)
print("processed audio: ", processed_audio.shape)

processed audio:  torch.Size([128, 1, 16, 29])


In [14]:
val_processed_audio = val_processed_audio.permute(0,3,1,2)
print("processed audio: ", val_processed_audio.shape)

processed audio:  torch.Size([1024, 1, 16, 29])


#### Si se usa num_features = 29 en la capa de BatchNorm, utilizar:

In [34]:
processed_audio = processed_audio.permute(0,2,1,3)
print("processed audio: ", processed_audio.shape)

processed audio:  torch.Size([128, 29, 16, 1])


In [35]:
val_processed_audio = val_processed_audio.permute(0,2,1,3)
print("processed audio: ", val_processed_audio.shape)

processed audio:  torch.Size([1024, 29, 16, 1])


### Procesamiento en la Capa de Batch Normalization

#### Si se usa num_features = 1 en la capa de BatchNorm, utilizar el sgte. bloque de código:
#### Debido a que las transformaciones intermedias se hacen a partir del código en Tensorflow, es necesario volver a transformar el tensor a la forma $[N,C,H,W]$ al final del proceso para poder introducir el dato a las capas convolucionales.

In [15]:
features_norm = batch_norm(processed_audio)
print("features norm: ", features_norm.shape)

# Regresar a la forma [N,H,W,C] (tensorflow)  el dato
features_norm = features_norm.permute(0, 2, 3, 1)
print("features: ", features_norm.shape)

speech_features_reshaped = torch.reshape(features_norm, (-1, features_norm.shape[1], 1, features_norm.shape[2]))
print("features reshaped: ", speech_features_reshaped.shape)

# función equivalente en pytorch a tf.transpose en tensores de n-dimensiones
speech_feature_condition = torch.reshape(condition, (-1, condition.shape[1], 1, 1)).permute(0, 2, 3, 1)
print("feature condition: ", speech_feature_condition.shape)

speech_feature_condition = torch.tile(speech_feature_condition, (1, features_norm.shape[1], 1, 1))
print("feature condition: ", speech_feature_condition.shape)

# Condicionamiento entre los sujetos de entrenamiento
speech_features_reshaped = torch.cat((speech_features_reshaped, speech_feature_condition), dim=-1)
print("features reshaped: ", speech_features_reshaped.shape)

# transformar el tensor a la forma de pytorch [N, C, H, W]
speech_features_reshaped = speech_features_reshaped.permute(0, 3, 1, 2)
print("features reshaped: ", speech_features_reshaped.shape)

features norm:  torch.Size([128, 1, 16, 29])
features:  torch.Size([128, 16, 29, 1])
features reshaped:  torch.Size([128, 16, 1, 29])
feature condition:  torch.Size([128, 1, 1, 8])
feature condition:  torch.Size([128, 16, 1, 8])
features reshaped:  torch.Size([128, 16, 1, 37])
features reshaped:  torch.Size([128, 37, 16, 1])


In [16]:
val_features_norm = batch_norm(val_processed_audio)
print("features norm: ", val_features_norm.shape)

val_features_norm = val_features_norm.permute(0, 2, 3, 1)
print("features: ", val_features_norm.shape)

val_speech_features_reshaped = torch.reshape(val_features_norm, (-1, val_features_norm.shape[1], 1, val_features_norm.shape[2]))
print("features reshaped: ", val_speech_features_reshaped.shape)

# función equivalente en pytorch a tf.transpose en tensores de n-dimensiones
val_speech_feature_condition = torch.reshape(val_condition, (-1, val_condition.shape[1], 1, 1)).permute(0, 2, 3, 1)
print("feature condition: ", val_speech_feature_condition.shape)

val_speech_feature_condition = torch.tile(val_speech_feature_condition, (1, val_features_norm.shape[1], 1, 1))
print("feature condition: ", val_speech_feature_condition.shape)

# Condicionamiento entre los sujetos de validación
val_speech_features_reshaped = torch.cat((val_speech_features_reshaped, val_speech_feature_condition), dim=-1)
print("features reshaped: ", val_speech_features_reshaped.shape)

val_speech_features_reshaped = val_speech_features_reshaped.permute(0, 3, 1, 2)
print("features reshaped: ", speech_features_reshaped.shape)

features norm:  torch.Size([1024, 1, 16, 29])
features:  torch.Size([1024, 16, 29, 1])
features reshaped:  torch.Size([1024, 16, 1, 29])
feature condition:  torch.Size([1024, 1, 1, 8])
feature condition:  torch.Size([1024, 16, 1, 8])
features reshaped:  torch.Size([1024, 16, 1, 37])
features reshaped:  torch.Size([128, 37, 16, 1])


#### Si se usa num_features = 29 en la capa de BatchNorm, utilizar el sgte. bloque de código:

In [36]:
features_norm = batch_norm(processed_audio)
print("features norm: ", features_norm.shape)

speech_feature_condition = torch.reshape(condition, (-1, condition.shape[1], 1, 1))
print("feature condition: ", speech_feature_condition.shape)

speech_feature_condition = torch.tile(speech_feature_condition, (1, 1, features_norm.shape[2], 1))
print("feature condition: ", speech_feature_condition.shape)

# Condicionamiento entre los sujetos de entrenamiento
speech_features_reshaped = torch.cat((features_norm, speech_feature_condition), dim=1)
print("features reshaped: ", speech_features_reshaped.shape)

features norm:  torch.Size([128, 29, 16, 1])
feature condition:  torch.Size([128, 8, 1, 1])
feature condition:  torch.Size([128, 8, 16, 1])
features reshaped:  torch.Size([128, 37, 16, 1])


In [37]:
val_features_norm = batch_norm(val_processed_audio)
print("features norm: ", val_features_norm.shape)

val_speech_feature_condition = torch.reshape(val_condition, (-1, val_condition.shape[1], 1, 1))
print("feature condition: ", val_speech_feature_condition.shape)

val_speech_feature_condition = torch.tile(val_speech_feature_condition, (1, 1, val_features_norm.shape[2], 1))
print("feature condition: ", speech_feature_condition.shape)

# Condicionamiento entre los sujetos de validación
val_speech_features_reshaped = torch.cat((val_features_norm, val_speech_feature_condition), dim=1)
print("features reshaped: ", speech_features_reshaped.shape)

features norm:  torch.Size([1024, 29, 16, 1])
feature condition:  torch.Size([1024, 8, 1, 1])
feature condition:  torch.Size([128, 8, 16, 1])
features reshaped:  torch.Size([128, 37, 16, 1])


### Procesamiento en las Capas Convolucionales
#### Dado que, independiente de la variable "num_features" utilizada en la capa de BatchNorm, la entrada que utiliza las capas convolucionales siguen la forma $[N,C,H,W]$, no es necesario realizar alguna transformación adicional para los siguientes pasos ya que en ambos valores de la variable, la entrada es $[N,37,16,1]$.

In [38]:
features = time_convs(speech_features_reshaped)
print("after convs: ", features.shape)
features_flat = flatten(features)
print("flatten: ", features_flat.shape)

after convs:  torch.Size([128, 64, 1, 1])
flatten:  torch.Size([128, 64])


In [39]:
val_features = time_convs(val_speech_features_reshaped)
print("after convs: ", val_features.shape)
val_features_flat = flatten(val_features)
print("flatten: ", val_features_flat.shape)

after convs:  torch.Size([1024, 64, 1, 1])
flatten:  torch.Size([1024, 64])


### Conversión de los Datos a un Espacio Latente

In [40]:
# Condicionamiento entre los sujetos de entrenamiento
concatenated = torch.cat((features_flat, condition), dim=1)
print("concat: ", concatenated.shape)
fc_result = fc_layers(concatenated)
print("fc result: ", fc_result.shape)

concat:  torch.Size([128, 72])
fc result:  torch.Size([128, 50])


In [41]:
# Condicionamiento entre los sujetos de validación
val_concatenated = torch.cat((val_features_flat, val_condition), dim=1)
print("concat: ", val_concatenated.shape)
val_fc_result = fc_layers(val_concatenated)
print("fc result: ", val_fc_result.shape)

concat:  torch.Size([1024, 72])
fc result:  torch.Size([1024, 50])


## Speech Decoder

In [42]:
expression_basis_fname = config['expression_basis_fname']
init_expression = config['init_expression']

num_vertices = config['num_vertices']
expression_dim = config['expression_dim']

#### Los autores inicializan los pesos a partir de un archivo que brindan en su carpeta de datos.

In [43]:
init_exp_basis = np.zeros((3*num_vertices, expression_dim))

if init_expression:
    init_exp_basis[:, :min(expression_dim, 100)] = np.load(expression_basis_fname)[:, :min(expression_dim, 100)]

init_exp_basis = torch.from_numpy(init_exp_basis).type(torch.float32)
print(init_exp_basis.shape)

torch.Size([15069, 50])


In [44]:
decoder = FCLayer(in_units=expression_dim, out_units=3*num_vertices, init_weights=init_exp_basis)

In [45]:
exp_offset = decoder(fc_result)
print(exp_offset.shape)
exp_offset = torch.reshape(exp_offset, (-1, num_vertices, 3, 1))
print(exp_offset.shape)

torch.Size([128, 15069])
torch.Size([128, 5023, 3, 1])


In [46]:
val_exp_offset = decoder(val_fc_result)
print(val_exp_offset.shape)
val_exp_offset = torch.reshape(val_exp_offset, (-1, num_vertices, 3, 1))
print(val_exp_offset.shape)

torch.Size([1024, 15069])
torch.Size([1024, 5023, 3, 1])


#### Como el resultado de la red es las distancias de traslaciones de los vértices y no los mismos vértices trasladados, es necesario sumarlo al modelo 3D base (Operación de Traslación en Modelos 3D)

In [47]:
predicted = exp_offset + face_templates
print(predicted.shape)

torch.Size([128, 5023, 3, 1])


In [48]:
val_predicted = val_exp_offset + val_face_templates
print(val_predicted.shape)

torch.Size([1024, 5023, 3, 1])
