In [1]:
import sys
import os

def enter_project_root(marker="src"):
    """Make the project root the working directory, idempotently.

    The notebook is expected to live one level below the project root, and
    the root is recognised by containing a ``marker`` directory (``src`` by
    default). If the current working directory already contains ``marker``
    it is treated as the root and nothing is moved, so re-running this cell
    (or starting the kernel from the root) no longer climbs one directory
    higher on every execution.

    Side effects: appends ``<root>/<marker>`` to ``sys.path`` once and
    changes the process working directory to ``<root>`` when needed.

    Returns:
        The absolute path of the directory used as project root.
    """
    cwd = os.path.abspath(os.getcwd())
    # Already at the root (cell re-run, or kernel launched from the root)?
    root = cwd if os.path.isdir(os.path.join(cwd, marker)) else os.path.dirname(cwd)

    src_dir = os.path.join(root, marker)
    if src_dir not in sys.path:  # avoid stacking duplicates on re-runs
        sys.path.append(src_dir)

    if root != cwd:
        os.chdir(root)
    return root


enter_project_root()

import torch

from src.mslm.utils.setup_train import setup_paths
from src.mslm.utils import create_dataloaders, build_model, run_training, prepare_datasets, ConfigLoader

# Training

In [2]:
from torch.utils.data import Subset

# Number of leading items kept from each split for the quick smoke test below.
DEBUG_SUBSET_SIZE = 2

_, _, h5_file = setup_paths()
device = "cuda" if torch.cuda.is_available() else "cpu"

model_parameters = ConfigLoader("config/model/config.toml").load_config()

# Resolve the device once: a missing key or the literal "auto" both mean
# "use the device detected above"; any other configured value is kept as is.
configured_device = model_parameters.get("device", device)
resolved_device = device if configured_device == "auto" else configured_device

model_parameters.update({
    "device": resolved_device,
    "input_size": 250 * 2,
    "output_size": 3072,
})

# --- training parameters (notebook-level overrides) ---
# A falsy value (0 / None) defers to the TOML file, then to the hard-coded
# fallback used in the update below.
train_ratio = 0.8
epochs = 10
batch_size = 2
checkpoint_interval = 5
log_interval = 2

# --- training config ---
train_config = ConfigLoader("config/training/train_config.toml").load_config()
# Unlike the overrides above, the split ratio from the file wins over the notebook value.
train_ratio = train_config.get("train_ratio", train_ratio)
train_config.update({
    "model_version": 1000,
    "learning_rate": train_config.get("learning_rate", 0.00238),
    "epochs": epochs or train_config.get("epochs", 100),
    "batch_size": batch_size or train_config.get("batch_size", 32),
    "checkpoint_interval": checkpoint_interval or train_config.get("checkpoint_interval", 5),
    "log_interval": log_interval or train_config.get("log_interval", 2),
    "train_ratio": train_ratio,
    "validation_ratio": round(1 - train_ratio, 2),
    # Same device as the model; it was already resolved above, so no second "auto" check.
    "device": resolved_device,
})

tr_ds, val_ds = prepare_datasets(h5_file, train_ratio)

# Keep only the first few items of each dataset for quick tests.
train_subset = Subset(tr_ds, list(range(DEBUG_SUBSET_SIZE)))
val_subset = Subset(val_ds, list(range(DEBUG_SUBSET_SIZE)))

# Use the resolved batch size so the loaders always agree with train_config,
# including when the notebook-level override is falsy and the file value applies.
tr_dl, val_dl = create_dataloaders(
    train_subset,
    val_subset,
    train_config["batch_size"],
    num_workers=4,
)

Train size:	10191
Validation size:	2547


In [3]:
# Sanity check: show the shape of the third element of every training batch.
for batch in tr_dl:
    third_item = batch[2]
    print(third_item.shape)

torch.Size([2, 4, 3072])


In [4]:
# Sanity check: show the shapes of the first two elements of each un-batched sample.
for sample in train_subset:
    first_item, second_item = sample[0], sample[1]
    print(first_item.shape, second_item.shape)

torch.Size([88, 250, 2]) torch.Size([3, 3072])
torch.Size([118, 250, 2]) torch.Size([4, 3072])


In [5]:
# Build the model, passing every entry of `model_parameters` (loaded from the
# model TOML and extended with device / input_size / output_size) as a keyword argument.
model = build_model(**model_parameters)



OptimizedModule(
  (_orig_mod): Imitator(
    (linear_feat): Sequential(
      (0): Linear(in_features=500, out_features=512, bias=True)
      (1): GELU(approximate='none')
      (2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (3): Linear(in_features=512, out_features=256, bias=True)
      (4): GELU(approximate='none')
      (5): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (act1): GELU(approximate='none')
    (conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
    (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (act2): GELU(approximate='none')
    (linear_hidden): Linear(in_features=256, out_features=512, bias=True)
    (pe): PositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x Transfor

In [6]:
# Override the epoch count configured earlier for this run, then start training.
train_config["epochs"] = 200
run_training(train_config, tr_dl, val_dl, model)

cuda
Starting training...
LR: 0.0001


Entrenando:   0%|[32m          [0m| 0/200 [00:00<?, ?it/s]W0626 01:09:17.041000 1251143 site-packages/torch/_logging/_internal.py:1089] [7/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
Entrenando:   0%|[32m          [0m| 0/200 [00:06<?, ?it/s]


Epoch: 0.	 Total loss: 2.811551809310913


Entrenando:   0%|[32m          [0m| 1/200 [00:08<29:06,  8.78s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].par

Validation loss: 3.216034173965454


('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].param_groups[0]['params'][19].grad", "L['self'].param_groups[0]['params


Epoch: 2.	 Total loss: 2.756175994873047
Validation loss: 3.1631062030792236


Entrenando:   2%|[32m▏         [0m| 4/200 [00:14<08:48,  2.70s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].par


Epoch: 4.	 Total loss: 2.437065839767456
Validation loss: 3.034444570541382


('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].param_groups[0]['params'][19].grad", "L['self'].param_groups[0]['params


Epoch: 6.	 Total loss: 1.9045217037200928
Validation loss: 3.114173412322998


Entrenando:   4%|[32m▍         [0m| 8/200 [00:21<06:11,  1.93s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].par


Epoch: 8.	 Total loss: 1.5855319499969482
Validation loss: 2.673637866973877


Entrenando:   5%|[32m▌         [0m| 10/200 [00:24<05:00,  1.58s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 10.	 Total loss: 1.5780181884765625


Entrenando:   6%|[32m▌         [0m| 11/200 [00:26<05:39,  1.80s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa

Validation loss: 2.5508744716644287


Entrenando:   6%|[32m▌         [0m| 12/200 [00:27<05:07,  1.64s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 12.	 Total loss: 1.4161946773529053
Validation loss: 2.5389771461486816


Entrenando:   7%|[32m▋         [0m| 14/200 [00:30<04:30,  1.45s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 14.	 Total loss: 1.354885220527649
Validation loss: 2.4540462493896484


Entrenando:   8%|[32m▊         [0m| 16/200 [00:34<05:00,  1.63s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 16.	 Total loss: 1.3200551271438599
Validation loss: 2.431151866912842


Entrenando:   9%|[32m▉         [0m| 18/200 [00:36<04:25,  1.46s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 18.	 Total loss: 1.2759894132614136
Validation loss: 2.424123764038086


Entrenando:  10%|[32m█         [0m| 20/200 [00:39<04:33,  1.52s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 20.	 Total loss: 1.422791600227356


Entrenando:  10%|[32m█         [0m| 21/200 [00:41<04:53,  1.64s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa

Validation loss: 2.4473376274108887


Entrenando:  11%|[32m█         [0m| 22/200 [00:42<04:32,  1.53s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 22.	 Total loss: 1.4753053188323975
Validation loss: 2.6073477268218994


Entrenando:  12%|[32m█▏        [0m| 24/200 [00:45<04:06,  1.40s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 24.	 Total loss: 1.2055549621582031
Validation loss: 2.560199499130249


Entrenando:  13%|[32m█▎        [0m| 26/200 [00:49<04:46,  1.64s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 26.	 Total loss: 1.1416780948638916
Validation loss: 2.7062184810638428


Entrenando:  14%|[32m█▍        [0m| 28/200 [00:51<04:09,  1.45s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 28.	 Total loss: 1.2166639566421509
Validation loss: 2.6324667930603027


Entrenando:  15%|[32m█▌        [0m| 30/200 [00:54<03:51,  1.36s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 30.	 Total loss: 1.00675368309021


Entrenando:  16%|[32m█▌        [0m| 31/200 [00:56<04:18,  1.53s/it]

Validation loss: 2.6211204528808594


('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].param_groups[0]['params'][19].grad", "L['self'].param_groups[0]['params


Epoch: 32.	 Total loss: 0.9383493065834045
Validation loss: 2.6707041263580322


Entrenando:  17%|[32m█▋        [0m| 34/200 [01:00<04:04,  1.47s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 34.	 Total loss: 1.0617785453796387
Validation loss: 2.717081308364868


Entrenando:  18%|[32m█▊        [0m| 36/200 [01:03<04:15,  1.56s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 36.	 Total loss: 0.9397204518318176
Validation loss: 2.8087923526763916


Entrenando:  19%|[32m█▉        [0m| 38/200 [01:06<03:48,  1.41s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 38.	 Total loss: 0.6182915568351746
Validation loss: 2.8268017768859863


Entrenando:  20%|[32m██        [0m| 40/200 [01:09<04:05,  1.53s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 40.	 Total loss: 0.7503646016120911


Entrenando:  20%|[32m██        [0m| 41/200 [01:11<04:21,  1.64s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa

Validation loss: 2.985948324203491


Entrenando:  21%|[32m██        [0m| 42/200 [01:13<04:03,  1.54s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 42.	 Total loss: 0.42483845353126526
Validation loss: 2.7881033420562744


Entrenando:  22%|[32m██▏       [0m| 44/200 [01:15<03:40,  1.42s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 44.	 Total loss: 0.39458152651786804
Validation loss: 2.9418280124664307


Entrenando:  23%|[32m██▎       [0m| 46/200 [01:18<03:55,  1.53s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 46.	 Total loss: 0.32970401644706726
Validation loss: 3.051959991455078


Entrenando:  24%|[32m██▍       [0m| 48/200 [01:22<04:03,  1.60s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 48.	 Total loss: 0.26353052258491516
Validation loss: 2.8342785835266113


Entrenando:  25%|[32m██▌       [0m| 50/200 [01:24<03:37,  1.45s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 50.	 Total loss: 0.24807588756084442


Entrenando:  26%|[32m██▌       [0m| 51/200 [01:26<03:55,  1.58s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa

Validation loss: 2.9387919902801514


Entrenando:  26%|[32m██▌       [0m| 52/200 [01:28<03:40,  1.49s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 52.	 Total loss: 0.20441143214702606
Validation loss: 3.085800886154175


Entrenando:  27%|[32m██▋       [0m| 54/200 [01:30<03:22,  1.38s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 54.	 Total loss: 0.1526268869638443
Validation loss: 2.9975714683532715


Entrenando:  28%|[32m██▊       [0m| 56/200 [01:33<03:38,  1.52s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 56.	 Total loss: 0.14306724071502686
Validation loss: 2.9275381565093994


Entrenando:  29%|[32m██▉       [0m| 58/200 [01:37<03:54,  1.65s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 58.	 Total loss: 0.5850251317024231
Validation loss: 3.0370869636535645


Entrenando:  30%|[32m███       [0m| 60/200 [01:40<03:24,  1.46s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 60.	 Total loss: 0.130988210439682


Entrenando:  30%|[32m███       [0m| 61/200 [01:42<03:42,  1.60s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa

Validation loss: 3.032902240753174


Entrenando:  31%|[32m███       [0m| 62/200 [01:43<03:27,  1.51s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 62.	 Total loss: 0.12893785536289215
Validation loss: 2.9384260177612305


Entrenando:  32%|[32m███▏      [0m| 64/200 [01:46<03:09,  1.39s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 64.	 Total loss: 0.13449157774448395
Validation loss: 3.015401840209961


Entrenando:  33%|[32m███▎      [0m| 66/200 [01:49<03:25,  1.53s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 66.	 Total loss: 0.3706081807613373
Validation loss: 3.116382360458374


Entrenando:  34%|[32m███▍      [0m| 68/200 [01:51<03:05,  1.41s/it]('Grad tensors ["L['self'].param_groups[0]['params'][0].grad", "L['self'].param_groups[0]['params'][1].grad", "L['self'].param_groups[0]['params'][2].grad", "L['self'].param_groups[0]['params'][3].grad", "L['self'].param_groups[0]['params'][4].grad", "L['self'].param_groups[0]['params'][5].grad", "L['self'].param_groups[0]['params'][6].grad", "L['self'].param_groups[0]['params'][7].grad", "L['self'].param_groups[0]['params'][8].grad", "L['self'].param_groups[0]['params'][9].grad", "L['self'].param_groups[0]['params'][10].grad", "L['self'].param_groups[0]['params'][11].grad", "L['self'].param_groups[0]['params'][12].grad", "L['self'].param_groups[0]['params'][13].grad", "L['self'].param_groups[0]['params'][14].grad", "L['self'].param_groups[0]['params'][15].grad", "L['self'].param_groups[0]['params'][16].grad", "L['self'].param_groups[0]['params'][17].grad", "L['self'].param_groups[0]['params'][18].grad", "L['self'].pa


Epoch: 68.	 Total loss: 0.7478004097938538


Entrenando:  34%|[32m███▍      [0m| 68/200 [01:59<03:51,  1.75s/it]


RuntimeError: DataLoader worker (pid(s) 1251237, 1251238, 1251239, 1251240) exited unexpectedly

In [8]:
# Smoke test: run only the first validation batch through the model in
# evaluation mode and report the tensor shapes involved.
# NOTE(review): the output recorded for this cell is a
# "DataLoader worker ... exited unexpectedly" RuntimeError; presumably the
# loader's worker processes must be recreated before this can run — confirm.
model.eval()
with torch.no_grad():
    # Pull a single batch instead of looping and breaking; None means the
    # loader produced nothing, in which case there is nothing to report.
    batch = next(iter(val_dl), None)
    if batch is not None:
        inputs, mask, targets, _ = batch
        # Move the three tensors used below onto the compute device.
        inputs, mask, targets = (t.to(device) for t in (inputs, mask, targets))
        outputs = model(inputs, mask)
        print(f"Inputs shape: {inputs.shape}, Outputs shape: {outputs.shape}, Targets shape: {targets.shape}")

RuntimeError: DataLoader worker (pid(s) 1251237, 1251238, 1251239, 1251240) exited unexpectedly

In [None]:
import torch.nn as nn

# Predictions and targets may differ in length along dim 1, so compare only
# the prefix both of them cover.
L_common = min(outputs.shape[1], targets.shape[1])
pred_embs, target_embs, embedding_mask = (
    t[:, :L_common] for t in (outputs, targets, mask)
)
# NOTE(review): embedding_mask is sliced here but is not passed to the loss
# below, so every position in the common prefix contributes to the mean —
# confirm whether masked positions were meant to be excluded.

# Plain mean-squared error between the truncated predictions and targets.
loss_fn = nn.MSELoss()
loss = loss_fn(pred_embs, target_embs)
print("MSE Loss:", loss.item())


MSE Loss: 2.086271286010742


In [None]:
# Display the target embeddings after truncation to the common length L_common.
target_embs

tensor([[[ 0.0897, -0.3952, -0.1157,  ...,  0.4509,  0.3203,  0.3035],
         [ 0.9322, -1.1671, -0.2401,  ...,  0.9682, -0.9013,  0.0350],
         [ 0.1231, -0.4041, -0.9660,  ..., -0.4154, -2.0379,  1.2492],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0804, -0.4052, -0.1248,  ...,  0.4500,  0.3122,  0.2966],
         [ 0.0402, -2.1052,  1.2279,  ...,  1.7971, -0.5469, -0.7472],
         [-1.6225, -1.8591, -0.6591,  ..., -1.4079, -0.5445, -0.8075],
         ...,
         [-1.2914,  1.0705,  1.3059,  ..., -0.6800, -0.3671,  2.2190],
         [-0.8618,  1.6748,  2.6077,  ..., -0.8423,  0.7961,  1.5343],
         [ 0.2764, -1.0429, -2.7705,  ...,  0.1155, -0.4705, -0.9378]]],
       device='cuda:0')

In [None]:
# Display the model predictions after truncation to the common length L_common.
pred_embs

tensor([[[-0.1389, -0.5081, -0.3362,  ...,  0.3684,  0.2839,  0.4170],
         [-2.2625, -0.1232,  1.0554,  ...,  0.7423, -1.6623, -0.2631],
         [-0.7219,  1.0109,  0.8119,  ..., -0.0616, -3.9642, -1.5129],
         ...,
         [-1.9372, -0.5624,  0.1327,  ...,  0.5759, -2.5311, -0.8052],
         [-0.2607, -0.8605,  1.0719,  ...,  0.3578, -0.7986, -0.0767],
         [-1.3852, -0.7967,  0.2694,  ...,  0.5442, -1.5376, -1.2053]],

        [[-0.1372, -0.6028, -0.4260,  ...,  0.2968,  0.3841,  0.3483],
         [-2.1371, -0.1450,  0.8849,  ...,  0.6219, -1.4993, -0.1890],
         [-0.6464,  1.0247,  0.7741,  ..., -0.1103, -3.8964, -1.4245],
         ...,
         [-2.0894, -0.5207, -0.0071,  ...,  0.2462, -2.8643, -0.8986],
         [-0.0597, -0.9253,  1.1884,  ...,  0.0332, -0.6494, -0.0531],
         [-0.9783, -0.8845, -0.0953,  ...,  0.4618, -1.0861, -1.2029]]],
       device='cuda:0')

In [None]:
# Display the model outputs for the batch as returned by model(inputs, mask), before truncation.
outputs

tensor([[[-0.1389, -0.5081, -0.3362,  ...,  0.3684,  0.2839,  0.4170],
         [-2.2625, -0.1232,  1.0554,  ...,  0.7423, -1.6623, -0.2631],
         [-0.7219,  1.0109,  0.8119,  ..., -0.0616, -3.9642, -1.5129],
         ...,
         [-1.4484, -1.0098, -0.1699,  ..., -0.2217, -0.6729,  0.2320],
         [-0.4084, -0.0734, -0.1730,  ...,  0.3982, -0.6137,  0.0183],
         [-1.7066, -0.0720,  0.0958,  ...,  0.3263, -1.3347, -0.3726]],

        [[-0.1372, -0.6028, -0.4260,  ...,  0.2968,  0.3841,  0.3483],
         [-2.1371, -0.1450,  0.8849,  ...,  0.6219, -1.4993, -0.1890],
         [-0.6464,  1.0247,  0.7741,  ..., -0.1103, -3.8964, -1.4245],
         ...,
         [-1.6019, -0.9580, -0.2105,  ..., -0.2810, -0.2262,  0.0546],
         [-0.4252,  0.0775, -0.3439,  ...,  0.4071, -0.1987, -0.2586],
         [-1.8843,  0.0857,  0.0350,  ...,  0.2321, -1.1898, -0.6596]]],
       device='cuda:0')