# Dataset

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset and
# checkpoint paths used by the later cells live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:",device)


Device: cuda


In [None]:
# Dataset que devuelva SECUENCIAS
# antes cada getitem devolvía UN SOLO TOKEN,
# lo cual hace imposible usar padding, batching.
# Ahora se agrupan por instancia_id.

class dataset_secuencias(Dataset):
    """Dataset that yields one whole token sequence per ``instancia_id``.

    The rows of ``df`` are grouped by the ``instancia_id`` column; every item
    of the dataset is built from the rows that share one id.

    Args:
        df: DataFrame with the columns ``instancia_id``, ``token_id``,
            ``capitalizacion``, ``punt_inicial`` and ``pfinal``.
        cant_instancias: Optional bound. When given, only the rows whose
            ``instancia_id`` is ``<= cant_instancias`` are kept.
    """

    # Columns returned by __getitem__, in output order.
    _COLUMNAS = ("token_id", "capitalizacion", "punt_inicial", "pfinal")

    def __init__(self, df, cant_instancias=None):
        # Identity test: ``!= None`` would be routed through the operand's
        # own ``__ne__`` instead of simply checking for "no limit given".
        if cant_instancias is not None:
            df = df[df["instancia_id"] <= cant_instancias]

        # key = instancia_id, value = the rows (tokens) of that sequence.
        self.groups = df.groupby("instancia_id")
        self.keys = list(self.groups.groups.keys())

    def __len__(self):
        """Return the number of distinct sequences."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(tokens, labels_cap, labels_pi, labels_pf)`` for item ``idx``.

        Each element is a 1-D ``torch.long`` tensor with one entry per row of
        the selected group.
        """
        group = self.groups.get_group(self.keys[idx])
        return tuple(
            torch.tensor(group[columna].tolist(), dtype=torch.long)
            for columna in self._COLUMNAS
        )

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch, device="cuda"):
    """Pad a list of variable-length samples into batch tensors.

    Args:
        batch: list of ``(tokens, caps, pis, pfs)`` tuples of 1-D tensors.
            The list itself is left untouched.
        device: target device of the padded tensors. A CUDA device is
            replaced by the CPU when CUDA is not available, so the default
            also works on CPU-only runtimes.

    Returns:
        ``(padded_seqs, padded_caps, padded_pis, padded_pfs, lengths)``.
        The padded tensors are ``torch.long``, have shape ``(B, max_len)``,
        live on ``device`` and are ordered by decreasing sequence length
        (the order ``pack_padded_sequence`` needs with
        ``enforce_sorted=True``). Tokens are padded with 0 and labels with
        -100. ``lengths`` stays on the CPU.
    """
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    # sorted() rather than list.sort(): do not reorder the caller's list.
    ordered = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
    seqs, caps, pis, pfs = (list(columna) for columna in zip(*ordered))

    lengths = torch.tensor([s.size(0) for s in seqs], dtype=torch.long)  # CPU

    pad_token = 0
    pad_label = -100

    def _pad(tensores, relleno):
        # Pad on the CPU and move the finished batch once, instead of issuing
        # one host-to-device copy per sequence.
        lote = pad_sequence(tensores, batch_first=True, padding_value=relleno)
        return lote.to(device=device, dtype=torch.long)

    return (
        _pad(seqs, pad_token),
        _pad(caps, pad_label),
        _pad(pis, pad_label),
        _pad(pfs, pad_label),
        lengths,
    )


# Modelos

In [None]:
class Uni_RNN(nn.Module):
    """Two stacked unidirectional LSTMs over frozen multilingual-BERT word embeddings.

    Three linear heads classify every token: capitalization (4 classes),
    opening punctuation (2 classes) and closing punctuation (4 classes).

    Args:
        hidden1: hidden size of the first LSTM.
        hidden2: hidden size of the second LSTM (input size of the heads).
    """

    def __init__(self, hidden1=512, hidden2=256):
        super().__init__()
        # Only the word-embedding table of BERT is used, so the full model is
        # never moved to the accelerator and is released right after the copy.
        bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        pesos = bert.embeddings.word_embeddings.weight.detach().clone()
        del bert

        self.embedding_dim = pesos.size(1)
        self.embedding = nn.Embedding.from_pretrained(pesos, freeze=True)

        self.LSTM = nn.LSTM(self.embedding_dim, hidden1, batch_first=True)
        self.LSTM2 = nn.LSTM(hidden1, hidden2, batch_first=True)

        self.fc_cap = nn.Linear(hidden2, 4)        # multi-class
        self.fc_pinicial = nn.Linear(hidden2, 2)   # 2 classes
        self.fc_pfinal = nn.Linear(hidden2, 4)     # multi-class

    @property
    def embedding_matrix(self):
        """Frozen embedding table (alias of ``self.embedding.weight``).

        Exposed as a property instead of a stored attribute: assigning the
        BERT weight ``Parameter`` to the module registers it as an extra,
        unused parameter that shows up in ``parameters()`` and is written
        into every checkpoint.
        """
        return self.embedding.weight

    def forward(self, seqs, lengths, inferencia=False):
        """Classify every token of a padded batch.

        Args:
            seqs: token ids, shape ``(B, T)``.
            lengths: real length of each sequence; any order is accepted.
            inferencia: when true, return class indices instead of logits.

        Returns:
            ``(cap, punt_inicial, punt_final)``. Logits of shape
            ``(B, T, n_classes)``, or ``(B, T)`` class indices when
            ``inferencia`` is true.
        """
        emb = self.embedding(seqs)

        # pack_padded_sequence needs the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, torch.as_tensor(lengths).cpu(), batch_first=True, enforce_sorted=False
        )

        data, _ = self.LSTM(packed)
        data, _ = self.LSTM2(data)

        # Contextual representation of every token; total_length keeps the
        # output as wide as the input even if it was padded past max(lengths).
        final, _ = nn.utils.rnn.pad_packed_sequence(
            data, batch_first=True, total_length=seqs.size(1)
        )

        salidas = (self.fc_cap(final), self.fc_pinicial(final), self.fc_pfinal(final))

        if inferencia:
            # softmax is monotonic, so argmax over the logits is enough.
            salidas = tuple(logits.argmax(dim=-1) for logits in salidas)

        return salidas

class Bi_RNN(nn.Module):
    """Bidirectional LSTM over frozen multilingual-BERT word embeddings.

    Three linear heads classify every token: capitalization (4 classes),
    opening punctuation (2 classes) and closing punctuation (4 classes).

    Args:
        hidden1: hidden size of each LSTM direction; the heads receive the
            concatenation of both directions (``2 * hidden1`` features).
    """

    def __init__(self, hidden1=380):
        super().__init__()
        # Only the word-embedding table of BERT is used, so the full model is
        # never moved to the accelerator and is released right after the copy.
        bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        pesos = bert.embeddings.word_embeddings.weight.detach().clone()
        del bert

        self.embedding_dim = pesos.size(1)
        self.embedding = nn.Embedding.from_pretrained(pesos, freeze=True)

        self.LSTM = nn.LSTM(self.embedding_dim, hidden1, bidirectional=True, batch_first=True)

        self.fc_cap = nn.Linear(2 * hidden1, 4)        # multi-class
        self.fc_pinicial = nn.Linear(2 * hidden1, 2)   # 2 classes
        self.fc_pfinal = nn.Linear(2 * hidden1, 4)     # multi-class

    @property
    def embedding_matrix(self):
        """Frozen embedding table (alias of ``self.embedding.weight``).

        Exposed as a property instead of a stored attribute: assigning the
        BERT weight ``Parameter`` to the module registers it as an extra,
        unused parameter that shows up in ``parameters()`` and is written
        into every checkpoint.
        """
        return self.embedding.weight

    def forward(self, seqs, lengths, inferencia=False):
        """Classify every token of a padded batch.

        Args:
            seqs: token ids, shape ``(B, T)``.
            lengths: real length of each sequence; any order is accepted.
            inferencia: when true, return class indices instead of logits.

        Returns:
            ``(cap, punt_inicial, punt_final)``. Logits of shape
            ``(B, T, n_classes)``, or ``(B, T)`` class indices when
            ``inferencia`` is true.
        """
        emb = self.embedding(seqs)

        # pack_padded_sequence needs the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, torch.as_tensor(lengths).cpu(), batch_first=True, enforce_sorted=False
        )

        data, _ = self.LSTM(packed)

        # Contextual representation of every token; total_length keeps the
        # output as wide as the input even if it was padded past max(lengths).
        final, _ = nn.utils.rnn.pad_packed_sequence(
            data, batch_first=True, total_length=seqs.size(1)
        )

        salidas = (self.fc_cap(final), self.fc_pinicial(final), self.fc_pfinal(final))

        if inferencia:
            # softmax is monotonic, so argmax over the logits is enough.
            salidas = tuple(logits.argmax(dim=-1) for logits in salidas)

        return salidas


# Cargamos los datos


In [None]:
# Load the training dataframe from Drive; the dataset and DataLoaders are
# built from it in the next cell.
df = pd.read_parquet("/content/drive/MyDrive/TP AA II/datos_entrenamiento_posta.parquet")

In [None]:

from functools import partial

dataset = dataset_secuencias(df)

# 80 % of the sequences for training, the remainder for validation.
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size

train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

# collate_fn defaults to "cuda"; bind the device chosen at the top of the
# notebook so the loaders also work on a CPU-only runtime.
_collate = partial(collate_fn, device=device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=_collate)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=_collate)


# Entrenamiento de Uni RNN

In [None]:
# Build the unidirectional model on the selected device, together with its
# optimizer and the loss shared by the three heads.
model = Uni_RNN(hidden1=512, hidden2=256).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Positions labelled -100 are skipped by the loss.
ce = nn.CrossEntropyLoss(ignore_index=-100)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
# Training loop

def _batch_loss(model, criterion, seqs, caps, pini, pfin, lengths):
    """Return the sum of the three per-token cross-entropy losses of one batch."""
    out_cap, out_pini, out_pfin = model(seqs, lengths)
    # The number of classes is read from the logits instead of being
    # hard-coded per head.
    return sum(
        criterion(logits.reshape(-1, logits.size(-1)), etiquetas.reshape(-1))
        for logits, etiquetas in ((out_cap, caps), (out_pini, pini), (out_pfin, pfin))
    )


num_epoch = 1

for epoch in range(num_epoch):

    model.train()
    total_loss = 0

    for seqs, caps, pini, pfin, lengths in train_loader:
        # lengths stays on the CPU: the model only uses it (via .cpu()) for
        # pack_padded_sequence, so copying it to the GPU is a wasted round trip.
        seqs, caps, pini, pfin = (t.to(device) for t in (seqs, caps, pini, pfin))

        loss = _batch_loss(model, ce, seqs, caps, pini, pfin, lengths)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running mean of the batch losses over the epoch.
        total_loss += loss.item() / len(train_loader)

    model.eval()
    val_loss = 0

    with torch.inference_mode():
        for seqs, caps, pini, pfin, lengths in valid_loader:
            seqs, caps, pini, pfin = (t.to(device) for t in (seqs, caps, pini, pfin))

            loss = _batch_loss(model, ce, seqs, caps, pini, pfin, lengths)
            val_loss += loss.item() / len(valid_loader)

    print(f"Epoch {epoch+1}/{num_epoch} | Train Loss = {total_loss:.4f} | Val Loss = {val_loss:.4f}")




# Evaluacion

In [None]:
def _aplicar_capitalizacion(palabra, clase):
    """Apply capitalization class ``clase`` to ``palabra`` (punctuation included)."""
    if clase == 0:
        return palabra
    if clase == 1:
        # Upper-case the first letter; skip over a leading opening mark.
        corte = 2 if palabra.startswith("¿") else 1
        return palabra[:corte].upper() + palabra[corte:]
    if clase == 2:
        # Characters at odd positions are upper-cased, the rest are kept.
        return "".join(c if j % 2 == 0 else c.upper() for j, c in enumerate(palabra))
    return palabra.upper()  # class 3


def decodificador(data, lengths, model, tokenizer=None):
    """Rebuild and print the text predicted by ``model`` for a batch of token ids.

    Args:
        data: token ids of shape ``(B, T)``; a single 1-D sequence is also
            accepted.
        lengths: number of real (non-padding) tokens of each sequence.
        model: callable as ``model(data, lengths, inferencia=True)`` that
            returns per-token class indices ``(cap, pinicial, pfinal)``.
        tokenizer: optional object exposing ``convert_ids_to_tokens``. When
            omitted, the ``bert-base-multilingual-cased`` tokenizer is loaded
            on the first call and reused afterwards.

    Returns:
        The list of reconstructed texts, one per sequence. Each text is also
        printed, followed by a blank line.
    """
    from statistics import mode

    if tokenizer is None:
        # Loading the tokenizer is expensive and this function is called once
        # per batch, so the instance is cached on the function object.
        tokenizer = getattr(decodificador, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
            decodificador._tokenizer = tokenizer

    lengths = torch.as_tensor(lengths).reshape(-1)
    if data.dim() == 1:
        # Promote a single sequence to a batch of one *before* calling the model.
        data = data.unsqueeze(0)

    with torch.inference_mode():
        cap, pinicial, pfinal = model(data, lengths, inferencia=True)

    cierres = {0: "", 1: "?", 2: ".", 3: ","}
    textos = []

    for fila in range(data.shape[0]):
        n = int(lengths[fila])
        piezas = tokenizer.convert_ids_to_tokens(data[fila, :n].tolist())
        # Plain Python ints: 0-dim tensors hash by identity, which makes
        # statistics.mode() return the first element instead of the mode.
        pini_fila = pinicial[fila, :n].tolist()
        pfin_fila = pfinal[fila, :n].tolist()
        cap_fila = cap[fila, :n].tolist()

        palabras = []       # words with their punctuation attached
        caps_palabras = []  # per word: capitalization class of each word piece
        palabra, caps_actual = "", []

        for i, pieza in enumerate(piezas):
            apertura = "¿" if pini_fila[i] > 0.5 else ""
            cierre = cierres.get(pfin_fila[i], "")
            texto = pieza[2:] if pieza.startswith("##") else pieza

            palabra += apertura + texto + cierre
            caps_actual.append(cap_fila[i])

            # A word ends at the last token or when the next token is not a
            # "##" continuation piece.
            es_ultimo = i == len(piezas) - 1
            if es_ultimo or not piezas[i + 1].startswith("##"):
                palabras.append(palabra)
                caps_palabras.append(caps_actual)
                palabra, caps_actual = "", []

        texto_final = " ".join(
            _aplicar_capitalizacion(p, mode(c)) for p, c in zip(palabras, caps_palabras)
        )
        textos.append(texto_final)

        print(texto_final)
        print()

    return textos

In [None]:
from functools import partial

datos_visualizacion = pd.read_parquet("/content/drive/MyDrive/TP AA II/visualizacion.parquet")
dataset_visualizacion = dataset_secuencias(datos_visualizacion)
# Bind the selected device: collate_fn's own default is "cuda".
visualizacion_loader = DataLoader(
    dataset_visualizacion,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, device=device),
)

# Decoding only needs predictions, so no autograd graph is recorded.
with torch.inference_mode():
    for seqs, _, _, _, lengths in visualizacion_loader:
        decodificador(seqs, lengths, model)


In [None]:
def f1_macro_dataset(dataloader, model, n_caps=4, n_pini=2, n_pfin=4, device="cuda"):
    """Compute the macro-averaged F1 of the three heads over a whole dataloader.

    Args:
        dataloader: iterable of ``(seqs, caps, pini, pfin, lengths)`` batches.
        model: module called as ``model(seqs, lengths, inferencia=True)``; it
            must return per-token class indices for the three tasks. It is
            switched to eval mode.
        n_caps, n_pini, n_pfin: number of classes of each task.
        device: device used for the counters and the batches. A CUDA device
            is replaced by the CPU when CUDA is not available.

    Returns:
        Dict with the keys ``"caps"``, ``"pini"`` and ``"pfin"``; each value
        is a 0-dim float tensor on ``device``. Only positions inside each
        sequence's real length are counted.
    """
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    def _contadores(n_clases):
        # Rows: tp, fp, fn. Integer counters stay exact; float32 accumulators
        # stop registering +1 increments once a count passes 2**24.
        return torch.zeros(3, n_clases, dtype=torch.long, device=device)

    def _acumular(contadores, pred, real, mask):
        # Compare against all classes at once: (n_clases, B, L) boolean masks.
        clases = torch.arange(contadores.size(1), device=device).view(-1, 1, 1)
        es_pred = pred.unsqueeze(0) == clases
        es_real = real.unsqueeze(0) == clases
        contadores[0] += (es_pred & es_real & mask).sum(dim=(1, 2))
        contadores[1] += (es_pred & ~es_real & mask).sum(dim=(1, 2))
        contadores[2] += (~es_pred & es_real & mask).sum(dim=(1, 2))

    def _f1_macro(contadores):
        tp, fp, fn = contadores.to(torch.float32)
        # The small epsilons avoid 0/0 for classes that never appear.
        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)
        return f1.mean()

    cont_caps = _contadores(n_caps)
    cont_pini = _contadores(n_pini)
    cont_pfin = _contadores(n_pfin)

    model.eval()

    for seqs, caps, pini, pfin, lengths in dataloader:
        seqs = seqs.to(device)
        caps = caps.to(device)
        pini = pini.to(device)
        pfin = pfin.to(device)

        with torch.inference_mode():
            pred_caps, pred_pini, pred_pfin = model(seqs, lengths, inferencia=True)

        # True for the real tokens of each sequence, False for padding.
        ancho = caps.shape[1]
        mask = torch.arange(ancho, device=device).unsqueeze(0) < lengths.to(device).unsqueeze(1)

        _acumular(cont_caps, pred_caps.to(device), caps, mask)
        _acumular(cont_pini, pred_pini.to(device), pini, mask)
        _acumular(cont_pfin, pred_pfin.to(device), pfin, mask)

    return {
        "caps": _f1_macro(cont_caps),
        "pini": _f1_macro(cont_pini),
        "pfin": _f1_macro(cont_pfin),
    }



In [None]:
# Persist the trained unidirectional model. The whole module object is passed
# to torch.save (not only a state_dict).
torch.save(model, "/content/drive/MyDrive/TP AA II/red_uni.pt")

In [None]:
from functools import partial

held_out = pd.read_parquet("/content/drive/MyDrive/TP AA II/datos_control_100000.parquet")

held_out_dataset = dataset_secuencias(held_out)

# Use the device selected at the top of the notebook instead of a hard-coded
# "cuda", both for batching and for the metric counters.
held_out_loader = DataLoader(
    held_out_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, device=device),
)

f1_macro_dataset(held_out_loader, model, n_caps=4, n_pini=2, n_pfin=4, device=device)

{'caps': tensor(0.7540, device='cuda:0'),
 'pini': tensor(0.7444, device='cuda:0'),
 'pfin': tensor(0.5581, device='cuda:0')}

# Entrenamiento Bi RNN

In [None]:
# Build the bidirectional model on the selected device, together with its
# optimizer and the loss shared by the three heads.
model = Bi_RNN(hidden1=512).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Positions labelled -100 are skipped by the loss.
ce = nn.CrossEntropyLoss(ignore_index=-100)

In [None]:
# Training loop

def _batch_loss(model, criterion, seqs, caps, pini, pfin, lengths):
    """Return the sum of the three per-token cross-entropy losses of one batch."""
    out_cap, out_pini, out_pfin = model(seqs, lengths)
    # The number of classes is read from the logits instead of being
    # hard-coded per head.
    return sum(
        criterion(logits.reshape(-1, logits.size(-1)), etiquetas.reshape(-1))
        for logits, etiquetas in ((out_cap, caps), (out_pini, pini), (out_pfin, pfin))
    )


num_epoch = 1

for epoch in range(num_epoch):

    model.train()
    total_loss = 0

    for seqs, caps, pini, pfin, lengths in train_loader:
        # lengths stays on the CPU: the model only uses it (via .cpu()) for
        # pack_padded_sequence, so copying it to the GPU is a wasted round trip.
        seqs, caps, pini, pfin = (t.to(device) for t in (seqs, caps, pini, pfin))

        loss = _batch_loss(model, ce, seqs, caps, pini, pfin, lengths)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running mean of the batch losses over the epoch.
        total_loss += loss.item() / len(train_loader)

    model.eval()
    val_loss = 0

    with torch.inference_mode():
        for seqs, caps, pini, pfin, lengths in valid_loader:
            seqs, caps, pini, pfin = (t.to(device) for t in (seqs, caps, pini, pfin))

            loss = _batch_loss(model, ce, seqs, caps, pini, pfin, lengths)
            val_loss += loss.item() / len(valid_loader)

    print(f"Epoch {epoch+1}/{num_epoch} | Train Loss = {total_loss:.4f} | Val Loss = {val_loss:.4f}")



# Evaluacion Bi RNN

In [None]:
from functools import partial

datos_visualizacion = pd.read_parquet("/content/drive/MyDrive/TP AA II/visualizacion.parquet")
dataset_visualizacion = dataset_secuencias(datos_visualizacion)
# Bind the selected device: collate_fn's own default is "cuda".
visualizacion_loader = DataLoader(
    dataset_visualizacion,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, device=device),
)

# Decoding only needs predictions, so no autograd graph is recorded.
with torch.inference_mode():
    for seqs, _, _, _, lengths in visualizacion_loader:
        decodificador(seqs, lengths, model)


In [None]:
from functools import partial

held_out = pd.read_parquet("/content/drive/MyDrive/TP AA II/datos_control_100000.parquet")

held_out_dataset = dataset_secuencias(held_out)

# Use the device selected at the top of the notebook instead of a hard-coded
# "cuda", both for batching and for the metric counters.
held_out_loader = DataLoader(
    held_out_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, device=device),
)

f1_macro_dataset(held_out_loader, model, n_caps=4, n_pini=2, n_pfin=4, device=device)

In [None]:
# Persist the trained bidirectional model. The whole module object is passed
# to torch.save (not only a state_dict); the "Prediciones" section loads it back.
torch.save(model, "/content/drive/MyDrive/TP AA II/red_bi.pt")

# Predicciones

In [None]:
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so
# only load checkpoints written by this notebook.
# map_location places the tensors on the selected device, so a checkpoint
# saved on a GPU runtime can also be loaded on a CPU-only one.
model = torch.load(
    "/content/drive/MyDrive/TP AA II/red_bi.pt",
    weights_only=False,
    map_location=device,
)

In [None]:
class dataset_secuencias_test(Dataset):
    """Dataset that yields one whole token sequence per ``instancia_id``, plus the id.

    The rows of ``df`` are grouped by the ``instancia_id`` column; every item
    of the dataset is built from the rows that share one id.

    Args:
        df: DataFrame with the columns ``instancia_id``, ``token_id``,
            ``capitalizacion``, ``punt_inicial`` and ``pfinal``.
        cant_instancias: Optional bound. When given, only the rows whose
            ``instancia_id`` is ``<= cant_instancias`` are kept.
    """

    # Columns returned by __getitem__ (before the id), in output order.
    _COLUMNAS = ("token_id", "capitalizacion", "punt_inicial", "pfinal")

    def __init__(self, df, cant_instancias=None):
        # Identity test: ``!= None`` would be routed through the operand's
        # own ``__ne__`` instead of simply checking for "no limit given".
        if cant_instancias is not None:
            df = df[df["instancia_id"] <= cant_instancias]

        # key = instancia_id, value = the rows (tokens) of that sequence.
        self.groups = df.groupby("instancia_id")
        self.keys = list(self.groups.groups.keys())

    def __len__(self):
        """Return the number of distinct sequences."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(tokens, labels_cap, labels_pi, labels_pf, instancia_id)``.

        The first four elements are 1-D ``torch.long`` tensors with one entry
        per row of the selected group; the last one is the group key.
        """
        instancia_id = self.keys[idx]
        group = self.groups.get_group(instancia_id)
        tensores = tuple(
            torch.tensor(group[columna].tolist(), dtype=torch.long)
            for columna in self._COLUMNAS
        )
        return (*tensores, instancia_id)

def collate_fn_test(batch, device="cuda"):
    """Pad a list of variable-length samples, keeping their ``instancia_id``.

    Args:
        batch: list of ``(tokens, caps, pis, pfs, instancia_id)`` tuples; the
            first four elements are 1-D tensors. The list itself is left
            untouched.
        device: target device of the padded tensors. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        ``(padded_seqs, padded_caps, padded_pis, padded_pfs, lengths,
        instancia_ids)``. The padded tensors are ``torch.long``, have shape
        ``(B, max_len)``, live on ``device`` and are ordered by decreasing
        sequence length (the order ``pack_padded_sequence`` needs with
        ``enforce_sorted=True``). Tokens are padded with 0 and labels with
        -100. ``lengths`` stays on the CPU and ``instancia_ids`` is a list in
        the same (sorted) order as the rows of the padded tensors.
    """
    from torch.nn.utils.rnn import pad_sequence

    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    # sorted() rather than list.sort(): do not reorder the caller's list.
    ordered = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
    seqs, caps, pis, pfs, instancia_ids = (list(columna) for columna in zip(*ordered))

    lengths = torch.tensor([s.size(0) for s in seqs], dtype=torch.long)  # CPU

    pad_token = 0
    pad_label = -100

    def _pad(tensores, relleno):
        # Pad on the CPU and move the finished batch once, instead of issuing
        # one host-to-device copy per sequence.
        lote = pad_sequence(tensores, batch_first=True, padding_value=relleno)
        return lote.to(device=device, dtype=torch.long)

    return (
        _pad(seqs, pad_token),
        _pad(caps, pad_label),
        _pad(pis, pad_label),
        _pad(pfs, pad_label),
        lengths,
        instancia_ids,
    )

In [None]:
from functools import partial

datos_test = pd.read_csv("/content/drive/MyDrive/TP AA II/datos_test.csv")

# Switch to the column names dataset_secuencias_test reads.
datos_test = datos_test.rename(columns={
    "punt_inicial": "punt_inicial",
    "punt_final": "pfinal",
    "capitalización": "capitalizacion"
})

# Missing values are replaced by 0 so the columns can be turned into tensors.
datos_test.fillna(0, inplace=True)

dataset_test = dataset_secuencias_test(datos_test)

# Bind the selected device: collate_fn_test's own default is "cuda".
test_loader = DataLoader(
    dataset_test,
    batch_size=16,
    collate_fn=partial(collate_fn_test, device=device),
    shuffle=False,
)

# Row labels of every instance, looked up once from the existing groupby
# instead of scanning the whole frame with a boolean mask per instance.
filas_por_instancia = dataset_test.groups.groups

# Only predictions are needed, so no autograd graph is recorded.
with torch.inference_mode():
    for seqs, _, _, _, lengths, instancia_ids in test_loader:

        out_cap, out_pini, out_pfin = model(seqs, lengths, inferencia=True)

        out_cap = out_cap.cpu().numpy()
        out_pini = out_pini.cpu().numpy()
        out_pfin = out_pfin.cpu().numpy()

        for i, inst_id in enumerate(instancia_ids):

            L = lengths[i].item()  # number of real tokens
            idxs = filas_por_instancia[inst_id]

            # Explicit check (an assert would be stripped under ``python -O``).
            if len(idxs) != L:
                raise ValueError(
                    f"ERROR: instancia {inst_id} tiene {len(idxs)} filas en df pero {L} tokens en la red"
                )

            # Write the predictions back token by token.
            datos_test.loc[idxs, "capitalizacion"] = out_cap[i, :L]
            datos_test.loc[idxs, "punt_inicial"] = out_pini[i, :L]
            datos_test.loc[idxs, "pfinal"] = out_pfin[i, :L]

# Restore the column names of the input file.
datos_test = datos_test.rename(columns={
    "punt_inicial": "punt_inicial",
    "pfinal": "punt_final",
    "capitalizacion": "capitalización"
})

In [None]:
# Write the test frame, now holding the predicted labels, to Drive.
# NOTE(review): no `index=` argument is passed, so the row index is presumably
# written as an extra first column — confirm the expected submission format.
datos_test.to_csv("/content/drive/MyDrive/TP AA II/predicciones_UnderDogs.csv")


In [None]:
# Print the reconstructed text of every test batch; the labels and the
# instancia_ids yielded by the loader are not needed here.
for seqs, _ , _, _, lengths,_ in test_loader:
  decodificador(seqs,lengths,model)