In [None]:
!pip install transformers



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel
import pandas as pd

In [None]:
# Mount Google Drive (Colab only) so that the parquet files read later from
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sequence-level dataset.
# An earlier version returned ONE token per __getitem__, which made padding
# and batching impossible; rows are now grouped by instancia_id.

class dataset_secuencias(Dataset):
    """Dataset that yields one complete token sequence per ``instancia_id``.

    Args:
        df: DataFrame with one row per token. The columns read here are
            ``instancia_id``, ``token_id``, ``capitalizacion``,
            ``punt_inicial`` and ``pfinal``.
        cant_instancias: Optional threshold. When given, only rows whose
            ``instancia_id`` is ``<= cant_instancias`` are kept.
            NOTE(review): this equals "number of instances" only if the ids
            are consecutive and start at 1 - confirm against the data.

    Each item is a 4-tuple of 1-D ``torch.long`` tensors
    ``(tokens, labels_cap, labels_pi, labels_pf)``, all of the same length
    (one entry per row of the group, in the row order of ``df``).
    """

    # Columns converted to tensors, in the order they are returned.
    _COLUMNAS = ("token_id", "capitalizacion", "punt_inicial", "pfinal")

    def __init__(self, df, cant_instancias=None):
        # `is not None` (identity) instead of `!= None`, which would dispatch
        # to a user-defined __ne__ for array-like arguments.
        if cant_instancias is not None:
            df = df[df["instancia_id"] <= cant_instancias]

        # key = instancia_id, value = sub-frame holding all of its tokens.
        self.groups = df.groupby("instancia_id")
        self.keys = list(self.groups.groups.keys())

    def __len__(self):
        """Number of distinct ``instancia_id`` values kept."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return the tensors of the ``idx``-th sequence."""
        group = self.groups.get_group(self.keys[idx])
        return tuple(
            torch.tensor(group[columna].tolist(), dtype=torch.long)
            for columna in self._COLUMNAS
        )

In [None]:
# collate_fn with padding: the DataLoader needs a way to merge sequences of
# different lengths into a single batch, so shorter ones are padded.

def collate_fn(batch):
    """Pad a list of variable-length samples into batch tensors.

    Args:
        batch: sequence of 4-tuples ``(tokens, cap, pi, pf)`` of 1-D tensors,
            where the four tensors of one sample share the same length.

    Returns:
        ``(padded_seqs, padded_cap, padded_pi, padded_pf, lengths)``.
        Rows are ordered by decreasing sequence length. Token ids are padded
        with 0 and labels with -100; ``lengths`` is a 1-D ``torch.long``
        tensor holding the unpadded length of each row.

    The input list is left untouched (the previous version sorted it in
    place, reordering the caller's list as a side effect).
    """
    pad_token = 0
    pad_label = -100  # matches CrossEntropyLoss(ignore_index=-100)

    # Longest first; `sorted` is stable, so ties keep their incoming order.
    ordered = sorted(batch, key=lambda item: len(item[0]), reverse=True)

    seqs = [item[0] for item in ordered]
    caps = [item[1] for item in ordered]
    pis = [item[2] for item in ordered]
    pfs = [item[3] for item in ordered]

    lengths = torch.tensor([len(s) for s in seqs], dtype=torch.long)

    pad = torch.nn.utils.rnn.pad_sequence
    padded_seqs = pad(seqs, batch_first=True, padding_value=pad_token)
    padded_cap = pad(caps, batch_first=True, padding_value=pad_label)
    padded_pi = pad(pis, batch_first=True, padding_value=pad_label)
    padded_pf = pad(pfs, batch_first=True, padding_value=pad_label)

    return padded_seqs, padded_cap, padded_pi, padded_pf, lengths

In [None]:
# RNN implementation

class Uni_RNN(nn.Module):
    """Unidirectional two-layer LSTM tagger with three per-token heads.

    Token ids are looked up in the word-embedding table of a pretrained BERT
    checkpoint, passed through two stacked LSTMs and projected by three
    linear heads:

    * ``fc_cap``      - capitalization, 4 classes
    * ``fc_pinicial`` - opening punctuation, 2 classes
    * ``fc_pfinal``   - closing punctuation, 4 classes

    Args:
        hidden1: hidden size of the first LSTM. (The original notes suggested
            256-384 "depending on your config"; the default is 512.)
        hidden2: hidden size of the second LSTM. (Notes suggested 128-256.)
        pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, hidden1=512, hidden2=256,
                 pretrained_name="bert-base-multilingual-cased"):
        super().__init__()

        # Only the word-embedding table is kept; the rest of BERT is dropped
        # when `bert` goes out of scope.
        bert = BertModel.from_pretrained(pretrained_name)
        # The weight is an nn.Parameter, so this assignment registers it as a
        # parameter of this module: it follows .to(device) and is returned by
        # .parameters().
        # NOTE(review): it therefore appears to be updated by any optimizer
        # built from model.parameters() - confirm that fine-tuning the whole
        # table is intended; freeze it otherwise.
        self.embedding_matrix = bert.embeddings.word_embeddings.weight
        self.embedding_dim = self.embedding_matrix.shape[1]

        self.LSTM = nn.LSTM(self.embedding_dim, hidden1, batch_first=True)
        self.LSTM2 = nn.LSTM(hidden1, hidden2, batch_first=True)

        self.fc_cap = nn.Linear(hidden2, 4)        # multiclass
        self.fc_pinicial = nn.Linear(hidden2, 2)   # 2 classes
        self.fc_pfinal = nn.Linear(hidden2, 4)     # multiclass

    def forward(self, seqs, lengths, inferencia=False):
        """Run the tagger on a padded batch.

        Args:
            seqs: integer tensor of token ids, indexed as (batch, time).
            lengths: 1-D tensor with the unpadded length of each row. Rows
                may come in any order.
            inferencia: when False, return the raw logits of each head
                (last dimension 4, 2 and 4). When True, return the argmax
                class index per position instead.

        Returns:
            ``(output_cap, output_punt_inicial, output_punt_final)``. The
            time dimension always equals ``seqs.size(1)``.
        """
        emb = self.embedding_matrix[seqs]

        # enforce_sorted=False lets PyTorch sort/unsort internally, so the
        # model no longer crashes on batches that are not sorted by length.
        # pack_padded_sequence requires `lengths` on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        data, _ = self.LSTM(packed)
        data, _ = self.LSTM2(data)

        # total_length keeps the output aligned with the padded inputs (and
        # labels) even when the batch is padded beyond its longest sequence.
        final, _ = nn.utils.rnn.pad_packed_sequence(
            data, batch_first=True, total_length=seqs.size(1)
        )

        output_cap = self.fc_cap(final)
        output_punt_inicial = self.fc_pinicial(final)
        output_punt_final = self.fc_pfinal(final)

        if inferencia:
            # softmax is monotonic, so argmax over the logits selects the
            # same class without the extra pass.
            output_cap = torch.argmax(output_cap, dim=-1)
            output_punt_inicial = torch.argmax(output_punt_inicial, dim=-1)
            output_punt_final = torch.argmax(output_punt_final, dim=-1)

        return output_cap, output_punt_inicial, output_punt_final


In [None]:
# Load the dataframe and build the train/validation dataloaders.
DATA_PATH = "/content/drive/MyDrive/TP AA II/datos_modelo.parquet"
MAX_INSTANCIA_ID = 50000   # rows with instancia_id above this are dropped
TRAIN_FRACTION = 0.8
BATCH_SIZE = 16
SPLIT_SEED = 42            # fixes the train/validation partition across runs

df = pd.read_parquet(DATA_PATH)

dataset = dataset_secuencias(df, MAX_INSTANCIA_ID)

train_size = int(TRAIN_FRACTION * len(dataset))
valid_size = len(dataset) - train_size

# A seeded generator makes the split reproducible after a kernel restart;
# without it every run validates on a different subset.
train_dataset, valid_dataset = random_split(
    dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Device:", device)



Device: cuda


In [None]:
# Build the model on the selected device, together with its optimizer and loss.
model = Uni_RNN(hidden1=32, hidden2=16).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Targets equal to -100 are skipped by the loss.
ce = nn.CrossEntropyLoss(ignore_index=-100)

In [None]:
# Training loop

num_epoch = 3


def _batch_loss(model, batch, criterion, device):
    """Return the sum of the three head losses for one collated batch."""
    seqs, caps, pini, pfin, lengths = (t.to(device) for t in batch)
    outputs = model(seqs, lengths)          # (cap, punt_inicial, punt_final)
    targets = (caps, pini, pfin)
    # Flatten (batch, time, classes) -> (batch*time, classes) for the loss.
    return sum(
        criterion(out.reshape(-1, out.size(-1)), tgt.reshape(-1))
        for out, tgt in zip(outputs, targets)
    )


def _train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over `loader`; return the mean batch loss."""
    model.train()
    n_batches = len(loader)
    running = 0.0
    for batch in loader:
        loss = _batch_loss(model, batch, criterion, device)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running += loss.item() / n_batches
    return running


def _eval_epoch(model, loader, criterion, device):
    """Return the mean batch loss over `loader` without updating the model."""
    model.eval()
    n_batches = len(loader)
    running = 0.0
    with torch.inference_mode():
        for batch in loader:
            loss = _batch_loss(model, batch, criterion, device)
            running += loss.item() / n_batches
    return running


for epoch in range(num_epoch):
    total_loss = _train_epoch(model, train_loader, ce, optimizer, device)
    val_loss = _eval_epoch(model, valid_loader, ce, device)

    print(f"Epoch {epoch+1}/{num_epoch} | Train Loss = {total_loss:.4f} | Val Loss = {val_loss:.4f}")




Epoch 1/3 | Train Loss = 0.9993 | Val Loss = 0.8158
Epoch 2/3 | Train Loss = 0.7646 | Val Loss = 0.7720
Epoch 3/3 | Train Loss = 0.7117 | Val Loss = 0.7580


In [None]:
# Closing punctuation appended for each `pfinal` class index.
_CIERRES = {0: "", 1: "?", 2: ".", 3: ","}


def _capitalizar(token, clase):
    """Apply capitalization class `clase` to a word-initial token.

    1 -> first letter upper, 3 -> all upper, 2 -> every odd-indexed character
    upper (the notebook's rendering of "mixed" case), anything else -> as is.
    """
    if clase == 1:
        return token[:1].upper() + token[1:]
    if clase == 3:
        return token.upper()
    if clase == 2:
        return "".join(
            ch.upper() if j % 2 else ch for j, ch in enumerate(token)
        )
    return token


def decodificador(data, model, tokenizer=None):
    """Rebuild punctuated, capitalized text from token ids and the model.

    Args:
        data: token ids of ONE sequence (list of ints or 1-D tensor).
        model: callable as ``model(seqs, lengths, inferencia=True)`` returning
            ``(cap, punt_inicial, punt_final)`` class indices indexed as
            (batch, time), like ``Uni_RNN``.
        tokenizer: object with ``convert_ids_to_tokens``. When omitted, the
            "bert-base-multilingual-cased" ``BertTokenizer`` is loaded; pass
            one explicitly to avoid reloading it on every call.

    Returns:
        The decoded text: WordPiece continuation tokens ("##...") are glued
        to the previous token and words are separated by single spaces.
        An empty input yields "".

    Fixes over the previous version: the model is now called with the
    `lengths` argument it requires and on a (1, time) batch; its outputs are
    unpacked in the order the model returns them (cap, opening, closing);
    and the class-2 branch no longer calls ``range`` on a string.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

    ids = torch.as_tensor(data, dtype=torch.long).reshape(-1)
    if ids.numel() == 0:
        return ""
    tokens = tokenizer.convert_ids_to_tokens(ids.tolist())

    # Run on whatever device the model's parameters live on.
    param = next(iter(model.parameters()), None)
    if param is not None:
        ids = ids.to(param.device)
    lengths = torch.tensor([ids.numel()], dtype=torch.long)

    with torch.inference_mode():
        cap, pinicial, pfinal = model(ids.unsqueeze(0), lengths, inferencia=True)
    cap = cap[0].tolist()
    pinicial = pinicial[0].tolist()
    pfinal = pfinal[0].tolist()

    palabras = []
    for token, c, pi, pf in zip(tokens, cap, pinicial, pfinal):
        apertura = "¿" if pi == 1 else ""
        cierre = _CIERRES.get(pf, "")

        if token[:2] == "##":
            # Continuation piece: no capitalization, glued to the open word.
            pieza = apertura + token[2:] + cierre
            if palabras:
                palabras[-1] += pieza
            else:
                palabras.append(pieza)
        else:
            palabras.append(apertura + _capitalizar(token, c) + cierre)

    return " ".join(palabras)

In [None]:
# Build the loader for the visualization set.
# The dataset is now built from `datos_visualizacion`; it was previously built
# from `df`, so the file loaded here was never used.
# NOTE(review): assumes visualizacion.parquet has the same columns that
# dataset_secuencias reads from the training frame - confirm.
datos_visualizacion=pd.read_parquet("/content/drive/MyDrive/TP AA II/visualizacion.parquet")
dataset_visualizacion=dataset_secuencias(datos_visualizacion)
visualizacion_loader = DataLoader(dataset_visualizacion, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [None]:
# Predict class indices for the visualization batches.
# eval() + inference_mode() make this a pure forward pass: no autograd graph
# is recorded, which the previous version did needlessly.
# Only the predictions of the last batch remain bound after the loop.
model.eval()
with torch.inference_mode():
    for seqs, caps, pini, pfin, lengths in visualizacion_loader:

        seqs = seqs.to(device)
        caps = caps.to(device)
        pini = pini.to(device)
        pfin = pfin.to(device)
        lengths = lengths.to(device)

        out_cap, out_pini, out_pfin = model(seqs, lengths, inferencia=True)