In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd

In [2]:

class Uni_RNN(nn.Module):
    """Two stacked unidirectional LSTMs over BERT word embeddings with three per-token heads.

    Heads (one prediction per input token):
      * ``fc_cap``      -> 1 logit  (sigmoid in inference mode)
      * ``fc_pinicial`` -> 1 logit  (sigmoid in inference mode)
      * ``fc_pfinal``   -> 3 logits (argmax class index in inference mode)

    Args:
        embedding_model: name or path passed to ``BertModel.from_pretrained``.
        hidden1: hidden size of the first LSTM (original author's note: 384-512 suggested).
        hidden2: hidden size of the second LSTM (original author's note: 128-256 suggested).
    """

    def __init__(self, embedding_model, hidden1, hidden2):
        super().__init__()

        bert = BertModel.from_pretrained(embedding_model)
        # Only the word-embedding table is kept; the rest of BERT is discarded.
        # `weight` is an nn.Parameter, so this assignment registers it on the
        # module and `model.parameters()` will include the whole table.
        # NOTE(review): call `.requires_grad_(False)` on it if fine-tuning the
        # embeddings is not intended - confirm with the author.
        self.embedding_vector = bert.embeddings.word_embeddings.weight
        # Read the width from the table instead of hard-coding 768 so that
        # checkpoints with a different hidden size also work.
        self.embedding_dim = self.embedding_vector.shape[1]

        self.LSTM = nn.LSTM(self.embedding_dim, hidden1, batch_first=True)
        self.LSTM2 = nn.LSTM(hidden1, hidden2, batch_first=True)
        self.fc_pinicial = nn.Linear(hidden2, 1)  # sigmoid
        self.fc_cap = nn.Linear(hidden2, 1)       # sigmoid
        self.fc_pfinal = nn.Linear(hidden2, 3)    # 3-way classification

    def forward(self, data, inferencia=False):
        """Run the tagger on a tensor of token ids.

        Args:
            data: integer tensor of token ids. A ``(1, seq_len)`` or ``(seq_len,)``
                input is processed as a single unbatched sequence; a
                ``(batch, seq_len)`` input with ``batch > 1`` is processed batched.
            inferencia: when False, raw logits are returned (for the loss
                functions); when True, probabilities / class indices are returned.

        Returns:
            Tuple ``(output_cap, output_punt_inicial, output_punt_final)``.
            For a single sequence the shapes are ``(seq_len, 1)``, ``(seq_len, 1)``
            and ``(seq_len, 3)`` (``(seq_len,)`` for the last one in inference
            mode). Batched input keeps a leading batch dimension.
        """
        embedded = self.embedding_vector[data]
        # Keep the historical contract: a batch of exactly one sequence is run
        # unbatched, so outputs have no leading batch dimension.
        if embedded.dim() == 3 and embedded.shape[0] == 1:
            embedded = embedded.squeeze(0)

        hidden, _ = self.LSTM(embedded)
        hidden, _ = self.LSTM2(hidden)

        output_cap = self.fc_cap(hidden)
        output_punt_inicial = self.fc_pinicial(hidden)
        output_punt_final = self.fc_pfinal(hidden)

        if inferencia:
            # softmax is monotonic, so argmax over the logits gives the same
            # class index as argmax over the softmax probabilities.
            output_punt_final = torch.argmax(output_punt_final, dim=-1)
            output_cap = torch.sigmoid(output_cap)
            output_punt_inicial = torch.sigmoid(output_punt_inicial)

        return output_cap, output_punt_inicial, output_punt_final


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset

class dataset_subtitulos(Dataset):
    """Map-style dataset where each item is one ``instancia_id`` group of a parquet table.

    The whole table is read once in ``__init__`` and split into one DataFrame
    per ``instancia_id``; the groups are held in memory in ``self.instancias``.
    """

    def __init__(self, path):
        """Read the needed columns from ``path`` and group the rows by instance."""
        columnas = [
            "instancia_id",
            "token_id",
            "punt_inicial",
            "puntos",
            "comas",
            "cierre_pregunta",
            "capitalizacion",
        ]
        tabla = pd.read_parquet(path, columns=columnas)
        grupos = tabla.groupby("instancia_id")
        self.instancias = [sub_tabla for _, sub_tabla in grupos]

    def __len__(self):
        """Number of distinct instances."""
        return len(self.instancias)

    def __getitem__(self, idx):
        """Return ``(features, target)`` tensors for the instance at position ``idx``.

        ``features`` holds the ``token_id`` column; ``target`` holds, per row, the
        columns punt_inicial, puntos, comas, cierre_pregunta, capitalizacion
        in that order.
        """
        instancia = self.instancias[idx]
        columnas_objetivo = [
            "punt_inicial",
            "puntos",
            "comas",
            "cierre_pregunta",
            "capitalizacion",
        ]
        tokens = instancia["token_id"].values
        etiquetas = instancia[columnas_objetivo].values
        return torch.tensor(tokens), torch.tensor(etiquetas)

# Build the dataset: reads the parquet file (path relative to the working
# directory) and keeps one DataFrame per instancia_id in memory.
sub_data = dataset_subtitulos("datos_modelo.parquet")



#data_loader

In [None]:
# Inspect one instance: displays the (features, target) tensor pair returned by __getitem__.
sub_data[2]

In [None]:
# 80/20 train/validation split.
# A seeded generator makes the split identical after every kernel restart,
# so train and validation losses are comparable across runs.
SPLIT_SEED = 42
train_size = int(len(sub_data) * 0.8)
valid_size = len(sub_data) - train_size

train_dataset, valid_dataset = random_split(
    sub_data,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# batch_size=1: each item is a whole instance and no padding collate_fn is
# defined here (presumably instances differ in length - confirm).
data_loader_train = DataLoader(train_dataset, batch_size=1, shuffle=True)
data_loader_valid = DataLoader(valid_dataset, batch_size=1, shuffle=False)

In [None]:
# Smoke test: push a single training batch through a freshly built model.
data, target = next(iter(data_loader_train))
model = Uni_RNN("bert-base-multilingual-cased", 512, 256)
# NOTE(review): `bert` and `embedding_vector` are not used in this cell, and
# Uni_RNN already loads the same checkpoint internally. They are kept only in
# case another cell references them - delete if not.
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
embedding_vector = bert.embeddings.word_embeddings.weight
print(data.shape)
# Call the module itself rather than .forward() so registered hooks run.
model(data)

In [None]:
# TRAINING LOOP
# Target columns, in the order produced by dataset_subtitulos.__getitem__:
#   0 punt_inicial | 1 puntos | 2 comas | 3 cierre_pregunta | 4 capitalizacion
model = Uni_RNN("bert-base-multilingual-cased", 512, 256)
bce = nn.BCEWithLogitsLoss()
ce = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def _multitask_loss(model, data, target, bce, ce):
    """Return the summed loss of the three heads for one batch.

    Labels are selected on the LAST axis so the column choice is correct
    whether or not the DataLoader added a leading batch dimension, and both
    logits and labels are flattened to one row per token so their shapes match.
    """
    # BCEWithLogitsLoss and probability-style CrossEntropyLoss need float targets.
    target = target.float()
    pinicial_target = target[..., 0].reshape(-1)
    # Three punctuation indicator columns <-> the three logits of fc_pfinal.
    # NOTE(review): tokens where all three indicators are 0 contribute zero
    # loss with probability-style targets; consider an explicit "none" class.
    pfinal_target = target[..., 1:4].reshape(-1, 3)
    # NOTE(review): assumes capitalizacion is binary (0/1) - confirm.
    cap_target = target[..., 4].reshape(-1)

    cap_output, pinicial_output, pfinal_output = model(data)

    loss_cap = bce(cap_output.reshape(-1), cap_target)
    loss_pinicial = bce(pinicial_output.reshape(-1), pinicial_target)
    loss_pfinal = ce(
        pfinal_output.reshape(-1, pfinal_output.shape[-1]), pfinal_target
    )
    # Sum of three scalar tensors -> a single scalar tensor.
    return loss_cap + loss_pinicial + loss_pfinal


num_epoch = 5
train_losses = []
val_losses = []
for epoch in range(num_epoch):
    model.train()
    running_loss = 0.0
    for data, target in data_loader_train:
        optimizer.zero_grad()
        loss = _multitask_loss(model, data, target, bce, ce)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Each batch loss is already a mean, so average over the number of batches.
    train_loss = running_loss / len(data_loader_train)
    train_losses.append(train_loss)

    model.eval()
    running_loss = 0.0
    with torch.inference_mode():
        for data, target in data_loader_valid:
            loss = _multitask_loss(model, data, target, bce, ce)
            running_loss += loss.item()
    val_loss = running_loss / len(data_loader_valid)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}/{num_epoch} -Train loss: {train_loss} - Val loss {val_loss}")