In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd

In [58]:

class Uni_RNN(nn.Module):
    """Unidirectional two-layer LSTM tagger with three per-token output heads.

    Token ids are looked up in the word-embedding matrix of a pretrained BERT
    checkpoint, run through two stacked LSTMs, and the resulting hidden states
    feed three linear heads:

    * ``fc_pinicial`` - 1 logit per token (initial punctuation, binary).
    * ``fc_pfinal``   - 4 logits per token (final punctuation class).
    * ``fc_cap``      - 4 logits per token (capitalization class).

    Args:
        embedding_model: Name or path passed to ``BertModel.from_pretrained``.
        hidden1: Hidden size of the first LSTM (original note suggested 512-384).
        hidden2: Hidden size of the second LSTM (original note suggested 256-128).
    """

    def __init__(self,embedding_model,hidden1,hidden2):
        super().__init__()

        bert = BertModel.from_pretrained(embedding_model)
        # NOTE(review): this weight is an nn.Parameter, so assigning it as an
        # attribute registers it on the module and model.parameters() will
        # include the whole embedding matrix. Freeze it explicitly if it is
        # not meant to be fine-tuned.
        self.embedding_vector = bert.embeddings.word_embeddings.weight
        # Read the width from the loaded matrix instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        self.embedding_dim = self.embedding_vector.shape[1]

        self.LSTM = nn.LSTM(self.embedding_dim, hidden1, batch_first=True)
        self.LSTM2 = nn.LSTM(hidden1, hidden2, batch_first=True)
        self.fc_pinicial = nn.Linear(hidden2, 1)
        self.fc_cap = nn.Linear(hidden2, 4)
        self.fc_pfinal = nn.Linear(hidden2, 4)

    def forward(self,data,inferencia=False):
        """Run the tagger on a tensor of token ids.

        Args:
            data: Integer tensor of token ids, either ``(t_len,)`` (unbatched)
                or ``(batch, t_len)``.
            inferencia: When False (default) the raw logits of every head are
                returned, ready for the loss functions. When True the heads
                are converted to predictions.

        Returns:
            Tuple ``(output_punt_inicial, output_punt_final, output_cap)``.
            With ``inferencia=False`` these are logits with trailing sizes
            1, 4 and 4. With ``inferencia=True`` they are, respectively, the
            sigmoid probability (trailing size 1), the arg-max class index of
            the final-punctuation head and the arg-max class index of the
            capitalization head.
        """
        embedded = self.embedding_vector[data]
        hidden, _ = self.LSTM(embedded)
        hidden, _ = self.LSTM2(hidden)

        output_cap = self.fc_cap(hidden)
        output_punt_inicial = self.fc_pinicial(hidden)
        output_punt_final = self.fc_pfinal(hidden)

        if inferencia:
            # Softmax must run over the class axis (the last one). Calling it
            # without `dim` raises for torch.softmax and silently picks dim 0
            # for 3-D input with F.softmax.
            output_punt_final = torch.argmax(F.softmax(output_punt_final, dim=-1), dim=-1)
            output_cap = torch.argmax(torch.softmax(output_cap, dim=-1), dim=-1)
            output_punt_inicial = torch.sigmoid(output_punt_inicial)

        return output_punt_inicial,output_punt_final,output_cap


In [10]:
import pandas as pd
import torch
from torch.utils.data import Dataset

class dataset_subtitulos(Dataset):
    """Map-style dataset that yields one item per ``instancia_id`` group.

    The parquet file is read once in the constructor and split into one
    DataFrame per ``instancia_id``; indexing converts a group to tensors.
    """

    def __init__(self, path):
        """Load the needed columns from ``path`` and group rows by instance."""
        wanted_columns = [
            "instancia_id",
            "token_id",
            "punt_inicial",
            "pfinal",
            "capitalizacion",
        ]
        frame = pd.read_parquet(path, columns=wanted_columns)

        grouped_frames = []
        for _instance_id, instance_rows in frame.groupby("instancia_id"):
            grouped_frames.append(instance_rows)
        self.instancias = grouped_frames

    def __len__(self):
        """Return the number of instance groups."""
        return len(self.instancias)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the instance at position ``idx``.

        ``features`` holds the ``token_id`` values of the group. ``target`` is
        a long tensor with one row per token and the columns
        ``punt_inicial``, ``pfinal``, ``capitalizacion`` (in that order).
        """
        instance_rows = self.instancias[idx]
        label_columns = ["punt_inicial", "pfinal", "capitalizacion"]

        token_ids = torch.tensor(instance_rows["token_id"].values)
        labels = torch.tensor(instance_rows[label_columns].values, dtype=torch.long)
        return token_ids, labels

# Build the dataset from the parquet file; each item is one instancia_id group.
sub_data = dataset_subtitulos("datos_modelo.parquet")



#data_loader

In [11]:
# Inspect one dataset item: (token ids, per-token targets).
sub_data[2] # target columns, in order: punt_inicial, pfinal, capitalizacion

(tensor([10192, 10321, 13386, 10192, 12994, 21267, 10216, 24256]),
 tensor([[0, 2, 1],
         [0, 0, 1],
         [0, 2, 1],
         [0, 0, 1],
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         [0, 2, 0]]))

In [40]:
# Build the train/validation DataLoaders (80% / 20% split).
train_size = int(len(sub_data) * 0.8)
valid_size = len(sub_data) - train_size

# Seed the split so the same instances land in train/validation on every
# Restart & Run All; otherwise validation losses are not comparable across runs.
train_dataset, valid_dataset = random_split(
    sub_data,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(42),
)

# batch_size=None disables automatic batching: every iteration yields a single
# dataset item, i.e. one (features, target) pair per instance.
data_loader_train = DataLoader(train_dataset, batch_size=None, shuffle=True)
data_loader_valid = DataLoader(valid_dataset, batch_size=None, shuffle=False)

In [48]:
# Fetch a single sample for the shape/loss checks in the following cells.
# next(iter(...)) fails immediately if the loader is empty, instead of leaving
# `data`/`target` undefined as the for/break form did.
data, target = next(iter(data_loader_train))

model = Uni_RNN("bert-base-multilingual-cased", 512, 256)

# NOTE(review): Uni_RNN already loads this checkpoint and exposes the same
# matrix as model.embedding_vector; this second load is kept only in case
# other cells use `bert` / `embedding_vector` directly.
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
embedding_vector = bert.embeddings.word_embeddings.weight

In [50]:
# Sanity-check the cross-entropy loss of the capitalization head on one sample.
ce = nn.CrossEntropyLoss()
print(data.shape, target.shape)

# Call the module itself rather than .forward() so that nn.Module.__call__
# (and any registered hooks) runs.
o1, o2, o3 = model(data)
print(o1.shape, o2.shape, o3.shape)

# Capitalization labels are the last target column.
target_cap = target[:, -1]
print(target_cap.shape)

# Shape notes for CrossEntropyLoss:
# - unbatched sample (this cell): logits are t_len x classes, targets are t_len.
# - batched input: targets are batch_size x t_len and logits come out as
#   batch_size x t_len x classes, so they must be permuted with
#   .permute(0, 2, 1) to batch_size x classes x t_len before the loss.
ce(o3, target_cap)

torch.Size([16]) torch.Size([16, 3])
torch.Size([16, 1]) torch.Size([16, 3]) torch.Size([16, 4])
torch.Size([16])


tensor(1.4061, grad_fn=<NllLossBackward0>)

In [52]:
# Sanity-check the binary loss of the initial-punctuation head on one sample.
bce = nn.BCEWithLogitsLoss()

# Initial-punctuation labels are the first target column; BCE needs floats.
target_pinicial = target[:, 0].float()
print(o1.shape, target_pinicial.shape)

# Drop the trailing size-1 axis so the logits match the target shape.
pinicial_logits = o1.squeeze(1)
bce(pinicial_logits, target_pinicial)


torch.Size([16, 1]) torch.Size([16])


tensor(0.7041, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [59]:
# TRAINING LOOP

def _multitask_loss(model, data, target, bce, ce):
    """Return the summed loss of the three heads for one sample.

    Args:
        model: Module returning ``(punt_inicial, punt_final, cap)`` logits.
        data: Token-id tensor for one instance.
        target: Long tensor with one row per token and the columns
            punt_inicial, pfinal, capitalizacion (in that order).
        bce: Binary loss applied to the initial-punctuation logits.
        ce: Multi-class loss applied to the final-punctuation and
            capitalization logits.

    Returns:
        Scalar tensor: ``loss_cap + loss_pinicial + loss_pfinal``.
    """
    pinicial_target = target[:, 0].float()
    pfinal_target = target[:, 1]
    cap_target = target[:, 2]

    output_punt_inicial, output_punt_final, output_cap = model(data)

    # The binary head returns a trailing size-1 axis; BCEWithLogitsLoss needs
    # input and target of identical shape, so drop it here for train and
    # validation alike (the validation pass previously skipped this step).
    output_punt_inicial = output_punt_inicial.squeeze(1)

    loss_cap = ce(output_cap, cap_target)
    loss_pinicial = bce(output_punt_inicial, pinicial_target)
    loss_pfinal = ce(output_punt_final, pfinal_target)
    return loss_cap + loss_pinicial + loss_pfinal


model = Uni_RNN("bert-base-multilingual-cased", 512, 256)
bce = nn.BCEWithLogitsLoss()
ce = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epoch = 5
train_losses = []
val_losses = []
for epoch in range(num_epoch):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for data, target in data_loader_train:
        optimizer.zero_grad()
        loss = _multitask_loss(model, data, target, bce, ce)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean loss per sample (the loaders yield one instance per iteration).
    train_loss = running_loss / len(data_loader_train)
    train_losses.append(train_loss)

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    running_loss = 0.0
    with torch.inference_mode():
        for data, target in data_loader_valid:
            loss = _multitask_loss(model, data, target, bce, ce)
            running_loss += loss.item()
    val_loss = running_loss / len(data_loader_valid)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}/{num_epoch} -Train loss: {train_loss} - Val loss {val_loss}")

KeyboardInterrupt: 