In [1]:
import torch
from torch.utils.data import DataLoader, random_split, Dataset
from torch import nn
import pandas as pd

In [2]:
class imdb50reviws_dataset(Dataset):
    def __init__(self, csv, tokenizer):
        df=pd.read_csv(csv)
        df=df.replace("positive",1)
        df=df.replace("negative",0)
        self.x=df["review"]
        self.y=df["sentiment"]
        self.tokenizer=tokenizer
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sentence=self.x[idx]
        tokens= self.tokenizer.encode(sentence, max_length=100, pad_to_max_length=True)
        y_logs=nn.functional.one_hot(torch.tensor(self.y[idx]),2)
        return torch.Tensor(tokens), y_logs

In [3]:
from transformers import AutoTokenizer

# Descarga un tokenizer preentrenado (por ejemplo, BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokeniza un texto
texto = "Hola, quiero entrenar un modelo LSTM con un tokenizer."
tokens = tokenizer(texto, max_length=100, pad_to_max_length=True)   

print("Tokens:", tokens)


  from .autonotebook import tqdm as notebook_tqdm
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tokens: {'input_ids': [101, 7570, 2721, 1010, 21864, 10624, 4372, 7913, 11802, 4895, 2944, 2080, 1048, 3367, 2213, 9530, 4895, 19204, 17629, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0



In [4]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class LSTM_model(nn.Module):
    def __init__(self, vocab_size, embed_size=64, hidden_size=128):
        super(LSTM_model, self).__init__()
        self.embed= nn.Embedding(vocab_size, embed_size)
        self.h0=torch.zeros(1, 1, hidden_size)
        self.c0=torch.zeros(1, 1, hidden_size)
        self.encoder=nn.LSTM(embed_size, hidden_size, num_layers=100, bidirectional=True)
        self.linear = nn.Sequential(
            nn.ReLU(),
            nn.Linear(hidden_size*2, 2)
        )
        

    def forward(self, x):
        x = x.long()
        emb = self.embed(x)
        out_encod, (hn, cn)=self.encoder(emb)
        logits = self.linear(out_encod)
        return logits


  return torch._C._cuda_getDeviceCount() > 0


Using cpu device


In [5]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
            
def test(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    
    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
batch_size =64
epochs =50
lr= 0.003

def main(batch_size, epochs, lr):    
    dataset = imdb50reviws_dataset("./prueba.csv", tokenizer)
    
    train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
    
    train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size, shuffle=True)
    
    model = LSTM_model(vocab_size=tokenizer.vocab_size).to(device)
    
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)
    
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_loader, model, loss_fn, optimizer)
        test(test_loader, model, loss_fn)
    print("Done!")
    
    
main(batch_size, epochs, lr)

  df=df.replace("negative",0)


Epoch 1
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 21.1%, Avg loss: 4.605170 

Epoch 2
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 10.5%, Avg loss: 4.605170 

Epoch 3
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 10.5%, Avg loss: 4.605170 

Epoch 4
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 36.8%, Avg loss: 4.605170 

Epoch 5
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 21.1%, Avg loss: 4.605170 

Epoch 6
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 31.6%, Avg loss: 4.605170 

Epoch 7
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 31.6%, Avg loss: 4.605170 

Epoch 8
-------------------------------
loss: 4.605171  [   64/   80]
Test Error: 
 Accuracy: 21.1%, Avg loss: 4.605170 

Epoch 9
----------------