In [10]:
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, random_split, Dataset
from torch import nn
from random import randrange
from transformers import BertTokenizer

In [11]:
class gpt_dataset(Dataset):
    """Next-token-prediction dataset built from a single text.

    The text is tokenized once with ``tokenizer.encode`` and the resulting id
    stream is cut into consecutive, non-overlapping chunks of at most
    ``context_size`` ids. A chunk holding ``n`` ids contributes ``n - 1``
    samples: for every prefix length ``k`` in ``1 .. n-1`` the input is the
    first ``k`` ids of the chunk, right-padded with ``PAD_ID`` up to
    ``context_size``, and the target is the id at position ``k``.

    Chunks with fewer than two ids (possible for the trailing chunk, and for
    every chunk when ``context_size == 1``) contribute no samples, so
    ``len(dataset)`` counts distinct samples only.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token ids.
        context_size: Maximum chunk length and fixed length of every input.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """

    # Id used to right-pad inputs. NOTE(review): 0 is assumed to be the
    # tokenizer's padding id -- confirm if a different tokenizer is used.
    PAD_ID = 0

    def __init__(self, txt, tokenizer, context_size):
        if context_size < 1:
            raise ValueError("context_size must be a positive integer")

        self.context_size = context_size
        tokens = tokenizer.encode(txt)
        self.sentencedb = [
            tokens[start:start + context_size]
            for start in range(0, len(tokens), context_size)
        ]
        # One (chunk index, prefix length) entry per valid sample. Building
        # the index up front keeps __len__ exact and removes the need to
        # clamp out-of-range positions (which used to duplicate samples).
        self._samples = [
            (chunk_idx, prefix_len)
            for chunk_idx, chunk in enumerate(self.sentencedb)
            for prefix_len in range(1, len(chunk))
        ]

    def __len__(self):
        """Return the number of distinct (context, target) samples."""
        return len(self._samples)

    def __getitem__(self, idx):
        """Return ``(X, y)`` for sample ``idx``.

        ``X`` is a ``torch.long`` tensor of length ``context_size`` holding
        the context ids followed by padding; ``y`` is the id of the token
        that follows the context, as a plain ``int``.

        Raises:
            IndexError: If ``idx`` is outside the range of valid samples.
        """
        chunk_idx, prefix_len = self._samples[idx]
        chunk = self.sentencedb[chunk_idx]

        context = list(chunk[:prefix_len])
        # Right-pad so every input has the same fixed length.
        context += [self.PAD_ID] * (self.context_size - len(context))

        return torch.tensor(context, dtype=torch.long), chunk[prefix_len]

    




In [12]:

# Load the pretrained "bert-base-uncased" BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Small toy text used to sanity-check the dataset. To try the real corpus
# instead, read "Kibalion.txt" (UTF-8) into `txt`.
txt = "nico era un alumno ejemplar dsialf faspinfan faljnjlabfae fanppbfaj f afqéajb fjdasf d"

dataset = gpt_dataset(txt=txt, tokenizer=tokenizer, context_size=10)

# Display a single (input, target) sample as the cell output.
dataset[10]

KeyboardInterrupt: 

In [2]:
# Choose the compute device for training: CUDA first, then Apple MPS,
# falling back to the CPU when neither backend is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

# Define model
class Transformer_model(nn.Module):
    """Sequence classifier: embedding -> Transformer encoder -> linear head.

    A batch of token-id sequences is embedded, passed through a stack of
    four batch-first Transformer encoder layers (10 attention heads,
    dropout 0.3), flattened, passed through a ReLU and projected to
    ``vocab_size`` logits.

    Args:
        vocab_size: Size of the embedding table and number of output logits.
        embed_size: Embedding width, also used as the encoder model
            dimension. PyTorch's multi-head attention requires it to be
            divisible by the number of heads (10 here).
        context_size: Sequence length the linear head is sized for; the
            head expects ``embed_size * context_size`` flattened features.
    """

    def __init__(self, vocab_size, embed_size=200, context_size=50):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embed_size)

        single_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=10,
            dropout=0.3,
            batch_first=True,
        )
        self.encoder_layer = nn.TransformerEncoder(single_layer, num_layers=4)

        flattened_features = embed_size * context_size
        self.linear = nn.Sequential(
            nn.Flatten(),
            nn.ReLU(),
            nn.Linear(flattened_features, vocab_size),
        )

    def forward(self, x):
        """Return the logits produced for the batch of token ids ``x``."""
        return self.linear(self.encoder_layer(self.embed(x)))


NameError: name 'torch' is not defined

In [3]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Each batch is moved to the module-level ``device``, pushed through
    ``model``, scored with ``loss_fn`` and used for one optimizer step. The
    loss of the current batch is shown on the tqdm progress bar.

    Args:
        dataloader: Iterable yielding ``(X, y)`` tensor batches.
        model: Module to optimize; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer bound to the parameters of ``model``.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when ``dataloader``
        yields no batches.
    """
    model.train()

    loop = tqdm(dataloader, desc="Training", leave=True)

    total_loss = 0.0
    num_batches = 0
    for X, y in loop:
        X, y = X.to(device), y.to(device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        # Update tqdm description
        loop.set_postfix(loss=batch_loss)

    return total_loss / num_batches if num_batches else float("nan")
        
        

        
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Batches are moved to the module-level ``device`` and evaluated with
    gradients disabled. Accuracy is the fraction of samples whose arg-max
    logit equals the target; the loss is averaged over batches. The progress
    bar shows the running accuracy over the samples seen so far.

    Args:
        dataloader: Iterable yielding ``(X, y)`` tensor batches.
        model: Module to evaluate; it is switched to evaluation mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.

    Returns:
        tuple[float, float]: ``(accuracy, avg_loss)`` with accuracy in
        ``[0, 1]``. Both are ``nan`` when ``dataloader`` yields no batches,
        instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    test_loss, correct = 0.0, 0.0
    seen, num_batches = 0, 0

    loop = tqdm(dataloader, desc="Testing", leave=True)

    with torch.no_grad():
        for X, y in loop:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            # Count what was actually evaluated so the running figure is a
            # true accuracy and an empty loader cannot divide by zero.
            seen += y.size(0)
            num_batches += 1

            # Update tqdm description
            loop.set_postfix(accuracy=100 * correct / seen)

    avg_loss = test_loss / num_batches if num_batches else float("nan")
    accuracy = correct / seen if seen else float("nan")
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")
    return accuracy, avg_loss

In [4]:

def add_zeros(sentence, context_size):
    """Return ``sentence`` right-padded with zeros as a long tensor on ``device``.

    A copy of ``sentence`` is extended with ``0`` until it holds
    ``context_size`` ids; the input list is left untouched. Sequences that
    are already that long (or longer) are returned without padding or
    truncation.
    """
    missing = context_size - len(sentence)
    padded = sentence.copy()
    if missing > 0:
        padded.extend([0] * missing)
    # Explicit long dtype: the ids are fed to an embedding lookup.
    return torch.tensor(padded, dtype=torch.long).to(device)

def generate_text(model, tokenizer, seed_text, context_size):
    """Greedily extend ``seed_text`` one token at a time.

    The seed is encoded, its first and last ids are dropped, and the model is
    repeatedly fed the zero-padded sequence; the arg-max token of each step
    is appended. The text generated so far is printed after every step.

    Args:
        model: Module mapping a ``(1, context_size)`` long tensor to logits;
            must expose ``model.embed.num_embeddings``.
        tokenizer: Object exposing ``encode(text)`` and ``decode(ids)``.
        seed_text: Text the generation starts from.
        context_size: Fixed input length expected by ``model``.

    Returns:
        list[int]: Seed ids (without the first and last encoded id) followed
        by the generated ids. ``context_size - len(encoded seed)`` tokens are
        generated; none when the encoded seed is at least that long.
    """
    tokens = tokenizer.encode(seed_text)
    # Drop the first and last ids -- presumably the special tokens the
    # tokenizer wraps around the text (verify for other tokenizers).
    sentence = tokens[1:-1]

    # Inference only: disable dropout once and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        for _ in range(context_size - len(tokens)):
            X = add_zeros(sentence, context_size).reshape((1, context_size))

            logits = model(X)

            new_token = int(torch.argmax(logits, 1))

            # Make sure `new_token` is a valid embedding index
            if new_token >= model.embed.num_embeddings:
                new_token = 0  # Fall back to id 0 when out of range

            sentence.append(new_token)

            # `sentence` no longer contains the wrapping ids, so decode all
            # of it; slicing again would hide the first seed token and the
            # token that was just generated.
            print("Texto generado hasta ahora:", tokenizer.decode(sentence))

    return sentence


In [5]:
# Load the pretrained "bert-base-uncased" BERT tokenizer; its vocabulary size
# fixes both the embedding table and the number of output logits.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Build the network with 100-dimensional embeddings (context_size is left at
# the class default) and move it to the selected device.
model = Transformer_model(
    vocab_size=tokenizer.vocab_size,
    embed_size=100,
)
model = model.to(device)

NameError: name 'BertTokenizer' is not defined

In [6]:
# Training hyper-parameters passed to main().
batch_size = 10  # samples per optimization step
epochs = 3       # full passes over the training split
lr = 5e-4        # learning rate handed to the optimizer


def main(model, batch_size, epochs, lr, corpus_path="Kibalion.txt",
         context_size=50,
         seed_text="muchos tópicos de los conocimientos",
         evaluate=False):
    """Train ``model`` on a text corpus and print a generated sample per epoch.

    The corpus is read as UTF-8, turned into a ``gpt_dataset`` using the
    module-level ``tokenizer``, and split 80/20 into training and evaluation
    subsets. After every training epoch a continuation of ``seed_text`` is
    generated and printed.

    Args:
        model: Module to train; its input length must match ``context_size``.
        batch_size: Batch size for both data loaders.
        epochs: Number of training epochs.
        lr: Learning rate for the Adam optimizer.
        corpus_path: Path of the UTF-8 text file to train on.
        context_size: Context length used for the dataset and for generation.
        seed_text: Prompt used for the per-epoch text generation.
        evaluate: When true, also run ``test`` on the held-out split after
            every epoch.
    """
    with open(corpus_path, "r", encoding="utf-8") as file:
        txt = file.read()

    dataset = gpt_dataset(txt, tokenizer, context_size)

    train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation metrics do not depend on sample order, so no shuffling.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_loader, model, loss_fn, optimizer)

        # Try out text generation with the current weights
        generated = generate_text(model, tokenizer, seed_text, context_size)
        print("Texto generado:", generated)

        if evaluate:
            test(test_loader, model, loss_fn)

    print("Done!")
    
    
# Launch training with the hyper-parameters defined above.
main(model=model, batch_size=batch_size, epochs=epochs, lr=lr)

NameError: name 'model' is not defined