In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

import time
import torch
from torchinfo import summary
import mlflow
import os
import sys


# Extend the import path two directory levels up so the `modelos` imports
# below can be resolved (presumably the project root -- confirm).
sys.path.append("../..")

from modelos.GPT.text_loader import TextLoader
from modelos.GPT.gpt import GPTLanguageModel
from modelos.GPT.utils import estimate_loss, save_wikipedia, get_tokenizer

# --- Data -------------------------------------------------------------------
dataset = "wikitext-103-v1"
subsets_max_size = 20
num_training_subsets = 1

# --- Tokenizer --------------------------------------------------------------
tokenizer = get_tokenizer("gpt2")
vocab_size = tokenizer.vocab_size

# --- Model architecture -----------------------------------------------------
context_length = 3
embedding_dim = 64
num_of_attention_heads = 1
num_of_blocks = 1

# --- Optimisation -----------------------------------------------------------
batch_size = 64
learning_rate = 0.0001
dropout = 0.1

# --- Training schedule ------------------------------------------------------
eval_interval = 20
epochs = 1

# --- Device: prefer CUDA, then Apple MPS, otherwise fall back to the CPU -----
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Point Google client libraries at the credentials file (path is relative to
# the notebook's working directory).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../../credentials.json"


In [16]:
# Instantiate the language model from the hyperparameters defined in the
# configuration cell.
model = GPTLanguageModel(
    vocab_size=vocab_size,
    context_length=context_length,
    embedding_dim=embedding_dim,
    num_of_blocks=num_of_blocks,
    num_of_attention_heads=num_of_attention_heads,
    dropout=dropout,
    device=device,
)

# `m` holds whatever `model.to(device)` returns; it is used below for the
# parameter count.
m = model.to(device)

# AdamW over every model parameter with the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

def train_subset(model, optimizer, subset, train_fraction=0.2):
    """Run one training pass over a single text subset.

    The text is encoded with the module-level `tokenizer`, split into a
    leading training slice and a trailing evaluation slice, and each slice is
    wrapped in a `TextLoader`. One optimizer step is taken per training batch,
    and train/eval losses are printed about five times during the pass.

    Args:
        model: Called as `model(xb, yb)`; must return a pair whose second
            element is the loss to back-propagate.
        optimizer: Optimizer whose `zero_grad` / `step` are invoked once per
            batch.
        subset: Raw text to tokenize and train on.
        train_fraction: Share of the tokens (taken from the start) used for
            training; the remainder is used for evaluation. Defaults to 0.2,
            the value that was previously hard-coded.
            NOTE(review): 0.2 trains on only 20% of the tokens and evaluates
            on 80% -- confirm this is intentional and not an inverted split.

    Returns:
        None. Progress is reported through `print`.
    """
    data = torch.tensor(tokenizer.encode(subset), dtype=torch.long)
    train_size = int(train_fraction * len(data))

    train_data_loader = TextLoader(data[:train_size], context_length, batch_size, device)
    eval_data_loader = TextLoader(data[train_size:], context_length, batch_size, device)

    num_batches = len(train_data_loader)
    # Clamp to 1: with fewer than five batches `num_batches // 5` is 0 and the
    # modulo below would raise ZeroDivisionError.
    eval_every_n_batches = max(1, num_batches // 5)

    start_time = time.time()
    for batch in range(num_batches):
        if batch % eval_every_n_batches == 0:
            losses = estimate_loss(model, train_data_loader, eval_data_loader, eval_interval)
            interval = time.time() - start_time
            print(
                f"step {batch}/{num_batches}: train loss {losses['train']:.4f}, eval loss {losses['eval']:.4f}, interval time ({device}): {interval}"
            )
            # Restart the timer so each report covers only its own interval.
            start_time = time.time()

        xb, yb = train_data_loader.get_batch()
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()


# Report the model size in millions of parameters.
n_params_millions = sum(param.numel() for param in m.parameters()) / 1e6
print(n_params_millions, "M parameters")


6.529105 M parameters


In [None]:
# Outer loop over epochs, inner loop over the training subset files.
for t in range(epochs):
    print(f"Epoch {t+1}")
    print("____________________________________________________")
    for i in range(num_training_subsets):

        print(f"Training subset {i+1}")
        print("____________________________________")

        # Load subset `i`. The previous `{0}` placeholder interpolated the
        # literal 0, so every iteration re-read the first file.
        with open(f"data/wikitext-103-v1/train-{i}.txt", "r", encoding="utf-8") as f:
            subset = f.read()

        # Train after the file is closed so the handle is not held open for
        # the whole training pass.
        train_subset(model, optimizer, subset)

# Ensure the output directory exists; `exist_ok=True` makes this a no-op when
# it is already there, so no separate existence check is needed.
os.makedirs("checkpoints", exist_ok=True)

# NOTE(review): this serializes the whole model object (pickle), so loading it
# later requires the model class to be importable under the same module path.
# Saving `model.state_dict()` is the more portable alternative -- kept as-is
# here so existing loaders keep working.
torch.save(model, "checkpoints/model.pth")