In [None]:
import os
from pathlib import Path

import torch

from src.data.load_small_data import *

# --- Data / tokenizer configuration ------------------------------------------
# NOTE(review): create_dataloaders() below is called with no arguments, so none
# of these notebook-level constants are handed to it explicitly. Presumably the
# loader module falls back to its own defaults -- confirm, and pass the values
# explicitly if the function accepts them.
DATASET_NAME   = "Ankursingh/openwebtext_10K"
DATASET_CONFIG = "plain_text"   # default config of the dataset
VOCAB_SIZE = 16000
MIN_FREQ  = 2
BLOCK_SIZE   = 256  # context window
VAL_FRACTION   = 0.1
TOKENIZER_PATH = Path("owt10k_tokenizer.json")

# --- DataLoader configuration -------------------------------------------------
CPU_COUNT   = os.cpu_count() or 2   # os.cpu_count() may return None
BATCH_SIZE  = 64
# At most 4 workers, always leaving one core free on machines with > 2 cores.
NUM_WORKERS  = 2 if CPU_COUNT <= 2 else min(4, CPU_COUNT - 1)
# Set before the loaders are built (presumably to avoid tokenizer thread pools
# clashing with DataLoader worker processes -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"


train_loader, val_loader, tokenizer = create_dataloaders()

# --- Smoke test: pull one batch and decode its first sample -------------------
x, y = next(iter(train_loader))
print("Batch x shape:", x.shape)
print("Batch y shape:", y.shape)

# Inputs and targets must line up token-for-token, and the sequence length must
# agree with BLOCK_SIZE; otherwise the config above is out of sync with the
# loader that was actually built.
assert x.shape == y.shape, f"input/target shape mismatch: {x.shape} vs {y.shape}"
assert x.shape[1] == BLOCK_SIZE, (
    f"loader yields sequences of length {x.shape[1]}, expected BLOCK_SIZE={BLOCK_SIZE}"
)

example_ids = x[0].tolist()
text = tokenizer.decode(example_ids)
print("Texto ejemplo (primer sample de x):")
print(text)

Dataset({
    features: ['text'],
    num_rows: 10000
})
Dataset({
    features: ['text'],
    num_rows: 4007
})
Cargando tokenizer desde owt10k_tokenizer.json...
Tokenizando y concatenando textos...
Total de tokens en este split: 11,175,296
Número de secuencias: 43,483
Forma inputs:  torch.Size([43483, 256])
Forma targets: torch.Size([43483, 256])
Tokenizando y concatenando textos...
Total de tokens en este split: 4,808,361
Número de secuencias: 18,709
Forma inputs:  torch.Size([18709, 256])
Forma targets: torch.Size([18709, 256])
Batch x shape: torch.Size([64, 256])
Batch y shape: torch.Size([64, 256])
Texto ejemplo (primer sample de x):
premier.ticketek.com.au

■ make beautiful music with elton john and his band

media_camera elton john performs in adelaide on january 28

the legendary sir elton john is playing all the hits from his brilliant career spanning five decades including songs from his classic album goodbye yellow brick road which recently celebrated its 40th anniversary.


In [None]:
from src.data.data_utils import *

  
# Print the first batch only: two samples, 50 token ids each, so the
# input/target pairing can be checked by eye.
inspect_autoregressive_loader(
    train_loader, tokenizer, num_batches=1, max_examples=2, max_tokens_print=50
)


=== Batch 0 ===

--- Ejemplo 0 ---
Input IDs   (primeros 50): [138, 33, 2872, 401, 138, 138, 46, 294, 182, 505, 5295, 1505, 1704, 4402, 1209, 138, 138, 46, 294, 182, 505, 5295, 235, 1505, 1704, 335, 3488, 10375, 207, 2496, 3041, 4402, 17, 256, 2728, 943, 4617, 209, 9540, 3041, 4402, 3870, 234, 14057, 7731, 831, 4036, 209, 1456, 2571]
Target IDs  (primeros 50): [33, 2872, 401, 138, 138, 46, 294, 182, 505, 5295, 1505, 1704, 4402, 1209, 138, 138, 46, 294, 182, 505, 5295, 235, 1505, 1704, 335, 3488, 10375, 207, 2496, 3041, 4402, 17, 256, 2728, 943, 4617, 209, 9540, 3041, 4402, 3870, 234, 14057, 7731, 831, 4036, 209, 1456, 2571, 204]
Input texto (modelo VE):
"\n> learn more\n\neuler hermes north america insurance company\n\neuler hermes is north america's largest provider of trade credit insurance. we offer both domestic and export credit insurance policies that insure clients against commercial and political risk"
Target texto (modelo DEBE predecir):
"> learn more\n\neuler hermes north am

---

In [None]:
# Optional cleanup cell: release a model left over from a previous run and hand
# cached GPU memory back to the driver before building a new model.
import gc

import torch

# `model` only exists when a later cell has already been executed; pop() keeps
# this cell safe on a fresh kernel (a plain `del model` raises NameError there).
globals().pop("model", None)

# Collect first so the model's tensors are actually freed, then release the
# cached blocks -- emptying the cache before dropping the reference frees nothing.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [None]:
from src.model.gpt_model import *
from src.training.main_loop import *

# Build the GPT-2 style model sized to the tokenizer vocabulary.
vocab_size = tokenizer.get_vocab_size()
block_size = 256  # must equal the sequence length produced by the data loaders

device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT2(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=8,
    n_head=8,
    d_model=512,
    dropout=0.1,
).to(device)

# Always define the flag so cells that read it work on single-GPU / CPU runs
# too (it used to be assigned only inside the multi-GPU branch).
n_gpus = torch.cuda.device_count()
use_dataparallel = n_gpus > 1
if use_dataparallel:
    print(f"Usando {n_gpus} GPUs con DataParallel")
    model = torch.nn.DataParallel(model)

def id2tok_fn(ids):
    """Decode a sequence of token ids into text with the notebook's tokenizer.

    Args:
        ids: Token ids as a list, or any object exposing ``tolist()`` (e.g. a
            tensor or array), or another iterable of ints.

    Returns:
        Whatever ``tokenizer.decode`` returns for the ids (the decoded text).
    """
    # Normalise to a plain list so decode() always gets the same input type,
    # whether the caller passes a tensor, an array, a tuple or a list.
    if hasattr(ids, "tolist"):
        ids = ids.tolist()
    elif not isinstance(ids, list):
        ids = list(ids)
    return tokenizer.decode(ids)


# Training run; every hyper-parameter is collected in one place so a run can be
# read (and tweaked) at a glance.
train_kwargs = dict(
    # data
    # NOTE(review): val_loader is supplied while val_checking is False --
    # confirm that skipping validation is intentional.
    val_loader=val_loader,
    val_checking=False,
    # optimisation
    epochs=10,
    base_lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    label_smoothing=0.1,
    grad_clip=1.0,
    # mixed precision
    # NOTE(review): fp16 AMP is requested even when `device` fell back to
    # "cpu" -- verify how train_gpt_lm handles that case.
    amp_enabled=True,
    amp_dtype="fp16",
    device=device,
    # logging / previews / checkpoints
    log_every=150,
    preview_every=500,
    id2tok_fn=id2tok_fn,
    ckpt_path="gptmini_owt10k.pt",
    save_ckpt_every=3,
)

history = train_gpt_lm(model, train_loader, **train_kwargs)

Usando 2 GPUs con DataParallel
[Epoch 1 | step  150/680 | global_step=150] train_loss=8.9753  ppl=7905.66  tok_acc=3.49%  tok/s=62,364
[Epoch 1 | step  300/680 | global_step=300] train_loss=8.2705  ppl=3906.99  tok_acc=5.50%  tok/s=63,175
[Epoch 1 | step  450/680 | global_step=450] train_loss=7.8397  ppl=2539.43  tok_acc=6.93%  tok/s=63,386
— preview (LM, teacher-forced argmax) —
CTX: " eldemar's royal treasury. their priorities during this time consisted of investigating the invasion forces, looking for possible signs of red robe, trying to track down the rest of the missing key pieces and figuring out some way to leave the time loop. of course, since actually retrieving even the known pieces of the key was impossible with their current skills, and they had no idea what kind"
REF: "emar's royal treasury. their priorities during this time consisted of investigating the invasion forces, looking for possible signs of red robe, trying to track down the rest of the missing key pieces and f

In [None]:
from src.inference.generate_text import *

prompt = "whats your name?"

# The training cell may have wrapped the model in DataParallel, which keeps the
# real network under `.module`. Generation runs on a single prompt, so use the
# underlying module directly; this also keeps any custom attributes that
# generate() might read reachable (its internals are not visible here).
gen_model = model.module if isinstance(model, torch.nn.DataParallel) else model
print(generate(gen_model, tokenizer, prompt))

 whats your name? and if it's not an answer, you could see yourself in the future in order to find yourself.

and that's not true.

