In [None]:
from src.data.load_large_data import *


# Data configuration followed by a quick sanity check of the training loader.
#
# NOTE(review): none of the constants below are passed to create_dataloaders(),
# which is called with no arguments. The recorded run reports a vocabulary size
# of 16000 while VOCAB_SIZE here is 32000, so the loader presumably reads its
# own module-level settings and these rebinding assignments have no effect.
# Confirm, then either pass them explicitly or remove them.
DATASET_NAME   = "wikitext"
DATASET_CONFIG = "wikitext-103-raw-v1"

VOCAB_SIZE = 32000
MIN_FREQ  = 2
BLOCK_SIZE   = 256  # context window
VAL_FRACTION   = 0.1
# `Path` and `os` are not imported in this cell; presumably they arrive through
# the star import above. TODO confirm and import them explicitly.
TOKENIZER_PATH = Path("wikitext103_tokenizer.json")

# Fall back to 2 when os.cpu_count() returns None.
CPU_COUNT   = os.cpu_count() or 2
BATCH_SIZE  = 64
# 2 workers on machines with <= 2 CPUs, otherwise CPU_COUNT - 1 capped at 4.
NUM_WORKERS  = 2 if CPU_COUNT <= 2 else min(4, CPU_COUNT - 1)
# Presumably disables the tokenizers library's internal thread pool so it does
# not clash with forked DataLoader workers. TODO confirm this is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Returns the train loader, the validation loader and the tokenizer.
train_loader, val_loader, tokenizer = create_dataloaders()

# Pull a single batch and show the tensor shapes of inputs and targets.
x, y = next(iter(train_loader))
print("Batch x shape:", x.shape)
print("Batch y shape:", y.shape)

# Decode the first input sequence of the batch back to text as a visual check.
example_ids = x[0].tolist()
text = tokenizer.decode(example_ids)
print("Texto ejemplo (primer sample de x):")
print(text)

README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1801350
})
Dataset({
    features: ['text'],
    num_rows: 3760
})
Entrenando tokenizer BPE...



Tamaño vocabulario: 16000
Tokenizer guardado en /kaggle/working/wikitext103_tokenizer.json
Tokenizando y concatenando textos...
Total de tokens en este split: 119,513,253
Número de secuencias: 465,032
Forma inputs:  torch.Size([465032, 256])
Forma targets: torch.Size([465032, 256])
Tokenizando y concatenando textos...
Total de tokens en este split: 250,245
Número de secuencias: 973
Forma inputs:  torch.Size([973, 256])
Forma targets: torch.Size([973, 256])
Batch x shape: torch.Size([64, 256])
Batch y shape: torch.Size([64, 256])
Texto ejemplo (primer sample de x):
 nuns . 
 the actor is the french association pour la béatification de l 'impératrice zita . 
 the postulator for the cause is father alexander leonhardt . the judge of the tribunal is father bruno bonnet . the promoter of justice is the father françois scrive . 
 = = titles , style

In [None]:
from src.data.data_utils import *
  
# Print ids and decoded text for a couple of (input, target) pairs so the
# one-token shift between inputs and targets can be checked by eye.
inspect_autoregressive_loader(
    train_loader,
    tokenizer,
    num_batches=1,
    max_examples=2,
    max_tokens_print=50,
)


=== Batch 0 ===

--- Ejemplo 0 ---
Input IDs   (primeros 50): [2442, 416, 283, 197, 259, 480, 199, 185, 507, 7922, 350, 1457, 240, 185, 15305, 336, 243, 182, 1047, 259, 434, 4987, 737, 1457, 313, 185, 5561, 5037, 189, 680, 185, 737, 209, 5399, 216, 189, 7254, 551, 864, 189, 215, 3684, 3671, 341, 56, 6608, 13262, 11077, 189, 6181]
Target IDs  (primeros 50): [416, 283, 197, 259, 480, 199, 185, 507, 7922, 350, 1457, 240, 185, 15305, 336, 243, 182, 1047, 259, 434, 4987, 737, 1457, 313, 185, 5561, 5037, 189, 680, 185, 737, 209, 5399, 216, 189, 7254, 551, 864, 189, 215, 3684, 3671, 341, 56, 6608, 13262, 11077, 189, 6181, 7618]
Input texto (modelo VE):
'fall after an f @-@ 5 . the other predominant match on the undercard was a six @-@ man tag team match from the raw brand , between the team of triple h , ric flair , and chris jericho facing shawn michaels , kevin'
Target texto (modelo DEBE predecir):
' after an f @-@ 5 . the other predominant match on the undercard was a six @-@ man tag team

---

## Test Embeddings 

In [None]:
from src.model.embeddings import *

# Smoke test: embed one batch of token ids and print the resulting shape.
device = "cuda" if torch.cuda.is_available() else "cpu"

vocab_size = tokenizer.get_vocab_size()
d_model, block_size = 256, 256

emb = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1)
emb = emb.to(device)

x_ids, y_ids = next(iter(train_loader))
x_ids = x_ids.to(device)

# Recorded output for this cell is [B, T, d_model] = [64, 256, 256].
x_emb = emb(x_ids)
print(x_emb.shape)


torch.Size([64, 256, 256])


## Test Attention 

In [None]:
from src.model.attention import *

# Smoke test: run one embedded batch through a single causal self-attention layer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = tokenizer.get_vocab_size()
d_model, block_size = 256, 256

emb = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1)
emb = emb.to(device)
attn = CausalSelfAttention(d_model, num_heads=8, block_size=block_size, dropout=0.1)
attn = attn.to(device)

x_ids, y_ids = next(iter(train_loader))  # token ids, [B, T]
x_ids = x_ids.to(device)

# Embed the ids, then apply attention; the recorded output keeps [B, T, d_model].
x = attn(emb(x_ids))
print(x.shape)



torch.Size([64, 256, 256])


## Test GPT blocks 

In [None]:
from src.model.gpt_blocks import *

# Smoke test: feed one embedded batch through the MLP and through a full
# GPT-2 block, printing each output shape and checking the block output for NaNs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = tokenizer.get_vocab_size()
d_model, block_size, num_heads = 256, 256, 8
d_ff = 4 * d_model  # 1024

emb = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1)
emb = emb.to(device)
mlp = GPT2MLP(d_model, d_ff=d_ff, dropout=0.1)
mlp = mlp.to(device)
block = GPT2Block(
    d_model,
    num_heads=num_heads,
    block_size=block_size,
    d_ff=d_ff,
    dropout=0.1,
)
block = block.to(device)

x_ids, y_ids = next(iter(train_loader))  # token ids, [B, T]
x_ids = x_ids.to(device)

# Each stage below printed [64, 256, 256] in the recorded run.
x = emb(x_ids)
print(f"Después de embeddings: {x.shape}")

x_mlp = mlp(x)
print(f"Después de MLP: {x_mlp.shape}")

x_block = block(x)
print(f"Después de GPT2Block: {x_block.shape}")

print(f"¿Hay NaNs en la salida? {torch.isnan(x_block).any().item()}")



Después de embeddings: torch.Size([64, 256, 256])
Después de MLP: torch.Size([64, 256, 256])
Después de GPT2Block: torch.Size([64, 256, 256])
¿Hay NaNs en la salida? False


## Test FULL GPT

In [None]:
from src.model.gpt_model import *

# Smoke test: one forward pass of a small 4-layer GPT-2 with targets, so the
# model returns both logits and the loss.
# `device` is not defined in this cell; it is reused from an earlier cell.
vocab_size = tokenizer.get_vocab_size()
block_size = 256

model = GPT2(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    d_model=256,
    dropout=0.1,
)
model = model.to(device)

# Move both inputs and targets of one batch onto the model's device.
x_ids, y_ids = (ids.to(device) for ids in next(iter(train_loader)))

logits, loss = model(x_ids, y_ids)
print("logits shape:", logits.shape)  # [B, T, vocab_size]
print("loss:", loss.item())

---

# *Training*

In [None]:
# Release the small smoke-test model before the full-size model is built.
import gc

import torch

# pop() rather than `del model`: re-running this cell, or running it on a
# fresh kernel, must not raise NameError when `model` is already gone.
globals().pop("model", None)
gc.collect()

# The allocator can only hand memory back once the last reference to the
# tensors is dropped, so the cache is emptied after the collection above.
# The guard keeps the cell runnable on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [None]:
from src.training.main_loop import *

# Build the full-size model and launch the main training run.
vocab_size = tokenizer.get_vocab_size()
block_size = 256

device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT2(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=10,
    n_head=8,
    d_model=512,
    dropout=0.1,
).to(device)

# Bind the flag on every path; it used to exist only in the multi-GPU branch,
# so any later read on a single-GPU or CPU machine raised NameError.
use_dataparallel = torch.cuda.device_count() > 1
if use_dataparallel:
    print(f"Usando {torch.cuda.device_count()} GPUs con DataParallel")
    model = torch.nn.DataParallel(model)


def id2tok_fn(ids):
    """Decode a sequence of token ids to text with the notebook's tokenizer.

    Args:
        ids: token ids accepted by ``tokenizer.decode``.

    Returns:
        Whatever ``tokenizer.decode`` returns for ``ids``.
    """
    return tokenizer.decode(ids)


# NOTE(review): the checkpoint name mentions "owt10k" although this notebook
# loads the "wikitext" dataset. Confirm the file name is intentional.
history = train_gpt_lm(
    model,
    train_loader,
    val_loader=val_loader,
    epochs=10,
    base_lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    label_smoothing=0.1,
    grad_clip=1.0,
    device=device,
    ckpt_path="gptmini_owt10k.pt",
    log_every=150,
    preview_every=7000,
    id2tok_fn=id2tok_fn,
    amp_enabled=True,
    amp_dtype="fp16",
    val_checking=False,
    save_ckpt_every=3,
)

Usando 2 GPUs con DataParallel
[Epoch 1 | step  150/7267 | global_step=150] train_loss=8.9506  ppl=7712.62  tok_acc=4.44%  tok/s=51,953
[Epoch 1 | step  300/7267 | global_step=300] train_loss=8.2313  ppl=3756.75  tok_acc=6.35%  tok/s=52,936
[Epoch 1 | step  450/7267 | global_step=450] train_loss=7.7675  ppl=2362.63  tok_acc=8.10%  tok/s=52,863
[Epoch 1 | step  600/7267 | global_step=600] train_loss=7.4614  ppl=1739.52  tok_acc=9.40%  tok/s=53,006
[Epoch 1 | step  750/7267 | global_step=750] train_loss=7.2223  ppl=1369.65  tok_acc=10.55%  tok/s=53,011
[Epoch 1 | step  900/7267 | global_step=900] train_loss=7.0297  ppl=1129.71  tok_acc=11.52%  tok/s=53,081
[Epoch 1 | step 1050/7267 | global_step=1050] train_loss=6.8694  ppl=962.41  tok_acc=12.33%  tok/s=53,106
[Epoch 1 | step 1200/7267 | global_step=1200] train_loss=6.7319  ppl=838.73  tok_acc=13.03%  tok/s=53,142
[Epoch 1 | step 1350/7267 | global_step=1350] train_loss=6.6111  ppl=743.31  tok_acc=13.65%  tok/s=53,136
[Epoch 1 | step 150

In [56]:
# Second training call on the same `model` object (it is not rebuilt here), with
# fewer epochs and sparser logging than the first run.
# NOTE(review): warmup_steps is passed again; whether the schedule restarts from
# step 0 on this call depends on train_gpt_lm. Confirm that is intended.
resume_kwargs = dict(
    val_loader=val_loader,
    epochs=4,
    base_lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    label_smoothing=0.1,
    grad_clip=1.0,
    device=device,
    ckpt_path="gptmini_owt10k.pt",
    log_every=1000,
    preview_every=7000,
    id2tok_fn=id2tok_fn,
    amp_enabled=True,
    amp_dtype="fp16",
    val_checking=False,
    save_ckpt_every=2,
)
history = train_gpt_lm(model, train_loader, **resume_kwargs)

[Epoch 1 | step 1000/7267 | global_step=1000] train_loss=3.2498  ppl=25.79  tok_acc=39.95%  tok/s=53,346
[Epoch 1 | step 2000/7267 | global_step=2000] train_loss=3.2949  ppl=26.97  tok_acc=39.41%  tok/s=53,379
[Epoch 1 | step 3000/7267 | global_step=3000] train_loss=3.3319  ppl=27.99  tok_acc=38.97%  tok/s=53,389
[Epoch 1 | step 4000/7267 | global_step=4000] train_loss=3.3542  ppl=28.62  tok_acc=38.73%  tok/s=53,375
[Epoch 1 | step 5000/7267 | global_step=5000] train_loss=3.3672  ppl=29.00  tok_acc=38.59%  tok/s=53,371
[Epoch 1 | step 6000/7267 | global_step=6000] train_loss=3.3758  ppl=29.25  tok_acc=38.50%  tok/s=53,366
[Epoch 1 | step 7000/7267 | global_step=7000] train_loss=3.3815  ppl=29.42  tok_acc=38.44%  tok/s=53,365
— preview (LM, teacher-forced argmax) —
CTX: " and detection of vibrations underwater . compared to the harbor seal , the california sea lion 's vibrissae are smoother and less specialized and thus perform less when following hydrodynamic trails , although they sti

KeyboardInterrupt: 

In [None]:
from src.inference.generate_text import *


# Generate a continuation for a fixed prompt with the trained model and print it.
prompt = "are you happy?"
generated_text = generate(model, tokenizer, prompt)
print(generated_text)

 are you happy?able with your story 's storyline , if anything 's going on the run as ' what they did ' – and you 're getting too much
