In [9]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from tokenizers import decoders
import os

DATASET_NAME   = "Ankursingh/openwebtext_10K"
DATASET_CONFIG = "plain_text"   # config por defecto del dataset
VOCAB_SIZE = 16000         
MIN_FREQ  = 2
BLOCK_SIZE   = 256  # Ventana de contexto         
VAL_FRACTION   = 0.1
TOKENIZER_PATH = Path("owt10k_tokenizer.json")

CPU_COUNT   = os.cpu_count() or 2
BATCH_SIZE  = 64
NUM_WORKERS  = 2 if CPU_COUNT <= 2 else min(4, CPU_COUNT - 1)             
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def load_openwebtext10k():
    """
    Carga Ankursingh/openwebtext_10K que ya viene con splits train y val.
    """
    ds = load_dataset(DATASET_NAME)   
    train_ds = ds["train"]
    val_ds   = ds["val"]
    print(train_ds)
    print(val_ds)
    return train_ds, val_ds



def train_tokenizer(train_ds,
                    vocab_size=VOCAB_SIZE,
                    min_freq=MIN_FREQ,
                    save_path=TOKENIZER_PATH):
    """
    Entrena un tokenizer BPE estilo GPT (byte-level) sobre el split de entrenamiento.
    """
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFKC(),
        normalizers.Lowercase(),])

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel() 

    special_tokens = ["<unk>", "<pad>", "<bos>", "<eos>"]

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=special_tokens,)

    def batch_iterator():
        for ex in train_ds:
            txt = ex["text"]
            if txt is not None and len(txt.strip()) > 0:
                yield txt

    print("Entrenando tokenizer BPE...")
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
    print("Tamaño vocabulario:", tokenizer.get_vocab_size())

    save_path = Path(save_path)
    tokenizer.save(str(save_path))
    print(f"Tokenizer guardado en {save_path.resolve()}")

    return tokenizer
def load_or_train_tokenizer(train_ds):
    """
    Carga el tokenizer si ya existe en disco, si no lo entrena.
    """
    if TOKENIZER_PATH.exists():
        print(f"Cargando tokenizer desde {TOKENIZER_PATH}...")
        tokenizer = Tokenizer.from_file(str(TOKENIZER_PATH))
    else:
        tokenizer = train_tokenizer(train_ds)
    return tokenizer


class GPTTextDataset(Dataset):
    """
    Construye un dataset para LM autoregresivo:
      - Concatena todos los documentos en una secuencia larga de IDs.
      - Añade <eos> al final de cada documento.
      - Parte en chunks de longitud (BLOCK_SIZE + 1).
      - Input = ids[:-1], Target = ids[1:].
    """
    def __init__(self, hf_split, tokenizer, block_size=BLOCK_SIZE):
        super().__init__()
        self.block_size = block_size

        eos_id = tokenizer.token_to_id("<eos>")
        if eos_id is None:
            raise ValueError("El tokenizer no tiene token <eos>.")

        all_ids = []

        print("Tokenizando y concatenando textos...")
        for ex in hf_split:
            txt = ex["text"]
            if txt is None or len(txt.strip()) == 0:
                continue
            enc = tokenizer.encode(txt)
            # enc.ids es una lista de ints
            all_ids.extend(enc.ids + [eos_id])

        self.data = torch.tensor(all_ids, dtype=torch.long)
        print(f"Total de tokens en este split: {len(self.data):,}")

        n_tokens = len(self.data)
        chunk_len = block_size + 1
        n_chunks = n_tokens // chunk_len

        if n_chunks == 0:
            raise ValueError("Muy pocos tokens para formar un solo chunk. "
                             "Baja BLOCK_SIZE o usa más datos.")

        # Cortar a múltiplo exacto de chunk_len y reshape
        self.data = self.data[: n_chunks * chunk_len]
        self.data = self.data.view(n_chunks, chunk_len)

        # inputs y targets precomputados
        self.inputs  = self.data[:, :-1]  # [N, block_size]
        self.targets = self.data[:, 1:]   # [N, block_size]

        print(f"Número de secuencias: {len(self.inputs):,}")
        print(f"Forma inputs:  {self.inputs.shape}")
        print(f"Forma targets: {self.targets.shape}")

    def __len__(self):
        return self.inputs.size(0)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


def create_dataloaders(block_size=BLOCK_SIZE,
                       batch_size=BATCH_SIZE,
                       num_workers=NUM_WORKERS):
    train_hf, val_hf = load_openwebtext10k()
    tokenizer = load_or_train_tokenizer(train_hf)

    train_ds = GPTTextDataset(train_hf, tokenizer, block_size=block_size)
    val_ds   = GPTTextDataset(val_hf,   tokenizer, block_size=block_size)

    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True)

    val_loader = DataLoader(
        val_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True)

    return train_loader, val_loader, tokenizer



train_loader, val_loader, tokenizer = create_dataloaders()

x, y = next(iter(train_loader))
print("Batch x shape:", x.shape) 
print("Batch y shape:", y.shape) 


example_ids = x[0].tolist()
text = tokenizer.decode(example_ids)
print("Texto ejemplo (primer sample de x):")
print(text)

Dataset({
    features: ['text'],
    num_rows: 10000
})
Dataset({
    features: ['text'],
    num_rows: 4007
})
Cargando tokenizer desde owt10k_tokenizer.json...
Tokenizando y concatenando textos...
Total de tokens en este split: 11,175,296
Número de secuencias: 43,483
Forma inputs:  torch.Size([43483, 256])
Forma targets: torch.Size([43483, 256])
Tokenizando y concatenando textos...
Total de tokens en este split: 4,808,361
Número de secuencias: 18,709
Forma inputs:  torch.Size([18709, 256])
Forma targets: torch.Size([18709, 256])
Batch x shape: torch.Size([64, 256])
Batch y shape: torch.Size([64, 256])
Texto ejemplo (primer sample de x):
premier.ticketek.com.au

■ make beautiful music with elton john and his band

media_camera elton john performs in adelaide on january 28

the legendary sir elton john is playing all the hits from his brilliant career spanning five decades including songs from his classic album goodbye yellow brick road which recently celebrated its 40th anniversary.


In [10]:
from collections import Counter
import torch

def inspect_autoregressive_loader(train_loader, tokenizer, 
                                  num_batches=2,  # cuántos batches inspeccionar
                                  max_examples=3, # cuántas secuencias imprimir por batch
                                  max_tokens_print=40):  # cuántos tokens decodificar por ejemplo
    """
    Inspecciona un DataLoader de lenguaje autoregresivo (GPT-style).

    Supone que cada batch es:
        x: [B, T]  (inputs)
        y: [B, T]  (targets desplazados 1 a la derecha en el stream original)

    Hace:
    - Verificar qué tan cierto es que y[:, :-1] == x[:, 1:].
    - Analizar la distribución de y[:, 0] (primer token que el modelo debe predecir).
    - Imprimir ejemplos decodificados (input vs target) para entender el shift.
    """
    total_shift_positions = 0
    total_shift_matches   = 0

    first_input_ids  = []
    first_target_ids = []

    for b_idx, (x, y) in enumerate(train_loader):
        B, T = x.shape

        # Comprobar el desplazamiento: y[:, :-1] debería ser igual a x[:, 1:]
        shift_equal = (y[:, :-1] == x[:, 1:])  # [B, T-1] bool
        total_shift_matches   += shift_equal.sum().item()
        total_shift_positions += shift_equal.numel()

        # Guardar los primeros tokens de input y target para estadísticas
        first_input_ids.extend(x[:, 0].tolist())
        first_target_ids.extend(y[:, 0].tolist())

        # Imprimir algunos ejemplos para ver texto real
        print(f"\n=== Batch {b_idx} ===")
        for i in range(min(max_examples, B)):
            inp_ids = x[i].tolist()
            tgt_ids = y[i].tolist()

            inp_ids_short = inp_ids[:max_tokens_print]
            tgt_ids_short = tgt_ids[:max_tokens_print]

            inp_text = tokenizer.decode(inp_ids_short)
            tgt_text = tokenizer.decode(tgt_ids_short)

            print(f"\n--- Ejemplo {i} ---")
            print(f"Input IDs   (primeros {len(inp_ids_short)}): {inp_ids_short}")
            print(f"Target IDs  (primeros {len(tgt_ids_short)}): {tgt_ids_short}")
            print("Input texto (modelo VE):")
            print(repr(inp_text))
            print("Target texto (modelo DEBE predecir):")
            print(repr(tgt_text))

        if b_idx + 1 >= num_batches:
            break

    shift_ratio = total_shift_matches / total_shift_positions
    print("\n================== RESUMEN AUTORREGRESIVO ==================")
    print(f"Total posiciones comparadas (y[:, :-1] vs x[:, 1:]): {total_shift_positions}")
    print(f"Coincidencias: {total_shift_matches}")
    print(f"Proporción de coincidencia (ideal ~1.0): {shift_ratio:.6f}")

    # Estadísticas sobre la PRIMERA posición
    print("\nDistribución de primeros tokens (input vs target):")

    first_input_counts  = Counter(first_input_ids)
    first_target_counts = Counter(first_target_ids)

    def top_tokens(counter, name, k=10):
        print(f"\nTop {k} tokens más frecuentes en {name}:")
        for token_id, cnt in counter.most_common(k):
            text = tokenizer.decode([token_id])
            print(f"  id={token_id:5d} | freq={cnt:6d} | texto={repr(text)}")

    top_tokens(first_input_counts,  "x[:, 0]  (primer token que VE el modelo)")
    top_tokens(first_target_counts, "y[:, 0]  (primer token que DEBE predecir)")

    print("\n============================================================")

  
inspect_autoregressive_loader(train_loader, tokenizer,
                              num_batches=1,  
                              max_examples=2, 
                              max_tokens_print=50)


=== Batch 0 ===

--- Ejemplo 0 ---
Input IDs   (primeros 50): [138, 33, 2872, 401, 138, 138, 46, 294, 182, 505, 5295, 1505, 1704, 4402, 1209, 138, 138, 46, 294, 182, 505, 5295, 235, 1505, 1704, 335, 3488, 10375, 207, 2496, 3041, 4402, 17, 256, 2728, 943, 4617, 209, 9540, 3041, 4402, 3870, 234, 14057, 7731, 831, 4036, 209, 1456, 2571]
Target IDs  (primeros 50): [33, 2872, 401, 138, 138, 46, 294, 182, 505, 5295, 1505, 1704, 4402, 1209, 138, 138, 46, 294, 182, 505, 5295, 235, 1505, 1704, 335, 3488, 10375, 207, 2496, 3041, 4402, 17, 256, 2728, 943, 4617, 209, 9540, 3041, 4402, 3870, 234, 14057, 7731, 831, 4036, 209, 1456, 2571, 204]
Input texto (modelo VE):
"\n> learn more\n\neuler hermes north america insurance company\n\neuler hermes is north america's largest provider of trade credit insurance. we offer both domestic and export credit insurance policies that insure clients against commercial and political risk"
Target texto (modelo DEBE predecir):
"> learn more\n\neuler hermes north am

In [22]:
import torch
import torch.nn as nn

class GPT2Embeddings(nn.Module):
    """
    Embeddings estilo GPT-2:
      - token embeddings aprendidos
      - positional embeddings aprendidos
      - dropout opcional

    input_ids: LongTensor [B, T]
    return:    FloatTensor [B, T, d_model]
    """
    def __init__(self, vocab_size: int, d_model: int, block_size: int, dropout: float = 0.1):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)
        self.dropout = nn.Dropout(dropout)
        self.block_size = block_size
        self.d_model = d_model

    def forward(self, input_ids: torch.Tensor):
        """
        input_ids: [B, T] con IDs de tokens.
        """
        
        B, T = input_ids.shape
        if T > self.block_size:
            raise ValueError(f"Secuencia T={T} > block_size={self.block_size}")

        # posiciones [0, 1, ..., T-1]
        device = input_ids.device
        pos_ids = torch.arange(0, T, device=device).unsqueeze(0)  # [1, T]
        pos_ids = pos_ids.expand(B, T)  

        tok = self.tok_emb(input_ids)  # [B, T, d_model]
        pos = self.pos_emb(pos_ids)   
        x = tok + pos                  # [B, T, d_model]

        x = self.dropout(x)
        return x

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"

vocab_size = tokenizer.get_vocab_size()
d_model = 256
block_size = 256

emb = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1).to(device)

x_ids, y_ids = next(iter(train_loader))  
x_ids = x_ids.to(device)

x_emb = emb(x_ids)  # [B, T, d_model]
print(x_emb.shape)


torch.Size([64, 256, 256])


In [21]:
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None):
    """
    q: (..., Lq, d)
    k: (..., Lk, d)
    v: (..., Lk, dv)
    mask: broadcastable a (..., Lq, Lk)
          - bool: True = BLOQUEAR (poner -inf)
          - float: 1.0 = permitir, 0.0 = bloquear
    Returns:
        output: (..., Lq, dv)
        attn:   (..., Lq, Lk)

    """
    scores = torch.matmul(q, k.transpose(-2, -1))
    dk = q.size(-1)
    scores = scores / dk**0.5

    if mask is not None:
        # Normalizamos a un tensor float con -inf donde se bloquea
        if mask.dtype == torch.bool:
            scores = scores.masked_fill(mask, float("-inf"))
        else:
            # asumimos máscara en {0,1}: 0 = bloquear
            scores = scores.masked_fill(mask <= 0, float("-inf"))

    attn = F.softmax(scores, dim=-1)
    output = torch.matmul(attn, v)
    return output, attn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model debe ser múltiplo de num_heads"
        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, x):
        B, L, _ = x.shape
        return x.view(B, L, self.num_heads, self.d_head).transpose(1, 2)

    def _combine_heads(self, x):
        B, H, L, D = x.shape
        return x.transpose(1, 2).contiguous().view(B, L, H * D)

    def forward(self, x_q, x_kv, mask=None):
        #Primera proyeccion
        q = self._split_heads(self.w_q(x_q))
        k = self._split_heads(self.w_k(x_kv))
        v = self._split_heads(self.w_v(x_kv))


        if mask is not None:
        # Aceptamos:
        # (B, Lk), (B, Lq, Lk), (B, 1, Lq, Lk), (B, H, Lq, Lk)
          if mask.dim() == 2:
              mask = mask[:, None, None, :]

          elif mask.dim() == 3:
              mask = mask[:, None, :, :]

          elif mask.dim() == 4:
              pass # Ya funciona asi
          else:
              raise ValueError(f"Máscara con dims no soportadas: {mask.shape}")

        if mask.dtype != torch.bool:
            mask = (mask <= 0)

        # Aplicamos Atencion y concatenamos
        attn_out, _ = scaled_dot_product_attention(q, k, v, mask)
        attn_out = self._combine_heads(attn_out)

        # Proyeccion Final
        attn_out = self.w_o(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

class CausalSelfAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int, block_size: int, dropout: float = 0.1):
        super().__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads, dropout=dropout)
        self.block_size = block_size

        mask = torch.triu(torch.ones(block_size, block_size, dtype=torch.bool), diagonal=1)
        self.register_buffer("causal_mask", mask, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [B, T, d_model]
        """
        B, T, _ = x.shape
        if T > self.block_size:
            raise ValueError(f"T={T} > block_size={self.block_size}")

        # [T, T] -> [1, T, T] para que MultiHeadAttention lo trate como 'batch size = 1'
        mask = self.causal_mask[:T, :T].unsqueeze(0) 

        # Self-attention: q = k = v = x
        out = self.mha(x, x, mask=mask)   # [B, T, d_model]
        return out

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = tokenizer.get_vocab_size()
d_model  = 256
block_size = 256

emb  = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1).to(device)
attn = CausalSelfAttention(d_model, num_heads=8, block_size=block_size, dropout=0.1).to(device)

x_ids, y_ids = next(iter(train_loader))  # [B, T]
x_ids = x_ids.to(device)

x = emb(x_ids)  # pesos + input 
x = attn(x)     # [B, T, d_model]
print(x.shape) # [B, T, d_model] (self-attn causal)



torch.Size([64, 256, 256])


In [20]:
class GPT2MLP(nn.Module):
    """
    MLP posición a posición estilo GPT-2:
      Linear(d_model → d_ff) + GELU + Dropout + Linear(d_ff → d_model)
    """
    def __init__(self, d_model: int, d_ff = None, dropout: float = 0.1):
        super().__init__()
        if d_ff is None:
            d_ff = 4 * d_model  # típico en GPT

        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class GPT2Block(nn.Module):
    """
    Bloque GPT-2:
      x -> LN -> CausalSelfAttention -> +residual
      -> LN -> MLP -> +residual
    Pre-LN (estilo GPT-2 moderno).
    """
    def __init__(self, d_model: int, num_heads: int, block_size: int,
                 d_ff = None, dropout: float = 0.1, layernorm_eps: float = 1e-5):
      
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model, eps=layernorm_eps)
        self.ln_2 = nn.LayerNorm(d_model, eps=layernorm_eps)
        self.attn = CausalSelfAttention(d_model=d_model, num_heads=num_heads,
                                        block_size=block_size, dropout=dropout)
        self.mlp  = GPT2MLP(d_model=d_model, d_ff=d_ff, dropout=dropout)

    def forward(self, x: torch.Tensor):
      # Pre-LN + atención causal
      normalized_1 = self.ln_1(x)
      attn_output = self.attn(normalized_1)
      x = x + attn_output # residual conection
      
      # Pre-LN + MLP
      normalized_2 = self.ln_2(x)
      mlp_output = self.mlp(normalized_2)
      x = x + mlp_output # residual conection
    
      return x

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = tokenizer.get_vocab_size()
d_model = 256
block_size = 256
num_heads = 8
d_ff = 4 * d_model  # 1024

emb = GPT2Embeddings(vocab_size, d_model, block_size, dropout=0.1).to(device)
mlp = GPT2MLP(d_model, d_ff=d_ff, dropout=0.1).to(device)
block = GPT2Block(d_model, num_heads=num_heads, block_size=block_size, 
                  d_ff=d_ff, dropout=0.1).to(device)


x_ids, y_ids = next(iter(train_loader))  # [B, T]
x_ids = x_ids.to(device)

x = emb(x_ids)  # [B, T, d_model]
print(f"Después de embeddings: {x.shape}")

x_mlp = mlp(x)
print(f"Después de MLP: {x_mlp.shape}")  # [B, T, d_model]

x_block = block(x)
print(f"Después de GPT2Block: {x_block.shape}")  # [B, T, d_model]

print(f"¿Hay NaNs en la salida? {torch.isnan(x_block).any().item()}")



Después de embeddings: torch.Size([64, 256, 256])
Después de MLP: torch.Size([64, 256, 256])
Después de GPT2Block: torch.Size([64, 256, 256])
¿Hay NaNs en la salida? False


In [25]:
def init_gptmini_weights(model):
    with torch.no_grad():
        for name, module in model.named_modules():
            # Embeddings 
            if isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)

            # Linears
            elif isinstance(module, nn.Linear):
                if name == "lm_head":
                    # pesos pequeños
                    nn.init.normal_(module.weight, mean=0.0, std=0.02)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)
                else:
                    # Xavier 
                    nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)


class GPTMini(nn.Module):
    """
    GPT-2 'mini' decoder-only:
      - Embeddings (token + posición aprendida)
      - n_layers de GPT2Block
      - LayerNorm final
      - LM head (atado a los embeddings de token)
    """
    def __init__(self,
                 vocab_size: int,
                 block_size: int,
                 n_layer: int = 4,
                 n_head: int = 4,
                 d_model: int = 256,
                 dropout: float = 0.1,
                 layernorm_eps: float = 1e-5):
        
        super().__init__()

        self.vocab_size = vocab_size
        self.block_size = block_size
        self.d_model    = d_model

        self.emb = GPT2Embeddings(
            vocab_size=vocab_size,
            d_model=d_model,
            block_size=block_size,
            dropout=dropout)

        self.blocks = nn.ModuleList([
            GPT2Block(
                d_model=d_model,
                num_heads=n_head,
                block_size=block_size,
                d_ff=4*d_model,
                dropout=dropout,
                layernorm_eps=layernorm_eps,) for _ in range(n_layer)])

        self.ln_f = nn.LayerNorm(d_model, eps=layernorm_eps)

        # LM head: proyecta representaciones a logits de vocabulario
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

        # Weight tying: compartir pesos con embedding de tokens
        init_gptmini_weights(self)
        self.lm_head.weight = self.emb.tok_emb.weight

    def forward(self, idx: torch.Tensor, targets = None):
      """
      idx:     [B, T] con IDs de tokens de entrada
      targets: [B, T] con IDs objetivo (shifted) o None

      Returns:
        logits: [B, T, vocab_size]
        loss:   escalar (si targets no es None), sino None
      """
      B, T = idx.shape
      if T > self.block_size:
          raise ValueError(f"Secuencia demasiado larga: T={T}, block_size={self.block_size}")

      # Embeddings token + posición
      x = self.emb(idx)  # [B, T, d_model]

      # Pasar por los bloques GPT-2
      for block in self.blocks:
          x = block(x)    # [B, T, d_model]

      # LayerNorm final
      x = self.ln_f(x)   # [B, T, d_model]

      # LM head -> logits
      logits = self.lm_head(x)  # [B, T, vocab_size]

      loss = None
      if targets is not None:
          # Cross-entropy autoregresiva
          logits_flat  = logits.view(-1, self.vocab_size)
          targets_flat = targets.view(-1)
          # Opcional: ignorar padding tokens si usas ignore_index
          loss = F.cross_entropy(logits_flat, targets_flat)

      return logits, loss

In [None]:
vocab_size = tokenizer.get_vocab_size()
block_size = 256  

model = GPTMini(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    d_model=256,
    dropout=0.1).to(device)

x_ids, y_ids = next(iter(train_loader))  # [B, T]
x_ids = x_ids.to(device)
y_ids = y_ids.to(device)

logits, loss = model(x_ids, y_ids)
print("logits shape:", logits.shape)  # [B, T, vocab_size]
print("loss:", loss.item())

---

In [26]:
def token_acc(logits: torch.Tensor, targets: torch.Tensor) -> float:
    """
    logits:  [B, T, V]
    targets: [B, T]
    return: accuracy escalar en [0,1]
    """
    preds = logits.argmax(dim=-1)       # [B, T]
    correct = (preds == targets).sum().item()
    total   = targets.numel()
    return correct / total


class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler):
    """
    Warmup lineal + decaimiento coseno:
      - durante 'warmup_steps': lr sube lineal desde 0 hasta base_lr
      - después: decae con coseno hasta ~0 en 'max_steps'
    """
    def __init__(self, optimizer, warmup_steps: int, max_steps: int, last_epoch: int = -1):
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self._step_num = 0
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        self._step_num += 1
        step = self._step_num

        if step <= self.warmup_steps:
            # Warmup lineal: 0 -> 1
            scale = step / float(max(1, self.warmup_steps))
        else:
            # Cosine decay de 1 -> 0
            progress = (step - self.warmup_steps) / float(
                max(1, self.max_steps - self.warmup_steps))
            
            # cos(pi * 0) = 1, cos(pi * 1) = -1  ⇒  scale va de 1 -> 0
            scale = 0.5 * (1.0 + math.cos(math.pi * progress))

        return [base_lr * scale for base_lr in self.base_lrs]


def create_optimizer_and_scheduler(
    model,
    base_lr: float,
    weight_decay: float,
    warmup_steps: int,
    max_steps: int):
  
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=base_lr,
        betas=(0.9, 0.95),
        weight_decay=weight_decay)

    scheduler = WarmupCosineScheduler(
        optimizer=optimizer,
        warmup_steps=warmup_steps,
        max_steps=max_steps)
    
    return optimizer, scheduler

In [27]:
import inspect
from contextlib import contextmanager, nullcontext
import torch

_DTYPE_MAP = {
    "bf16": torch.bfloat16,
    "bfloat16": torch.bfloat16,
    "fp16": torch.float16,
    "float16": torch.float16,}


def _cuda_dtype_supported(dtype: torch.dtype) -> bool:
    if not torch.cuda.is_available():
        return False

    return dtype in (torch.bfloat16, torch.float16)


def make_grad_scaler(device: str = "cuda", enabled: bool = True):
    """
    Devuelve un GradScaler compatible con tu versión de PyTorch.
    - Si AMP no está habilitado, devuelve None.
    - Soporta torch.amp.GradScaler('cuda'|'cpu') o sin args.
    """
    if not enabled:
        return None

    if hasattr(torch, "amp") and hasattr(torch.amp, "GradScaler"):
        try:
            sig = inspect.signature(torch.amp.GradScaler)
            if len(sig.parameters) >= 1:
                return torch.amp.GradScaler(device if device in ("cuda", "cpu") else "cuda")
            else:
                return torch.amp.GradScaler()
        except Exception:
            pass

    if hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "GradScaler"):
        return torch.cuda.amp.GradScaler()

    return None

@contextmanager
def autocast_ctx(
    device: str = "cuda",
    enabled: bool = True,
    dtype: str = "bf16",    
    cache_enabled: bool = True):
  
    """
    Contexto robusto para autocast:
      - CUDA: torch.amp.autocast(device_type="cuda", dtype=...)
      - CPU:  torch.amp.autocast(device_type="cpu",  dtype=torch.bfloat16) si enabled
      - fallback: nullcontext().

    Notas:
      * En BF16 NO uses GradScaler.
      * En FP16 sí puedes usar GradScaler (torch.amp.GradScaler / torch.cuda.amp.GradScaler).
    """
    if not enabled:
        with nullcontext():
            yield
        return

    if device == "cuda":
        want = _DTYPE_MAP.get(dtype.lower(), torch.bfloat16)
        use = want if _cuda_dtype_supported(want) else torch.float16
        with torch.amp.autocast(device_type="cuda", dtype=use, cache_enabled=cache_enabled):
            yield
        return

    if device == "cpu":
        try:
            with torch.amp.autocast(device_type="cpu", dtype=torch.bfloat16, cache_enabled=cache_enabled):
                yield
        except Exception:
            # fallback seguro si el backend no soporta cpu autocast
            with nullcontext():
                yield
        return

    with nullcontext():
        yield


In [35]:
import math
import time
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F

def train_gpt_lm(
    model,
    train_loader,
    val_loader=None,
    *,
    epochs: int = 10,
    base_lr: float = 3e-4,
    weight_decay: float = 0.01,
    warmup_steps: int = 2000,   # en steps, no epochs
    label_smoothing: float = 0.0, 
    grad_clip: float | None = 1.0,
    device: str = "cuda",
    ckpt_path: str = "gptmini_best.pt",
    log_every: int = 100,
    preview_every: int | None = None,
    id2tok_fn=None,             # callable: List[int] -> str
    amp_enabled: bool = True,
    amp_dtype: str = "bf16",    
    val_checking: bool = False,     # por defecto NO se hace validación
    save_ckpt_every: int | None = None,  # si no hay val, guardar cada N epochs
):
    """
    Entrena un modelo GPTMini (decoder-only LM) sobre (x, y) donde:
      - x: [B, T]  ids de entrada
      - y: [B, T]  ids objetivo (shifted)

    - Usa AdamW + WarmupCosineScheduler
    - CrossEntropy con label_smoothing opcional (sin PAD)
    - Gradient clipping
    - AMP (bf16/fp16) usando autocast_ctx y make_grad_scaler
    - Si val_checking=True y val_loader no es None:
        * corre validación y guarda mejor checkpoint por val_loss
      Si val_checking=False:
        * NO corre validación; puede guardar checkpoint cada 'save_ckpt_every' epochs
    """
    device = torch.device(device)
    torch.set_float32_matmul_precision("high")
    model.to(device)
    model.train()

    # Estimar total de steps (para el scheduler)
    total_steps = epochs * len(train_loader)
    
    optimizer, scheduler = create_optimizer_and_scheduler(
        model,base_lr=base_lr,weight_decay=weight_decay,
        warmup_steps=warmup_steps,max_steps=total_steps,)

    # Loss con y sin smoothing
    ce_train = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    ce_eval  = nn.CrossEntropyLoss(label_smoothing=0.0)

    # AMP: GradScaler sólo si estamos en fp16; en bf16 normalmente no hace falta
    use_scaler = amp_enabled and (amp_dtype.lower() in ("fp16", "float16"))
    scaler = make_grad_scaler(device="cuda" if device.type == "cuda" else "cpu",
                              enabled=use_scaler)

    best_val = float("inf")
    history = {
        "train_loss": [], "val_loss": [],
        "train_ppl":  [], "val_ppl":  [],
        "train_tok_acc": [], "val_tok_acc": []}

    global_step = 0

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_loss_sum, epoch_tokens = 0.0, 0
        epoch_acc_sum = 0.0
        t0 = time.time()

        for it, (x_ids, y_ids) in enumerate(train_loader, start=1):
            global_step += 1
            x_ids = x_ids.to(device, non_blocking=True)
            y_ids = y_ids.to(device, non_blocking=True)
            B, T = x_ids.shape
            tokens = B * T

            optimizer.zero_grad(set_to_none=True)

            with autocast_ctx(device=device.type, enabled=amp_enabled, dtype=amp_dtype):
                # Usamos la loss interna del modelo
                logits, loss = model(x_ids, y_ids)   # [B, T, V], escalar
                if loss is None:
                    raise RuntimeError("GPTMini.forward debe devolver loss si targets != None")

            if scaler is not None:
                # Asegurarnos de que loss es escalar
                if loss.dim() > 0:
                    loss = loss.mean()
            
                scaler.scale(loss).backward()
                if grad_clip is not None:
                    scaler.unscale_(optimizer)
                    clip_grad_norm_(model.parameters(), grad_clip)
                scaler.step(optimizer)
                scaler.update()
            else:
                # Asegurarnos de que loss es escalar
                if loss.dim() > 0:
                    loss = loss.mean()
            
                loss.backward()
                if grad_clip is not None:
                    clip_grad_norm_(model.parameters(), grad_clip)
                optimizer.step()

            if scheduler is not None:
                scheduler.step()

            with torch.no_grad():
                acc = token_acc(logits, y_ids)

            epoch_loss_sum += loss.item() * tokens
            epoch_acc_sum  += acc * tokens
            epoch_tokens   += tokens

            if it % log_every == 0:
                avg_loss = epoch_loss_sum / max(1, epoch_tokens)
                avg_ppl  = math.exp(avg_loss)
                avg_acc  = epoch_acc_sum / max(1, epoch_tokens)
                tok_per_sec = epoch_tokens / (time.time() - t0 + 1e-9)
                print(f"[Epoch {epoch} | step {it:4d}/{len(train_loader)} | global_step={global_step}] "
                      f"train_loss={avg_loss:.4f}  ppl={avg_ppl:.2f}  "
                      f"tok_acc={avg_acc*100:.2f}%  tok/s={tok_per_sec:,.0f}")


            # Preview LM (teacher forcing, argmax)
            if (preview_every is not None) and (id2tok_fn is not None) and (it % preview_every == 0):
                with torch.no_grad():
                    preds = logits.argmax(dim=-1)  # [B, T]
                    b0 = 0

                    in_ids  = x_ids[b0].tolist()
                    tgt_ids = y_ids[b0].tolist()
                    pred_ids= preds[b0].tolist()

                    max_show = min(80, len(in_ids))
                    in_ids   = in_ids[:max_show]
                    tgt_ids  = tgt_ids[:max_show]
                    pred_ids = pred_ids[:max_show]

                    ctx = id2tok_fn(in_ids)
                    ref = id2tok_fn(tgt_ids)
                    hyp = id2tok_fn(pred_ids)

                    print("— preview (LM, teacher-forced argmax) —")
                    print("CTX:", repr(ctx))
                    print("REF:", repr(ref))
                    print("HYP:", repr(hyp))

        
        # Fin de epoch: promedios train
        train_loss = epoch_loss_sum / max(1, epoch_tokens)
        train_ppl  = math.exp(train_loss)
        train_acc  = epoch_acc_sum / max(1, epoch_tokens)

        history["train_loss"].append(train_loss)
        history["train_ppl"].append(train_ppl)
        history["train_tok_acc"].append(train_acc * 100.0)


        # SIN VALIDACIÓN (val_checking == False o val_loader is None)
        if (not val_checking) or (val_loader is None):
            print(f"Epoch {epoch} done | "
                  f"train_loss={train_loss:.4f}  train_ppl={train_ppl:.2f}  "
                  f"train_tok_acc={train_acc*100:.2f}%")

            # Guardar checkpoint cada 'save_ckpt_every' 
            if save_ckpt_every is not None and (epoch % save_ckpt_every == 0):
                model_to_save = model.module if hasattr(model, "module") else model
                torch.save({
                    "model_state": model_to_save.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "epoch": epoch}, ckpt_path)
                
                print(f"Guardado checkpoint (cada {save_ckpt_every} epochs) -> {ckpt_path}")

            continue

        # CON VALIDACIÓN (val_checking == True y val_loader no es None) 
        model.eval()
        val_loss_sum, val_tokens = 0.0, 0
        val_acc_sum = 0.0

        with torch.no_grad():
            for x_ids, y_ids in val_loader:
                x_ids = x_ids.to(device, non_blocking=True)
                y_ids = y_ids.to(device, non_blocking=True)
                B, T = x_ids.shape
                tokens = B * T

                with autocast_ctx(device=device.type, enabled=amp_enabled, dtype=amp_dtype):
                    logits, _ = model(x_ids, None)
                    V = logits.size(-1)
                    loss = ce_eval(
                        logits.view(B * T, V),
                        y_ids.view(B * T))

                acc = token_acc(logits, y_ids)

                val_loss_sum += loss.item() * tokens
                val_acc_sum  += acc * tokens
                val_tokens   += tokens

        val_loss = val_loss_sum / max(1, val_tokens)
        val_ppl  = math.exp(val_loss)
        val_acc  = val_acc_sum / max(1, val_tokens)

        history["val_loss"].append(val_loss)
        history["val_ppl"].append(val_ppl)
        history["val_tok_acc"].append(val_acc * 100.0)

        print(f"Epoch {epoch} done | "
              f"train_loss={train_loss:.4f}  train_ppl={train_ppl:.2f}  train_tok_acc={train_acc*100:.2f}%  "
              f"val_loss={val_loss:.4f}    val_ppl={val_ppl:.2f}    val_tok_acc={val_acc*100:.2f}%")

        # Guardar mejor checkpoint por val_loss
        if val_loss < best_val:
            best_val = val_loss
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save({
                    "model_state": model_to_save.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "epoch": epoch,
                    "val_loss": val_loss,
                }, ckpt_path)
            
            print(f"Guardado checkpoint (best val_loss={val_loss:.4f}) -> {ckpt_path}")

    return history


In [47]:
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

import torch, gc
del model
gc.collect()
torch.cuda.empty_cache()

In [37]:
vocab_size = tokenizer.get_vocab_size()
block_size = 256

device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPTMini(
    vocab_size=vocab_size,
    block_size=block_size,  
    n_layer=8,             
    n_head=8,            
    d_model=512,dropout=0.1,).to(device)

if torch.cuda.device_count() > 1:
    print(f"Usando {torch.cuda.device_count()} GPUs con DataParallel")
    model = torch.nn.DataParallel(model)
    use_dataparallel = True

def id2tok_fn(ids):
    return tokenizer.decode(ids)


history = train_gpt_lm(
    model,
    train_loader,
    val_loader=val_loader,
    epochs=10,
    base_lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    label_smoothing=0.1,
    grad_clip=1.0,
    device=device,
    ckpt_path="gptmini_owt10k.pt",
    log_every=150,
    preview_every=500,
    id2tok_fn=id2tok_fn,
    amp_enabled=True,
    amp_dtype="fp16",   
    val_checking = False , save_ckpt_every = 3)

Usando 2 GPUs con DataParallel
[Epoch 1 | step  150/680 | global_step=150] train_loss=8.9753  ppl=7905.66  tok_acc=3.49%  tok/s=62,364
[Epoch 1 | step  300/680 | global_step=300] train_loss=8.2705  ppl=3906.99  tok_acc=5.50%  tok/s=63,175
[Epoch 1 | step  450/680 | global_step=450] train_loss=7.8397  ppl=2539.43  tok_acc=6.93%  tok/s=63,386
— preview (LM, teacher-forced argmax) —
CTX: " eldemar's royal treasury. their priorities during this time consisted of investigating the invasion forces, looking for possible signs of red robe, trying to track down the rest of the missing key pieces and figuring out some way to leave the time loop. of course, since actually retrieving even the known pieces of the key was impossible with their current skills, and they had no idea what kind"
REF: "emar's royal treasury. their priorities during this time consisted of investigating the invasion forces, looking for possible signs of red robe, trying to track down the rest of the missing key pieces and f

In [46]:
history = train_gpt_lm(
    model,
    train_loader,
    val_loader=val_loader,
    epochs=15,
    base_lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    label_smoothing=0.1,
    grad_clip=1.0,
    device=device,
    ckpt_path="gptmini_owt10k.pt",
    log_every=150,
    preview_every=1000,
    id2tok_fn=id2tok_fn,
    amp_enabled=True,
    amp_dtype="fp16",   
    val_checking = False , save_ckpt_every = 8)

[Epoch 1 | step  150/680 | global_step=150] train_loss=4.2893  ppl=72.91  tok_acc=27.91%  tok/s=62,028
[Epoch 1 | step  300/680 | global_step=300] train_loss=4.2883  ppl=72.84  tok_acc=27.93%  tok/s=63,176
[Epoch 1 | step  450/680 | global_step=450] train_loss=4.2959  ppl=73.40  tok_acc=27.85%  tok/s=63,260
[Epoch 1 | step  600/680 | global_step=600] train_loss=4.3012  ppl=73.79  tok_acc=27.78%  tok/s=63,290
Epoch 1 done | train_loss=4.3020  train_ppl=73.85  train_tok_acc=27.78%
[Epoch 2 | step  150/680 | global_step=830] train_loss=4.2913  ppl=73.06  tok_acc=27.82%  tok/s=63,453
[Epoch 2 | step  300/680 | global_step=980] train_loss=4.2999  ppl=73.69  tok_acc=27.75%  tok/s=63,623
[Epoch 2 | step  450/680 | global_step=1130] train_loss=4.3094  ppl=74.40  tok_acc=27.65%  tok/s=63,628
[Epoch 2 | step  600/680 | global_step=1280] train_loss=4.3149  ppl=74.80  tok_acc=27.60%  tok/s=63,583
Epoch 2 done | train_loss=4.3175  train_ppl=75.00  train_tok_acc=27.58%


KeyboardInterrupt: 

In [45]:
def generate(model, tokenizer, prompt, max_new_tokens=30, temperature=1.0, top_k=50, device="cuda"):
    model.eval()

    ids = tokenizer.encode(prompt, add_special_tokens=False).ids
    x = torch.tensor([ids], dtype=torch.long, device=device)

    block_size = model.module.block_size if hasattr(model, "module") else model.block_size

    for _ in range(max_new_tokens):
        x_cond = x[:, -block_size:]

        # forward
        logits, _ = model(x_cond, None)
        logits = logits[:, -1, :] / temperature

        # top-k truncation
        if top_k is not None:
            values, _ = torch.topk(logits, top_k)
            logits[logits < values[:, [-1]]] = -float("inf")

        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)

        x = torch.cat([x, next_id], dim=1)

    return tokenizer.decode(x[0].tolist())


prompt = "whats your name?"
print(generate(model, tokenizer, prompt))

 whats your name? and if it's not an answer, you could see yourself in the future in order to find yourself.

and that's not true.

