# Python imports

In [None]:
# generic libraries
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# torch-related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# hugging face dataset downloader
import datasets
# tokenizer (OpenAI's tiktoken)
import tiktoken
# show progress bar in loop
from tqdm.auto import tqdm
# colab download file
from google.colab import files

# Loading Data

In [None]:
# Load the TinyStories dataset from the Hugging Face Hub. The resulting
# DatasetDict has a 'train' and a 'validation' split, each with a 'text' column.
ds = datasets.load_dataset("roneneldan/TinyStories")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [None]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


# Tokenizer

In [None]:
# Use tiktoken's ready-made "gpt2" encoding (no tokenizer is trained in this notebook)
enc = tiktoken.get_encoding("gpt2")

# Parameters

## Device

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Report the chosen device and the installed PyTorch build
print(f'Using device: {device}')
print(f'PyTorch version: {torch.__version__}')

Using device: cpu
PyTorch version: 2.9.0+cpu


## Hyperparameters

In [None]:

# Model hyperparameters
class Config:
    """Hyperparameters read by the model definition, the data pipeline and the training loop."""
    # Vocabulary
    vocab_size = enc.n_vocab   # Vocabulary size reported by the tokenizer
    n_embd = 384               # NOTE(review): same value as d_model and not read in the cells visible here - confirm it is still needed
    # Model architecture
    d_model = 384          # Width of the token/positional embeddings and of every transformer block
    n_heads = 6            # Number of attention heads
    n_layers = 6           # Number of transformer blocks ("o:4" in the earlier note presumably meant "original value: 4" - TODO confirm)
    d_ff = 1024             # Hidden width of the feed-forward network
    dropout = 0.1          # Dropout rate

    # Training
    block_size = 128        # Maximum context length (also the size of the positional embedding table)
    batch_size = 32        # Sequences per training batch (also reused as the map batch size when chunking)
    learning_rate = 3e-4   # Initial optimizer learning rate; the training loop overwrites it from the schedule
    max_iters = 100       # Training iterations ("o:3000" presumably meant "original value: 3000" - TODO confirm)
    eval_interval = 100    # Evaluate every N iterations (the loop also evaluates on the last iteration)
    eval_iters = 50       # Number of batches averaged per split for each evaluation

# Single shared instance used throughout the notebook
config = Config()

## Learning rate

In [None]:
def auto_lr(iter, i1=1000, v1=2e-4, i2=20000, v2=5e-5):
  """
  Learning-rate schedule: cosine-shaped warm-up, cosine-shaped decay, then a constant floor.

  Args:
      iter: Current training iteration (0-based).
      i1: Iteration at which the warm-up ends and the peak value `v1` is reached.
      v1: Peak learning rate. It is passed to arccos, so it must lie in [-1, 1].
      i2: Iteration at which the decay reaches `v2`.
      v2: Final learning rate, returned for every iteration >= i2.
          `v2 / v1` must lie in [-1, 1] for the same arccos reason.

  Returns:
      The learning rate to use at `iter`.
  """
  if iter < i1:
    # Warm-up: the angular step is chosen so that the curve equals v1 at
    # iter == i1 and v1 * v1 (close to 0 for small learning rates) at iter == 0.
    beta = np.arccos(v1) / i1
    return v1 * np.cos(beta * (iter - i1))
  elif iter < i2:
    # Decay: follow a cosine arc from v1 (at i1) down to v2 (at i2).
    beta = np.arccos(v2 / v1) / (i2 - i1)
    return v1 * np.cos(beta * (iter - i1))
  else:
    # Past the end of the schedule: stay at the final value. Returning v1 here
    # would make the rate jump back to its peak right after the decay finished.
    return v2

# Tabulate the schedule once for every training iteration: the warm-up covers
# the first tenth of training and the decay ends at the last iteration.
lr_iter = [
    auto_lr(k, i1=config.max_iters//10, i2=config.max_iters, v1=4e-4, v2=1e-4)
    for k in range(config.max_iters)
]

# Plot the schedule so its shape can be checked before training
figure = px.line(
    x=range(config.max_iters),
    y=lr_iter,
    labels={'x': 'Iteration', 'y': 'Learning Rate'},
    title='Learning Rate',
)
figure.show()

# Model Components

## Multi-head Attention
Model component: causal multi-head self-attention (MHA).

In [None]:
class MultiHeadAttention(nn.Module):
    """
    Causal multi-head self-attention.

    Every position attends only to itself and to earlier positions; links to
    later positions are masked out before the softmax.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        # Width of one head
        self.d_k = d_model // n_heads

        # Query / key / value projections, each covering all heads at once
        # (bias switched to True because it is supposedly better)
        self.W_q = nn.Linear(d_model, d_model, bias=True)
        self.W_k = nn.Linear(d_model, d_model, bias=True)
        self.W_v = nn.Linear(d_model, d_model, bias=True)

        # Projection applied after the heads are merged back together
        self.W_o = nn.Linear(d_model, d_model, bias=True)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: Input tensor [batch_size, seq_len, d_model]

        Returns:
            Output tensor [batch_size, seq_len, d_model]
        """
        n_batch, n_tokens, width = x.shape
        per_head = (n_batch, n_tokens, self.n_heads, self.d_k)

        # Project, then move the head axis forward:
        # [batch, seq, d_model] -> [batch, seq, heads, d_k] -> [batch, heads, seq, d_k]
        queries = self.W_q(x).view(per_head).transpose(1, 2)
        keys = self.W_k(x).view(per_head).transpose(1, 2)
        values = self.W_v(x).view(per_head).transpose(1, 2)

        # Scaled dot-product scores: [batch, heads, seq, seq]
        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Entry (row, col) is True when the key position lies after the query position
        positions = torch.arange(n_tokens, device=x.device)
        is_future = positions.unsqueeze(0) > positions.unsqueeze(1)
        scores = scores.masked_fill(is_future, float('-inf'))

        # Normalise over the key axis, then apply dropout to the attention weights
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: [batch, heads, seq, d_k]
        mixed = weights @ values

        # Merge the heads back into one vector per position
        merged = mixed.transpose(1, 2).contiguous().view(n_batch, n_tokens, width)

        return self.W_o(merged)

## Feed-Forward Network
Model component: position-wise feed-forward network.

In [None]:
class FeedForward(nn.Module):
    """
    Position-wise MLP applied to every token independently:
    expand to d_ff, apply GELU, project back to d_model, then dropout.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        stages = [
            nn.Linear(d_model, d_ff),    # expand
            nn.GELU(),                   # non-linearity
            nn.Linear(d_ff, d_model),    # project back
            nn.Dropout(dropout),         # regularise the block output
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        return out

## Transformer Block
Model component: transformer block (attention and feed-forward sub-layers, each with a residual connection).

In [None]:
class TransformerBlock(nn.Module):
    """
    One pre-norm transformer layer.

    Two sub-layers are applied in sequence, each fed a layer-normalised copy of
    its input and added back onto that input:
    1. multi-head attention
    2. feed-forward network
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # Attention sub-layer and the norm applied to its input
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        # Feed-forward sub-layer and the norm applied to its input
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff, dropout)

    def forward(self, x):
        # Normalise first, run the sub-layer, then add the residual
        attended = self.attn(self.ln1(x))
        x = x + attended

        transformed = self.ff(self.ln2(x))
        return x + transformed

# Complete transformer model

In [None]:
class GPTModel(nn.Module):
    """
    A minimal GPT-style transformer for next token prediction.

    Architecture:
    1. Token embeddings + learned positional embeddings
    2. Stack of transformer blocks
    3. Final layer norm
    4. Linear head (weights shared with the token embedding) to predict the next token
    """

    def __init__(self, vocab_size, config):
        """
        Args:
            vocab_size: Number of distinct token ids.
            config: Object providing d_model, n_heads, n_layers, d_ff, dropout
                and block_size.
        """
        super().__init__()
        self.config = config

        # Token embeddings
        self.token_embedding = nn.Embedding(vocab_size, config.d_model)

        # Positional embeddings (learned); one row per position, so inputs
        # longer than block_size cannot be embedded
        self.pos_embedding = nn.Embedding(config.block_size, config.d_model)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(config.d_model, config.n_heads, config.d_ff, config.dropout)
            for _ in range(config.n_layers)
        ])

        # Final layer norm
        self.ln_f = nn.LayerNorm(config.d_model)

        # Language modeling head
        self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)

        # Weight tying (share weights between token embeddings and lm_head)
        self.token_embedding.weight = self.lm_head.weight

        # Initialize weights
        self.apply(self._init_weights)

        # Report number of parameters
        n_params = sum(p.numel() for p in self.parameters())
        print(f"Number of parameters: {n_params/1e6:.2f}M")

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Args:
            idx: Input token indices [batch_size, seq_len]; seq_len must not
                exceed config.block_size
            targets: Target token indices [batch_size, seq_len] (optional)

        Returns:
            logits: Output logits [batch_size, seq_len, vocab_size]
            loss: Cross-entropy loss (if targets provided), otherwise None
        """
        batch_size, seq_len = idx.shape

        # Token embeddings
        tok_emb = self.token_embedding(idx)  # [batch_size, seq_len, d_model]

        # Positional embeddings
        pos = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        pos_emb = self.pos_embedding(pos)  # [seq_len, d_model]

        # Add embeddings
        x = tok_emb + pos_emb  # Broadcasting happens automatically

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x)

        # Final layer norm
        x = self.ln_f(x)

        # Language modeling head
        logits = self.lm_head(x)  # [batch_size, seq_len, vocab_size]

        # Compute loss if targets are provided
        loss = None
        if targets is not None:
            # Flatten batch and time for cross-entropy. reshape (not view) is
            # required: targets are often a column slice of a larger tensor
            # (e.g. batch[:, 1:]), which is non-contiguous and makes view raise
            # "view size is not compatible with input tensor's size and stride".
            B, T, C = logits.shape
            logits_flat = logits.reshape(B * T, C)
            targets_flat = targets.reshape(B * T)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate new tokens autoregressively.

        The method does not change the train/eval mode of the model; call
        model.eval() beforehand if dropout should be disabled while sampling.

        Args:
            idx: Starting sequence [batch_size, seq_len]
            max_new_tokens: Number of tokens to generate
            temperature: Sampling temperature (higher = more random); the
                logits are divided by it, so it must be non-zero
            top_k: If set, only sample from top k tokens

        Returns:
            Generated sequence [batch_size, seq_len + max_new_tokens]
        """
        for _ in range(max_new_tokens):
            # Crop context if needed (to fit block_size)
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]

            # Forward pass
            logits, _ = self(idx_cond)

            # Focus on last time step
            logits = logits[:, -1, :] / temperature  # [batch_size, vocab_size]

            # Optionally crop logits to only top k options: everything below
            # the k-th largest logit of each row gets zero probability
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append to sequence
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Instantiate the model

In [None]:
# Create model
model = GPTModel(config.vocab_size, config).to(device)

# Print model architecture
print(model)

Number of parameters: 27.63M
GPTModel(
  (token_embedding): Embedding(50257, 384)
  (pos_embedding): Embedding(128, 384)
  (blocks): ModuleList(
    (0-5): 6 x TransformerBlock(
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=384, out_features=384, bias=True)
        (W_k): Linear(in_features=384, out_features=384, bias=True)
        (W_v): Linear(in_features=384, out_features=384, bias=True)
        (W_o): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1024, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1024, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (ln_f): LayerNorm((3

In [None]:
# Generate from untrained model
# The context is a single token with id 0; 200 further tokens are sampled from
# the 10 most likely candidates at each step.
# NOTE(review): model.eval() has not been called at this point, so dropout is
# presumably still active during this sampling run - confirm whether intended.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=200, temperature=1.0, top_k=10)
print('\n=== Untrained Model Generation ===')
print(enc.decode(generated[0].tolist()))
print('=' * 50)


=== Untrained Model Generation ===
!akuya IR relational3333 section section scenic section sellers advise disposal bachelor unpredictable bacheloralia no sellersdam skip unpredictable unpredictable doctr 227ules proteinAllen doctrAllen Petraprivatearnzhouarn unpredictable sequencesShamhasAllen price SadlyMove Frost scorerules WATCHED WATCHED skiparn1arnlipliplip no Alecwitz Frost apparelthey doctrAM Yaz sequences sequences sequences VIP PythAllen democr collideloving MSM guarantees doctrARDS againsthangeen Sadly Toyota honors Restrictaintaint honors honors Petra Third vomizainals Cyn bab� initiative sidebar nicer carbonheed unrealistic simplifiedacs Scenes UFOs canoeт Wil savage honorsAllen price honors vomarchives Stamp pudding 237 KapWant raises folk savage library Gilbert redes against Kap protagonistsalla undecided Hem honorstracking Ox supplieditta honors vom vom Fenprefix Murdrunsitar Tact Tact poisons unforgettable upwards cannedonomy you loose Cout you you you WilStat loose lo

# Train setup

## Optimizer

In [None]:
# AdamW over all model parameters. config.learning_rate is only the initial
# value: the training loop overwrites the learning rate from lr_iter on every step.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

## Split training data

In [None]:
ds["train"]

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [None]:
# Raw story texts per split. The 'validation' split of the dataset is used as
# the test set throughout this notebook.
train_stories = ds["train"]["text"]
test_stories = ds["validation"]["text"]

# Alias kept so existing references to all_stories keep working; reusing
# train_stories avoids reading the 'text' column of the train split twice.
all_stories = train_stories

# Story counts. n_test used to be len(all_stories), i.e. the *training* count.
n_stories = len(all_stories)
n_train = len(train_stories)
n_test = len(test_stories)

## Tokenize whole dataset

In [None]:
def tokenize_function(examples):
    """
    Encode one batch of stories.

    Args:
        examples: Batch dict whose 'text' entry is a list of strings.

    Returns:
        Dict with a single key 'input_ids': one list of integer token ids per text.
    """
    encoded = []
    for text in examples['text']:
        encoded.append(enc.encode(text))
    return {'input_ids': encoded}

# 1. Tokenize everything
# The raw 'text' column is dropped, so only 'input_ids' remains afterwards.
tokenized_ds = ds.map(
    tokenize_function,
    remove_columns=['text'],
    batched=True,
    num_proc=8,  # Use more cores for speed
)


Map (num_proc=8):   0%|          | 0/2119719 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/21990 [00:00<?, ? examples/s]

In [None]:
# print length of train and test data
print(f"Train data length: {len(train_stories)} stories")
print(f"Test data length: {len(test_stories)} stories")

Train data length: 2119719 stories
Test data length: 21990 stories


## Flattening and chunking

In [None]:

# 2. Flatten and Chunk everything
def group_texts(tokenized_texts):
    """
    Concatenate the stories of one batch and cut the result into fixed-size chunks.

    Each chunk holds block_size + 1 token ids. Tokens left over at the end of
    the batch that do not fill a whole chunk are dropped.

    Args:
        tokenized_texts: Batch dict whose 'input_ids' entry is a list of token-id lists.

    Returns:
        Dict with key "input_ids": a list of chunks, each of length block_size + 1.
    """
    window = config.block_size + 1

    # One long token stream for the whole batch
    stream = []
    for story in tokenized_texts['input_ids']:
        stream.extend(story)

    # Keep only complete windows
    n_windows = len(stream) // window
    chunks = [stream[w * window:(w + 1) * window] for w in range(n_windows)]
    return {"input_ids": chunks}

# Apply the grouping
# map() calls group_texts once per batch of config.batch_size stories
chunked_ds = tokenized_ds.map(
    group_texts,
    num_proc=8,
    batch_size=config.batch_size,
    batched=True,
)


Map (num_proc=8):   0%|          | 0/2119719 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/21990 [00:00<?, ? examples/s]

In [None]:
# 3. Select the Train and Test data
# chunked_ds already contains both splits; its 'validation' split is used as test data.
train_data = chunked_ds['train']
test_data = chunked_ds['validation']

# NOTE: these lengths count rows of chunked_ds (fixed-size token chunks), not
# stories, even though the messages below say "stories".
print(f"Train data length: {len(train_data)} stories")
print(f"Test data length: {len(test_data)} stories")

Train data length: 3625009 stories
Test data length: 36430 stories


## Batching

In [None]:
def get_batch_sr(split, config):
    """
    Sample a random batch of (input, target) sequences from the pre-chunked data.

    Args:
        split: 'train' selects train_data; any other value selects test_data.
        config: Object providing batch_size.

    Returns:
        (x, y): two contiguous tensors on `device`, each with batch_size rows.
        y is x shifted by one position, i.e. y[:, t] is the token that follows x[:, t].
    """
    data = train_data if split == 'train' else test_data
    # Randomly select row indices for the batch
    ix = torch.randint(len(data), (config.batch_size,))

    # Grab the rows from the pre-chunked dataset and stack them into one tensor
    batch_data = torch.tensor([data[i.item()]['input_ids'] for i in ix])

    # x is the sequence, y is the sequence shifted by one.
    # Column slices are non-contiguous views of batch_data; make them contiguous
    # so that downstream code may flatten them with .view() without raising
    # "view size is not compatible with input tensor's size and stride".
    x = batch_data[:, :-1].contiguous()
    y = batch_data[:, 1:].contiguous()

    # Move to device (GPU/MPS/CPU)
    x, y = x.to(device), y.to(device)

    return x, y

In [None]:
# old versions

def get_batch_r(split, config, iter=0):
  """
  Legacy batch sampler: random windows of block_size consecutive items.

  NOTE(review): the slicing below (data[i:i+block_size]) looks written for a
  flat sequence of token ids. train_data/test_data are now taken from
  chunked_ds, so this helper presumably no longer works as-is - confirm
  before reusing it.

  Args:
      split: 'train' or 'test'; any other value makes the function return None.
      config: Object providing block_size and batch_size.
      iter: Unused; apparently kept so the signature matches get_batch_d.

  Returns:
      (x, y) moved to `device`, where y is x shifted by one position,
      or None for an unknown split.
  """
  if split == 'train':
    data = train_data
  elif split == 'test':
    data = test_data
  else : return None
  # randomly select one starting position per batch element
  ix = torch.randint(len(data) - config.block_size, (config.batch_size,))
  # input windows and target windows (targets start one position later)
  x = torch.stack([torch.tensor(data[i:i+config.block_size]) for i in ix])
  y = torch.stack([torch.tensor(data[i+1:i+config.block_size+1]) for i in ix])
  # send to device
  x, y = x.to(device), y.to(device)
  return x, y

def get_batch_d(split, config, iter):
  """
  Legacy batch sampler: the start position is derived from the iteration number.

  NOTE(review): the slicing below (data[i:i+block_size]) looks written for a
  flat sequence of token ids. train_data/test_data are now taken from
  chunked_ds, so this helper presumably no longer works as-is - confirm
  before reusing it.

  Args:
      split: 'train' or 'test'; any other value makes the function return None.
      config: Object providing block_size and batch_size.
      iter: Current iteration; sets the start position of the window.

  Returns:
      (x, y) moved to `device`, where y is x shifted by one position,
      or None for an unknown split.
  """
  if split == 'train':
    data = train_data
    len_data = len(train_data)
  elif split == 'test':
    data = test_data
    len_data = len(test_data)
  else : return None
  # start position (the random variant is kept below for reference)
  #ix = torch.randint(len(data) - config.block_size, (config.batch_size,))
  # NOTE(review): every entry of ix holds the same value,
  # (iter * batch_size) % len_data, so all rows of the batch are identical
  # windows - confirm whether consecutive offsets were intended.
  ix = torch.ones(config.batch_size, dtype=torch.int64) * (iter * config.batch_size)%len_data
  # input windows and target windows (targets start one position later)
  x = torch.stack([torch.tensor(data[i:i+config.block_size]) for i in ix])
  y = torch.stack([torch.tensor(data[i+1:i+config.block_size+1]) for i in ix])
  # send to device
  x, y = x.to(device), y.to(device)
  return x, y

In [None]:
# Sanity-check the batching: draw one training batch, then print the shapes and
# the decoded first row of the inputs and of the targets (targets are the
# inputs shifted by one token).
print("="*40)
xb, yb = get_batch_sr('train', config)
print('inputs:')
print(xb.shape)
print(enc.decode(xb[0].tolist()))
print('targets:')
print(yb.shape)
print(enc.decode(yb[0].tolist()))
print("="*40)


inputs:
torch.Size([32, 128])
 They stay friends with everyone. They play sport with joy.Sara and Tom were friends who liked to play on the beach. They made sand castles, dug holes, and collected shells. One day, they found a big shiny shell that looked like a rainbow. They both wanted it for themselves.

"Give me the shell, Sara. It is mine. I saw it first," Tom said.

"No, Tom. It is mine. I picked it up first," Sara said.

They pulled and tugged at the shell, but neither of them would let go. They started to argue and shout at
targets:
torch.Size([32, 128])
 stay friends with everyone. They play sport with joy.Sara and Tom were friends who liked to play on the beach. They made sand castles, dug holes, and collected shells. One day, they found a big shiny shell that looked like a rainbow. They both wanted it for themselves.

"Give me the shell, Sara. It is mine. I saw it first," Tom said.

"No, Tom. It is mine. I picked it up first," Sara said.

They pulled and tugged at the shell, b

# Training

In [None]:
@torch.no_grad()
def estimate_loss(model, config):
    """
    Estimate the mean loss on the train and test data.

    For each split, config.eval_iters random batches are drawn with
    get_batch_sr and their losses are averaged. The model is evaluated in eval
    mode and put back into whatever mode it was in before the call, even when
    a batch raises.

    Args:
        model: Model called as model(X, Y) and returning (logits, loss).
        config: Object providing eval_iters (and whatever get_batch_sr reads).

    Returns:
        Dict with keys 'train' and 'test', each a 0-d tensor holding the mean loss.
    """
    out = {}
    # Remember the caller's mode instead of unconditionally forcing train() on exit
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'test']:
            losses = torch.zeros(config.eval_iters)
            for k in range(config.eval_iters):
                X, Y = get_batch_sr(split, config)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Without this, an exception during evaluation leaves the model in eval mode
        model.train(was_training)
    return out

In [None]:
# (From Gemini) Mount Google Drive so that files can be saved to it live during training
from google.colab import drive
drive.mount('/content/drive')
import os

# Folder on the mounted Drive that receives the model description and checkpoints
drive_path = '/content/drive/MyDrive/colab_model_checkpoints/'
os.makedirs(drive_path, exist_ok=True)
# NOTE: this message is printed even when the folder already existed (exist_ok=True)
print(f"Created directory: {drive_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Created directory: /content/drive/MyDrive/colab_model_checkpoints/


In [None]:
# Save a text description of the model.
# A random 4-digit tag is put into every file name written by this run.
random_n_model = np.random.randint(1000, 10000)
with open(drive_path+f'model_{random_n_model}.txt', 'w') as f:
    f.write(str(model))
print("Saved model description")
# Optional browser download (disabled): files.download(f'model_{random_n_model}.txt')


# Training loop
train_losses = []
val_losses = []
iterations = []

print('Starting training...')
print(f'Training for {config.max_iters} iterations')
print(f'Evaluating every {config.eval_interval} iterations\n')

# NOTE: the loop variable keeps its original name `iter`, which shadows the
# Python builtin of the same name in the notebook namespace.
for iter in tqdm(range(config.max_iters)):
    # Evaluate loss periodically (and always on the last iteration)
    if iter % config.eval_interval == 0 or iter == config.max_iters - 1:
        losses = estimate_loss(model, config)
        print(f"Step {iter}: train loss {losses['train']:.4f}, test loss {losses['test']:.4f}")
        # Store plain Python floats so the history can be plotted directly
        train_losses.append(float(losses['train']))
        val_losses.append(float(losses['test']))
        iterations.append(iter)

    # Sample a batch
    xb, yb = get_batch_sr('train', config)

    # Forward pass
    logits, loss = model(xb, yb)

    # Backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # Apply the scheduled learning rate to every parameter group before stepping
    for group in optimizer.param_groups:
        group['lr'] = lr_iter[iter]
    optimizer.step()
    # Periodic checkpoint every 100 iterations (never at iteration 0)
    if iter % 100 == 0 and iter != 0:
        torch.save(model.state_dict(), drive_path + f'model_{random_n_model}_i{iter}.pt')
        print("Model saved !")
        # Optional browser download (disabled): files.download(f'model_{random_n_model}_i{iter}.pt')

# Always keep the final weights: with max_iters <= 100 the periodic checkpoint
# above never triggers, so the trained model would otherwise not be saved at all.
torch.save(model.state_dict(), drive_path + f'model_{random_n_model}_final.pt')
print("Model saved !")

print('\nTraining complete!')

Saved model description
Downloaded model description
Starting training...
Training for 100 iterations
Evaluating every 100 iterations



  0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

# Results

## Basic

In [None]:
# Plot the recorded train and test losses against the iteration at which each was measured
figure = px.line(
    x=iterations,
    y=[train_losses, val_losses],
    labels={'x': 'Iteration', 'y': 'Loss'},
    title='Training and Validation Loss',
)
figure.show()

In [None]:
# Generate from trained model
# Switch to eval mode first so that dropout is disabled while sampling
model.eval()
# Same setup as the untrained run: the context is a single token with id 0,
# this time followed by 500 sampled tokens (top-10 sampling, temperature 1.0)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500, temperature=1.0, top_k=10)
print('\n=== Trained Model Generation ===')
print(enc.decode(generated[0].tolist()))
print('=' * 50)

## Advanced

In [None]:
# Text prompts to continue with the trained model
prompts = ["One day, looking over the horizon",
           "In the deep forest of Mirkwood",
           "The sea"]
# Generated token tensors, one per prompt, in the same order as `prompts`
answers = []

for prompt in prompts :
  # Encode the prompt and add a batch dimension: shape [1, n_prompt_tokens]
  p_enc = enc.encode(prompt)
  context = torch.tensor(p_enc, dtype=torch.long, device=device).unsqueeze(0)
  # Sample 100 further tokens (top-10 sampling, temperature 1.0)
  generated = model.generate(context, max_new_tokens=100, temperature=1.0, top_k=10)
  answers.append(generated)
  # The decoded text contains the prompt followed by its continuation
  print('\n=== Trained Model Generation ===')
  print(enc.decode(generated[0].tolist()))
  print('=' * 50)

##