In [None]:
# ============================
#Install dependencies
# ============================
!pip install ipython jupyter matplotlib notebook numpy pandas tqdm scikit-learn scipy plotly spacy textacy tiktoken torch evaluate transformers rouge-score bert-score

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting jedi>=0.16 (from ipython)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.3.6-py3-none-any.whl.metadata (16 kB)
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting floret~=0.10.0 (from textacy)
  Downloading flore

In [None]:
# ============================
#Decoding
# ============================
import json
import torch

def decode_tokens(token_ids, vocab):
    return ''.join(vocab[token_id] for token_id in token_ids)

In [None]:
# ===================================
#Define dataset helper class
# ===================================
import torch

class Dataset:
    def __init__(self, data_tensor, context_size, batch_size, split_factor=0.9):
        """
        data_tensor: A 1D torch tensor containing the entire dataset of token IDs.
        context_size: Number of tokens of context for each training example.
        batch_size: How many examples per batch.
        split_factor: Fraction of data to use for train vs. val.
        """
        self.context_size = context_size
        self.batch_size = batch_size
        self.data = data_tensor
        assert 0 < split_factor < 1
        n = int(len(self.data) * split_factor)
        self.train_data, self.val_data = self.data[:n], self.data[n:]

    def get_batch(self, split, device):
        data = self.train_data if split == 'train' else self.val_data
        ix = torch.randint(len(data) - self.context_size, (self.batch_size,))
        x = torch.stack([data[i : i + self.context_size] for i in ix])
        y = torch.stack([data[i + 1 : i + self.context_size + 1] for i in ix])
        x, y = x.to(device), y.to(device)
        return x, y

In [None]:
# ==================================
# Define GPT model (gpt.py)
# ==================================
import torch
from torch import nn
import torch.nn.functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)
        # compute attention scores
        wei = q @ k.transpose(-2,-1) * C**-0.5  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # weighted aggregation
        v = self.value(x) # (B,T,head_size)
        out = wei @ v     # (B,T,head_size)
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size):
        super().__init__()
        self.heads = nn.ModuleList([
            Head(head_size, n_embd, context_size) for _ in range(num_heads)
        ])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.ReLU(),
            nn.Linear(n_embd * 4, n_embd),
            nn.Dropout(0.2),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=32, context_size=8, n_head=4, n_layer=4):
        super().__init__()
        self.context_size = context_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head, context_size=context_size) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def generate(self, start_idx, number_of_tokens):
        """
        Autoregressive generation, token by token.
        """
        idx = start_idx
        for _ in range(number_of_tokens):
            idx_cond = idx[:, -self.context_size:]  # crop to last context_size tokens
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_embeddings = self.token_embedding_table(idx)  # (B, T, n_embd)
        pos_embeddings = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embeddings + pos_embeddings
        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

In [None]:
# ============================================
# Training/evaluation logic (train.py)
# ============================================
import torch
import numpy as np

@torch.no_grad()
def estimate_loss(dataset, model, eval_iters=100):
    device = next(model.parameters()).device
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = dataset.get_batch(split, device)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Metrics:
    """
    Compute PPL, ROUGE, and BERTScore on a random batch of generated outputs vs. references.
    """
    def __init__(self, vocab):
        import evaluate
        self.rouge = evaluate.load("rouge")
        self.bertscore = evaluate.load("bertscore")
        self.vocab = vocab  # for decoding (optional)

    @torch.no_grad()
    def __call__(self, dataset, model):
        device = next(model.parameters()).device
        x, y = dataset.get_batch('val', device)
        model.eval()

        _, loss = model(x, y)
        perplexity = torch.exp(loss).item()

        # Generate up to 32 tokens for each sample in the batch
        gen_x = model.generate(x, 32)

        if self.vocab is not None:
            # Convert tokens back to text for evaluation
            generated_texts = [''.join(self.vocab[t] for t in seq.cpu().numpy()) for seq in gen_x]
            reference_texts = [''.join(self.vocab[t] for t in seq.cpu().numpy()) for seq in y]
        else:
            # If no vocab, just treat them as strings of tokens
            generated_texts = [str(seq.cpu().tolist()) for seq in gen_x]
            reference_texts = [str(seq.cpu().tolist()) for seq in y]

        rouge_results = self.rouge.compute(predictions=generated_texts, references=reference_texts)
        bertscore_results = self.bertscore.compute(
            predictions=generated_texts,
            references=reference_texts,
            lang="en"  # set your language
        )

        model.train()
        return {
            "perplexity": perplexity,
            "rouge1": rouge_results["rouge1"].item(),
            "rougeL": rouge_results["rougeL"].item(),
            "bertscore": np.mean(bertscore_results["f1"]).item()
        }

def train_loop(dataset, model, vocab, steps, report_frequency, lr, metrics_enabled):
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    metrics = Metrics(vocab) if metrics_enabled else None

    for step in range(steps):
        xb, yb = dataset.get_batch('train', device)
        _, loss = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Reporting
        if step % report_frequency == 0 or step == steps - 1:
            losses = estimate_loss(dataset, model, eval_iters=50)
            print(f"Step {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}", flush=True)
            if metrics:
                metrics_dict = metrics(dataset, model)
                print("Metrics:", metrics_dict, flush=True)
            print()

In [None]:
# -------------------------------------------------------------
# helper function that mirrors the "train.py"
# -------------------------------------------------------------
def run_main(
    tokens_file="tokens_10k.json",
    vocab_file="vocab1.json",
    seed=None,
    context_size=256,
    batch_size=32,
    n_embd=384,
    n_head=6,
    n_layer=6,
    dropout=0.2,
    command="train",
    save="model.pth",
    epochs=1000,
    report=100,
    lr=1e-3,
    no_metrics=False,
    load="model.pth",
    prompt=None,
    token_count=100
):
    """
    command: "train" or "eval"
    - "train" trains the model using tokens from `tokens_file`.
    - "eval" loads the trained model (from `load`) and generates text.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Pick device
    device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
    if device == "cpu":
        print("WARNING: Running on CPU (this may be slow)")

    # Load token IDs (the entire dataset)
    with open(tokens_file, "r", encoding="utf-8") as f:
        data_ids = json.load(f)  # list of integers

    data_tensor = torch.tensor(data_ids, dtype=torch.long)

    # Load  vocab, to get vocab_size and decode
    with open(vocab_file, "r", encoding="utf-8") as f:
        vocab = json.load(f)  # list of strings, indices align with data_ids
    vocab = vocab[:505]
    vocab_size = len(vocab)

    # Create dataset
    dataset = Dataset(data_tensor, context_size, batch_size)

    # Create model
    model = GPTLanguageModel(
        vocab_size=vocab_size,
        n_embd=n_embd,
        context_size=context_size,
        n_head=n_head,
        n_layer=n_layer
    )
    model = model.to(device)

    print(f"Total parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")
    print(f"Using device: {device}\n")

    if command == "eval":
        print("=" * 20, "INFERENCE", "=" * 20)
        model.load_state_dict(torch.load(load, map_location=device))
        model.eval()
    elif command == "train":
        print("=" * 20, "TRAINING", "=" * 20)
        train_loop(dataset, model, vocab, epochs, report, lr, metrics_enabled=(not no_metrics))
        torch.save(model.state_dict(), save)
        print("=" * 50)

    # Generate text from the model (if in eval mode)
    if command == "eval":
        if prompt:
            # Convert  prompt from strings to token IDs
            prompt_ids = []

            try:
                single_id = int(prompt)
                prompt_ids = [single_id]
            except:
                # If we cannot parse it as int, just ignore
                prompt_ids = []

            if not prompt_ids:
                prompt_ids = [0]  # fallback to a single token

            context = torch.tensor([prompt_ids], dtype=torch.long, device=device)
        else:
            # If no prompt, start with a single 0 token
            context = torch.zeros((1, 1), dtype=torch.long, device=device)

        # Generate
        generated_tokens = model.generate(context, token_count)[0].tolist()
        print("=== Generated Tokens ===")
        print(generated_tokens)
        print("=== Generated Text (if vocab is textual) ===")
        gen_text = decode_tokens(generated_tokens, vocab)
        print(gen_text)
    else:
        # After training, show a quick sample
        context = torch.zeros((1, 1), dtype=torch.long, device=device)
        gen_tokens = model.generate(context, 300)[0].tolist()
        print("=== Sample tokens after training ===")
        print(gen_tokens)
        # Decode it
        gen_text = decode_tokens(gen_tokens, vocab)
        print("=== Sample text after training ===")
        print(gen_text)

    return model

In [None]:
# ===========================
# usage
# ===========================
model = run_main(
    tokens_file="/content/drive/MyDrive/tokens_500.json",  #pre-converted token list
    vocab_file="/content/drive/MyDrive/vocab1.json",   #vocab for size & decode
    seed=42,
    context_size=768,
    batch_size=32,
    n_embd=768,
    n_head=6,
    n_layer=6,
    dropout=0.2,
    command="train",
    save="model.pth",
    epochs=3000,  
    report=200,   
    lr=1e-3,
    no_metrics=False
)

Total parameters: 43.88M
Using device: cuda



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Step 0, train loss: 6.9526, val loss: 6.8986


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics: {'perplexity': 997.9855346679688, 'rouge1': 0.9835644150339942, 'rougeL': 0.9835742975615389, 'bertscore': 0.9835342951118946}

Step 200, train loss: 3.6568, val loss: 3.7599
Metrics: {'perplexity': 44.07033157348633, 'rouge1': 0.9745257666565602, 'rougeL': 0.9743965253766038, 'bertscore': 0.9839182682335377}

Step 400, train loss: 3.4541, val loss: 3.5981
Metrics: {'perplexity': 35.52888107299805, 'rouge1': 0.9738985419475712, 'rougeL': 0.9739468975378227, 'bertscore': 0.9844260886311531}

Step 600, train loss: 2.7463, val loss: 3.0040
Metrics: {'perplexity': 20.036231994628906, 'rouge1': 0.9736890362944335, 'rougeL': 0.9737381344392387, 'bertscore': 0.9865731857717037}

Step 800, train loss: 2.3320, val loss: 2.6524
Metrics: {'perplexity': 13.74896240234375, 'rouge1': 0.975971574264201, 'rougeL': 0.9757856571215604, 'bertscore': 0.9878612626343966}

Step 1000, train loss: 2.1577, val loss: 2.5033
Metrics: {'perplexity': 12.018451690673828, 'rouge1': 0.9754817777739575, 'roug

In [8]:
model

GPTLanguageModel(
  (token_embedding_table): Embedding(505, 768)
  (position_embedding_table): Embedding(768, 768)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=768, out_features=128, bias=False)
            (query): Linear(in_features=768, out_features=128, bias=False)
            (value): Linear(in_features=768, out_features=128, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_aff

In [9]:
torch.save(model.state_dict(), '/content/drive/MyDrive/model2.pth')

FINE TUNING!!

In [None]:
# =========================
#Install dependencies
# =========================
#Some other packages that may be needed:
# !pip install ipython jupyter matplotlib notebook numpy pandas tqdm scikit-learn \
#              scipy plotly spacy textacy tiktoken torch evaluate transformers \
#              rouge-score bert-score

import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [None]:
# ================================================
#Define model + training classes
# ================================================

class Dataset:
    def __init__(self, data_tensor, context_size, batch_size, split_factor=0.9):
        """
        data_tensor: A 1D torch tensor containing the entire dataset of token IDs.
        context_size: Number of tokens of context for each training example.
        batch_size: How many examples per batch.
        split_factor: Fraction of data to use for train vs. val.
        """
        self.context_size = context_size
        self.batch_size = batch_size
        self.data = data_tensor
        assert 0 < split_factor < 1
        n = int(len(self.data) * split_factor)
        self.train_data, self.val_data = self.data[:n], self.data[n:]

    def get_batch(self, split, device):
        data = self.train_data if split == 'train' else self.val_data
        ix = torch.randint(len(data) - self.context_size, (self.batch_size,))
        x = torch.stack([data[i : i + self.context_size] for i in ix])
        y = torch.stack([data[i + 1 : i + self.context_size + 1] for i in ix])
        x, y = x.to(device), y.to(device)
        return x, y

class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)
        # compute attention scores
        wei = q @ k.transpose(-2,-1) * C**-0.5  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # weighted aggregation
        v = self.value(x) # (B,T,head_size)
        out = wei @ v     # (B,T,head_size)
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size):
        super().__init__()
        self.heads = nn.ModuleList([
            Head(head_size, n_embd, context_size) for _ in range(num_heads)
        ])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.ReLU(),
            nn.Linear(n_embd * 4, n_embd),
            nn.Dropout(0.2),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=32, context_size=8, n_head=4, n_layer=4):
        super().__init__()
        self.context_size = context_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head, context_size=context_size) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def generate(self, start_idx, number_of_tokens):
        """
        Autoregressive generation, token by token.
        """
        idx = start_idx
        for _ in range(number_of_tokens):
            idx_cond = idx[:, -self.context_size:]  # crop to last context_size tokens
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_embeddings = self.token_embedding_table(idx)  # (B, T, n_embd)
        pos_embeddings = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embeddings + pos_embeddings
        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

# Simple perplexity-based loss estimator
@torch.no_grad()
def estimate_loss(dataset, model, eval_iters=100):
    device = next(model.parameters()).device
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = dataset.get_batch(split, device)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def train_loop(dataset, model, steps, report_frequency, lr):
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for step in range(steps):
        xb, yb = dataset.get_batch('train', device)
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Reporting
        if step % report_frequency == 0 or step == steps - 1:
            losses = estimate_loss(dataset, model, eval_iters=50)
            print(f"Step {step} => train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}", flush=True)

# Master function that trains or evaluates
def run_main(
    tokens_file="tokens.json",
    vocab_file="vocab.json",
    load="",
    save="model_finetuned.pth",
    seed=42,
    context_size=256,
    batch_size=32,
    n_embd=384,
    n_head=6,
    n_layer=6,
    command="train",
    epochs=1000,
    report=200,
    lr=1e-3
):
    if seed is not None:
        torch.manual_seed(seed)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load the new token IDs
    with open(tokens_file, "r", encoding="utf-8") as f:
        data_ids = json.load(f)
    data_tensor = torch.tensor(data_ids, dtype=torch.long)

    # Load your pruned vocab
    with open(vocab_file, "r", encoding="utf-8") as f:
        vocab = json.load(f)
    vocab_size = len(vocab)
    print(f"Vocab size: {vocab_size}, #tokens in dataset: {len(data_ids)}")

    # Create dataset
    dataset = Dataset(data_tensor, context_size, batch_size)

    # Create or load model
    model = GPTLanguageModel(
        vocab_size=vocab_size,
        n_embd=n_embd,
        context_size=context_size,
        n_head=n_head,
        n_layer=n_layer
    ).to(device)

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model has {total_params/1e6:.2f}M parameters.")

    if load:
        print(f"Loading model weights from '{load}'...")
        model.load_state_dict(torch.load(load, map_location=device))

    if command == "train":
        print("Starting fine-tuning...")
        train_loop(dataset, model, steps=epochs, report_frequency=report, lr=lr)
        torch.save(model.state_dict(), save)
        print(f"Fine-tuned model saved to '{save}'.")
    else:
        print("Command not recognized or we are in eval mode. Implement if needed.")

In [None]:
# ===========================================================================================
# CODE TO SAVE  CURRENT MODEL, CREATE A PRUNED VOCAB, ADD SPECIAL TOKENS, AND FINE-TUNE
# ===========================================================================================
def save_and_finetune():

    print("Saving current in-notebook 'model' to 'model.pth' ...")
    try:
      torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")
      print("Model saved successfully.\n")
    except:
      print("Can't save model")

    # ---------------------------
    # Build new pruned vocabulary
    # ---------------------------

    with open("/content/drive/MyDrive/vocab1.json", "r", encoding="utf-8") as vf:
        original_vocab = json.load(vf)  # list of tokens

    # Prune to first 500
    pruned_vocab = original_vocab[:500]

    # Build a dict from token -> index
    encode_dict = { pruned_vocab[i]: i for i in range(len(pruned_vocab)) }

    # Now add special tokens at the next available IDs
    encode_dict["HERO:"] = 500
    encode_dict["VILLIAN:"] = 501
    encode_dict["MOD:"] = 502
    encode_dict["<|END_HERO|>"] = 503     # end of hero text
    encode_dict["<|SEP|>"] = 504          # a separator for x / y or between examples

    #  build the final list (decode array) from the dictionary
    # Sort by the encode_dict value
    final_vocab_size = max(encode_dict.values()) + 1
    final_vocab = [None]*final_vocab_size
    for token_str, token_id in encode_dict.items():
        final_vocab[token_id] = token_str

    # Save the new vocab
    with open("pruned_vocab.json", "w", encoding="utf-8") as f:
        json.dump(final_vocab, f, ensure_ascii=False)
    print(f"Saved pruned + special token vocab to 'pruned_vocab.json'.")
    print(f"New vocab size = {final_vocab_size}.\n")

        # --------------------------------------------------
    # Build a single token list from 'ft_training.json'
    # which is a list of { x: [tokens], y: [tokens] }
    # We'll insert <|SEP|> between x and y, and another
    # <|SEP|> to separate different examples.
    # --------------------------------------------------
    with open("/content/drive/MyDrive/ft_training.json", "r", encoding="utf-8") as f:
        finetune_data = json.load(f)

    big_token_list = []
    sep_id = encode_dict["<|SEP|>"]
    for obj in finetune_data:
        x_ids = obj["x"]
        y_ids = obj["y"]

        big_token_list.extend(x_ids)
        big_token_list.append(sep_id)
        big_token_list.extend(y_ids)
        big_token_list.append(sep_id)

    # Save that big token list
    with open("fine_tune_tokens.json", "w", encoding="utf-8") as f:
        json.dump(big_token_list, f)

    print(f"Built fine-tuning dataset (length {len(big_token_list)} tokens) and saved to 'fine_tune_tokens.json'.\n")

        # --------------------------------------------------
    # 4) Fine-tune the model on 'fine_tune_tokens.json'
    #    loading from 'model.pth' and saving to 'model_finetuned.pth'
    # --------------------------------------------------
    run_main(
        tokens_file="fine_tune_tokens.json",
        vocab_file="pruned_vocab.json",
        load="/content/drive/MyDrive/model.pth",
        save="/content/drive/MyDrive/model_finetuned.pth",
        seed=42,
        context_size=768,  
        batch_size=32,
        n_embd=768,
        n_head=6,
        n_layer=6,
        command="train",
        epochs=200,    
        report=200,        
        lr=1e-4,            #Smaller lr
    )
    print("Fine-tuning complete!\n")

In [None]:
# =========================================
#  run the save_and_finetune()
# =========================================
save_and_finetune()

Saving current in-notebook 'model' to 'model.pth' ...
Can't save model
Saved pruned + special token vocab to 'pruned_vocab.json'.
New vocab size = 505.

Built fine-tuning dataset (length 64269444 tokens) and saved to 'fine_tune_tokens.json'.

Using device: cuda
Vocab size: 505, #tokens in dataset: 64269444
Model has 43.88M parameters.
Loading model weights from '/content/drive/MyDrive/model.pth'...
Starting fine-tuning...
Step 0 => train loss: 1.9344, val loss: 2.1462
Step 199 => train loss: 1.6389, val loss: 1.9527
Fine-tuned model saved to '/content/drive/MyDrive/model_finetuned.pth'.
Fine-tuning complete!



In [14]:
from google.colab import runtime
runtime.unassign()