<a href="https://colab.research.google.com/github/oak-hu/oak-hu.github.io/blob/dependabot%2Fgithub_actions%2Fci-dependencies-cb3525d1d8/sim_williamson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import Counter
import re
import requests
from tqdm import tqdm
import pickle
import time

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Download text
url = "https://oak-hu.github.io/Introduction.txt"
response = requests.get(url)
text = response.text
print(f"Downloaded {len(text):,} characters")

# Hyperparameters
VOCAB_SIZE = 2048  # Target vocab size for BPE
batch_size = 64
block_size = 64
max_iters = 10000
eval_interval = 250
learning_rate = 1e-4
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2

torch.manual_seed(42)
np.random.seed(42)

class BytePairEncoder:
    def __init__(self, vocab_size=2048):
        self.vocab_size = vocab_size
        self.byte_to_token = {}
        self.token_to_byte = {}
        self.merges = []

    def get_stats(self, ids):
        """Count frequency of adjacent pairs"""
        counts = Counter()
        for pair in zip(ids, ids[1:]):
            counts[pair] += 1
        return counts

    def merge(self, ids, pair, idx):
        """Merge all occurrences of pair into idx"""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i+1]) == pair:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train(self, text, verbose=True):
        """Train BPE on text"""
        # Start with byte-level tokens
        tokens = list(text.encode('utf-8'))

        # Initialize vocab with single bytes
        for i in range(256):
            self.byte_to_token[i] = i
            self.token_to_byte[i] = bytes([i])

        num_merges = self.vocab_size - 256
        ids = list(tokens)

        for i in tqdm(range(num_merges), desc="Training BPE"):
            stats = self.get_stats(ids)
            if not stats:
                break

            # Find most frequent pair
            pair = max(stats, key=stats.get)
            idx = 256 + i

            # Perform merge
            ids = self.merge(ids, pair, idx)
            self.merges.append((pair, idx))

            # Update token mappings
            self.token_to_byte[idx] = self.token_to_byte[pair[0]] + self.token_to_byte[pair[1]]

            if verbose and i % 100 == 0:
                compression_ratio = len(tokens) / len(ids)
                print(f"Merge {i}: {pair} -> {idx}, compression: {compression_ratio:.2f}x")

        # Build reverse mapping
        self.byte_to_token = {v: k for k, v in self.token_to_byte.items() if k < self.vocab_size}

        print(f"Final vocab size: {len(self.token_to_byte)}")
        print(f"Compression ratio: {len(tokens) / len(ids):.2f}x")

    def encode(self, text):
        """Encode text to tokens"""
        tokens = list(text.encode('utf-8'))

        # Apply merges in order
        for pair, idx in self.merges:
            tokens = self.merge(tokens, pair, idx)

        return tokens

    def decode(self, tokens):
        """Decode tokens back to text"""
        byte_list = []
        for token in tokens:
            if token in self.token_to_byte:
                byte_list.extend(self.token_to_byte[token])

        return bytes(byte_list).decode('utf-8', errors='replace')

# Data loading function
def get_batch():
   # Get batch from the full training data
   ix = torch.randint(len(train_data) - block_size, (batch_size,))
   x = torch.stack([train_data[i:i+block_size] for i in ix])
   y = torch.stack([train_data[i+1:i+block_size+1] for i in ix])
   x, y = x.to(device), y.to(device)
   return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters=50):
   # Evaluate loss on the full training data (no validation split)
   out = {}
   model.eval()
   losses = torch.zeros(eval_iters)
   for k in range(eval_iters):
       X, Y = get_batch()
       logits, loss = model(X, Y)
       losses[k] = loss.item()
   out['train'] = losses.mean()
   out['val'] = losses.mean() # Still report 'val' for consistency, using train data
   model.train()
   return out

# Model components - simplified
class Head(nn.Module):
   def __init__(self, head_size):
       super().__init__()
       self.key = nn.Linear(n_embd, head_size, bias=False)
       self.query = nn.Linear(n_embd, head_size, bias=False)
       self.value = nn.Linear(n_embd, head_size, bias=False)
       self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

   def forward(self, x):
       B,T,C = x.shape
       k = self.key(x)
       q = self.query(x)
       wei = q @ k.transpose(-2,-1) * C**-0.5
       wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
       wei = F.softmax(wei, dim=-1)
       v = self.value(x)
       out = wei @ v
       return out

class MultiHeadAttention(nn.Module):
   def __init__(self, num_heads, head_size):
       super().__init__()
       self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
       self.proj = nn.Linear(n_embd, n_embd)

   def forward(self, x):
       out = torch.cat([h(x) for h in self.heads], dim=-1)
       out = self.proj(out)
       return out

class FeedForward(nn.Module):
   def __init__(self, n_embd):
       super().__init__()
       self.net = nn.Sequential(
           nn.Linear(n_embd, 4 * n_embd),
           nn.ReLU(),  # ReLU instead of GELU
           nn.Linear(4 * n_embd, n_embd),
       )

   def forward(self, x):
       return self.net(x)

class Block(nn.Module):
   def __init__(self, n_embd, n_head):
       super().__init__()
       head_size = n_embd // n_head
       self.sa = MultiHeadAttention(n_head, head_size)
       self.ffwd = FeedForward(n_embd)
       self.ln1 = nn.LayerNorm(n_embd)
       self.ln2 = nn.LayerNorm(n_embd)

   def forward(self, x):
       x = x + self.sa(self.ln1(x))
       x = x + self.ffwd(self.ln2(x))
       return x

class GPTLanguageModel(nn.Module):
   def __init__(self, vocab_size):
       super().__init__()
       self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
       self.position_embedding_table = nn.Embedding(block_size, n_embd)
       self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
       self.ln_f = nn.LayerNorm(n_embd)
       self.lm_head = nn.Linear(n_embd, vocab_size)

   def forward(self, idx, targets=None):
       B, T = idx.shape
       tok_emb = self.token_embedding_table(idx)
       pos_emb = self.position_embedding_table(torch.arange(T, device=device))
       x = tok_emb + pos_emb
       x = self.blocks(x)
       x = self.ln_f(x)
       logits = self.lm_head(x)

       if targets is None:
           loss = None
       else:
           B, T, C = logits.shape
           logits = logits.view(B*T, C)
           targets = targets.view(B*T)
           loss = F.cross_entropy(logits, targets)

       return logits, loss

   @torch.no_grad()
   def generate(self, idx, max_new_tokens, temperature=1.0):
       for _ in range(max_new_tokens):
           idx_cond = idx[:, -block_size:]
           logits, _ = self(idx_cond)
           logits = logits[:, -1, :] / temperature
           probs = F.softmax(logits, dim=-1)
           idx_next = torch.multinomial(probs, num_samples=1)
           idx = torch.cat((idx, idx_next), dim=1)
       return idx

Using device: cuda
Downloaded 2,804,543 characters


In [None]:
# Train BPE
print("Training BPE encoder...")
bpe = BytePairEncoder(vocab_size=VOCAB_SIZE)
bpe.train(text, verbose=True)

# Test encoder/decoder
test_strings = [
    "Hello world!",
    "  Multiple   spaces  ",
    "New\nlines\nwork",
    "Special chars: !@#$%^&*()",
    "Numbers: 12345",
    "Mixed: Hello 123 world!\n"
]

print("\n--- Encoder/Decoder Test ---")
for test_str in test_strings:
    encoded = bpe.encode(test_str)
    decoded = bpe.decode(encoded)
    match = "✓" if test_str == decoded else "✗"
    print(f"{match} '{test_str}' -> {len(encoded)} tokens -> '{decoded}'")

# Save the bpe object
output_path = '/content/drive/MyDrive/williamson_bpe.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(bpe, f)

print(f"BPE encoder saved to {output_path}")

In [None]:
def encode_with_checkpoints(bpe, text):
    """Efficiently encode text with progress checkpoints"""

    # Optimized encoding using chunking
    chunk_size = 10000
    encoded_chunks = []
    total_chars = len(text)

    checkpoints = [100, 1000, 10000, 100000]
    next_checkpoint_idx = 0
    chars_processed = 0

    print(f"Encoding {total_chars:,} characters...")
    start_time = time.time()

    for i in range(0, total_chars, chunk_size):
        chunk = text[i:i+chunk_size]
        encoded_chunk = bpe.encode(chunk)
        encoded_chunks.extend(encoded_chunk)

        chars_processed += len(chunk)

        # Check for checkpoints
        while next_checkpoint_idx < len(checkpoints) and chars_processed >= checkpoints[next_checkpoint_idx]:
            cp = checkpoints[next_checkpoint_idx]
            elapsed = time.time() - start_time
            tokens_so_far = len(encoded_chunks)
            compression = cp / tokens_so_far if tokens_so_far > 0 else 0
            print(f"Checkpoint {cp:,} chars: {tokens_so_far:,} tokens, "
                  f"compression: {compression:.2f}x, time: {elapsed:.1f}s")
            next_checkpoint_idx += 1

        # After 100k, checkpoint every 100k
        if chars_processed >= 100000 and chars_processed % 100000 == 0:
            elapsed = time.time() - start_time
            tokens_so_far = len(encoded_chunks)
            compression = chars_processed / tokens_so_far if tokens_so_far > 0 else 0
            print(f"Checkpoint {chars_processed:,} chars: {tokens_so_far:,} tokens, "
                  f"compression: {compression:.2f}x, time: {elapsed:.1f}s")

    total_time = time.time() - start_time
    print(f"\nEncoding complete: {len(encoded_chunks):,} tokens in {total_time:.1f}s")
    print(f"Final compression ratio: {total_chars / len(encoded_chunks):.2f}x")

    return encoded_chunks

# Encode the full text
encoded_text = encode_with_checkpoints(bpe, text)

# Convert to tensor and prepare data splits
data = torch.tensor(encoded_text, dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"\nTrain tokens: {len(train_data):,}")
print(f"Val tokens: {len(val_data):,}")

# Save encoder and data for future use
with open('bpe_encoder.pkl', 'wb') as f:
    pickle.dump(bpe, f)
print("Saved encoder to bpe_encoder.pkl")

# Save the encoded text
encoded_output_path = '/content/drive/MyDrive/williamson_encoded.pkl'
with open(encoded_output_path, 'wb') as f:
    pickle.dump(encoded_text, f)

print(f"Encoded text saved to {encoded_output_path}")

Encoding 2,804,543 characters...
Checkpoint 100 chars: 2,453 tokens, compression: 0.04x, time: 1.6s
Checkpoint 1,000 chars: 2,453 tokens, compression: 0.41x, time: 1.6s
Checkpoint 10,000 chars: 2,453 tokens, compression: 4.08x, time: 1.6s
Checkpoint 100,000 chars: 24,200 tokens, compression: 4.13x, time: 18.6s
Checkpoint 100,000 chars: 24,200 tokens, compression: 4.13x, time: 18.6s
Checkpoint 200,000 chars: 49,897 tokens, compression: 4.01x, time: 38.8s
Checkpoint 300,000 chars: 75,706 tokens, compression: 3.96x, time: 57.3s
Checkpoint 400,000 chars: 102,988 tokens, compression: 3.88x, time: 77.8s
Checkpoint 500,000 chars: 127,903 tokens, compression: 3.91x, time: 97.1s
Checkpoint 600,000 chars: 154,788 tokens, compression: 3.88x, time: 116.0s
Checkpoint 700,000 chars: 180,873 tokens, compression: 3.87x, time: 136.4s
Checkpoint 800,000 chars: 212,471 tokens, compression: 3.77x, time: 157.9s
Checkpoint 900,000 chars: 240,353 tokens, compression: 3.74x, time: 179.1s
Checkpoint 1,000,000 

In [12]:
# Load encoded text and BPE encoder from Drive
encoded_text = None
bpe = None
train_data = None # Initialize train_data here

try:
    with open('/content/drive/MyDrive/williamson_encoded.pkl', 'rb') as f:
        encoded_text = pickle.load(f)
    print("Loaded encoded text from Google Drive.")
    with open('/content/drive/MyDrive/williamson_bpe.pkl', 'rb') as f:
        bpe = pickle.load(f)
    print("Loaded BPE encoder from Google Drive.")

    # Assign the full data to train_data
    train_data = torch.tensor(encoded_text, dtype=torch.long)
    print(f"\nTrain tokens (full dataset): {len(train_data):,}")

except FileNotFoundError:
    print("Error: Encoded text or BPE encoder not found in Google Drive.")
    print("Please ensure 'williamson_encoded.pkl' and 'williamson_bpe.pkl' are in '/content/drive/MyDrive/'.")


# Only proceed with training if data is loaded successfully
if train_data is not None and bpe is not None:
    # Initialize model
    model = GPTLanguageModel(VOCAB_SIZE)
    model = model.to(device)
    print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

    # Training with compile if available
    if hasattr(torch, 'compile'):
       print("Compiling model with torch.compile()...")
       model = torch.compile(model)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    print("\nStarting training...")
    for iter in range(max_iters):

       # Evaluation and generation at checkpoints
       if iter % eval_interval == 0 or iter == max_iters - 1:
           losses = estimate_loss(model, eval_iters=20)  # Fewer eval iters
           # Report both train and val losses as train loss (no separate val data)
           print(f"\nStep {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

           # Generate sample
           model.eval()
           context = torch.zeros((1, 1), dtype=torch.long, device=device)
           generated = model.generate(context, max_new_tokens=64, temperature=0.5)
           generated_text = bpe.decode(generated[0].tolist())
           print(f"Generated (64 tokens):\n{generated_text}\n")
           model.train()

       # Training step
       xb, yb = get_batch() # Call get_batch without split argument
       logits, loss = model(xb, yb)
       optimizer.zero_grad(set_to_none=True)
       loss.backward()
       optimizer.step()

    print("\nTraining complete!")

    # Final generation
    model.eval()
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated = model.generate(context, max_new_tokens=64, temperature=0.5)
    final_text = bpe.decode(generated[0].tolist())
    print(f"\nFinal generation (500 tokens):\n{final_text}")

    # Save model
    torch.save(model.state_dict(), 'sim_williamson.pth')
    print("\nModel saved to sim_williamson.pth")
else:
    print("\nSkipping model training due to data loading error.")

# Save model
torch.save(model.state_dict(), '/content/drive/MyDrive/sim_williamson_1e4.pth')
print("\nModel saved to /content/drive/MyDrive/sim_williamson_1e4.pth")

Loaded encoded text from Google Drive.
Loaded BPE encoder from Google Drive.

Train tokens (full dataset): 755,587
Model parameters: 1.33M
Compiling model with torch.compile()...

Starting training...

Step 0: train loss 7.7949, val loss 7.7949
Generated (64 tokens):
 obprobabilbeliefs construcknowledgIf withus, . S. Away  i ationown basFregeit ese tiesconditionquirge ilosophences intensioncorreccomplex socistricawForright y to mathematoo best logicilicauscharactercesfailbackus, depends ll ainStion of s. The |furzuch curareoriginal section general prcounterfactualpractic. It is  appro


Step 250: train loss 6.9902, val loss 6.9902
Generated (64 tokens):
 inpr'. snt as . oto ces st erone ms aal, 'its err of teiflbchicalmone )vs 's is , t' more e to yMt or it , itsiis that ely iinsge 


Step 500: train loss 6.9484, val loss 6.9484
Generated (64 tokens):
 a of in oning speC( rbasnot 'and -regun the ytonfor es (for w to es y-is to the innto be ysin the preal ung that posiformrea insitin er

In [15]:
# Sampling/completion script
print("Loading model for inference...")
model = GPTLanguageModel(VOCAB_SIZE)

# Load state dict and handle potential _orig_mod prefix from torch.compile
state_dict = torch.load('sim_williamson.pth')
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith('_orig_mod.'):
        new_state_dict[k[len('_orig_mod.'):]] = v
    else:
        new_state_dict[k] = v

model.load_state_dict(new_state_dict)
model = model.to(device)
model.eval()

# Load BPE encoder
import pickle
# Update the path to load from Google Drive
bpe_path = '/content/drive/MyDrive/williamson_bpe.pkl'
with open(bpe_path, 'rb') as f:
    bpe = pickle.load(f)
print(f"Loaded BPE encoder from {bpe_path}")

# Prefill prompt - EDIT THIS
prompt = "Knowledge is "

# Encode prompt
tokens = bpe.encode(prompt)
context = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

print(f"Prompt: {prompt}")
print(f"Tokens: {tokens[:20]}..." if len(tokens) > 20 else f"Tokens: {tokens}")
print("-" * 50)

# Generate with different temperatures
temperatures = [0.1, 0.5, 1.0]

for temp in temperatures:
    print(f"\nTemperature {temp}:")
    generated = model.generate(context, max_new_tokens=200, temperature=temp)
    generated_text = bpe.decode(generated[0].tolist())
    print(generated_text)
    print("-" * 50)

# Interactive mode
print("\n=== Interactive Mode ===")
while True:
    user_prompt = input("\nEnter prompt (or 'quit'): ")
    if user_prompt.lower() == 'quit':
        break

    tokens = bpe.encode(user_prompt)
    context = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

    temp = float(input("Temperature (0.1-2.0): ") or "0.8")
    max_tokens = int(input("Max tokens (default 100): ") or "100")

    generated = model.generate(context, max_new_tokens=max_tokens, temperature=temp)
    generated_text = bpe.decode(generated[0].tolist())
    print("\n" + generated_text)

Loading model for inference...
Loaded BPE encoder from /content/drive/MyDrive/williamson_bpe.pkl
Prompt: Knowledge is 
Tokens: [1965, 437, 403, 276]
--------------------------------------------------

Temperature 0.1:
Knowledge is to recognize in Section 1.2 There is (1981: 2888–108). But that does not follow that Principle and Peter and Stephen is a fairs (2000a), and so on
2017a: 212121212108-08-20213, 12.2 Replanism, Rosense the category of ' as ' as ' as 'There are literally true belief without knowledge' in believing that the proposition that it is raining. One might be a priori be a priori be a priori be a priori be a priori be a priori be a priori be a priori be a priori be a priori be a priori be a priori 0 p, . . . , . . . , . . . , . . . , . . . , . . . , . . . , . . . , e 
--------------------------------------------------

Temperature 0.5:
Knowledge is any.
Thus the best explanations of thought experiments or less than the attitudes of another reason to prior to heuristic-p