<a href="https://colab.research.google.com/github/oak-hu/oak-hu.github.io/blob/dependabot%2Fgithub_actions%2Fci-dependencies-cb3525d1d8/SimWilliamson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import Counter
import re
import requests
from tqdm import tqdm
import pickle
import time

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Download text
url = "https://oak-hu.github.io/Introduction.txt"
response = requests.get(url)
text = response.text
print(f"Downloaded {len(text):,} characters")

# Hyperparameters
VOCAB_SIZE = 2048  # Target vocab size for BPE
batch_size = 32
block_size = 128
max_iters = 3000
eval_interval = 100
learning_rate = 3e-4
n_embd = 128
n_head = 8
n_layer = 6
dropout = 0.1

torch.manual_seed(42)
np.random.seed(42)

Using device: cpu
Downloaded 2,804,543 characters


In [9]:
class BytePairEncoder:
    def __init__(self, vocab_size=2048):
        self.vocab_size = vocab_size
        self.byte_to_token = {}
        self.token_to_byte = {}
        self.merges = []

    def get_stats(self, ids):
        """Count frequency of adjacent pairs"""
        counts = Counter()
        for pair in zip(ids, ids[1:]):
            counts[pair] += 1
        return counts

    def merge(self, ids, pair, idx):
        """Merge all occurrences of pair into idx"""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i+1]) == pair:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train(self, text, verbose=True):
        """Train BPE on text"""
        # Start with byte-level tokens
        tokens = list(text.encode('utf-8'))

        # Initialize vocab with single bytes
        for i in range(256):
            self.byte_to_token[i] = i
            self.token_to_byte[i] = bytes([i])

        num_merges = self.vocab_size - 256
        ids = list(tokens)

        for i in tqdm(range(num_merges), desc="Training BPE"):
            stats = self.get_stats(ids)
            if not stats:
                break

            # Find most frequent pair
            pair = max(stats, key=stats.get)
            idx = 256 + i

            # Perform merge
            ids = self.merge(ids, pair, idx)
            self.merges.append((pair, idx))

            # Update token mappings
            self.token_to_byte[idx] = self.token_to_byte[pair[0]] + self.token_to_byte[pair[1]]

            if verbose and i % 100 == 0:
                compression_ratio = len(tokens) / len(ids)
                print(f"Merge {i}: {pair} -> {idx}, compression: {compression_ratio:.2f}x")

        # Build reverse mapping
        self.byte_to_token = {v: k for k, v in self.token_to_byte.items() if k < self.vocab_size}

        print(f"Final vocab size: {len(self.token_to_byte)}")
        print(f"Compression ratio: {len(tokens) / len(ids):.2f}x")

    def encode(self, text):
        """Encode text to tokens"""
        tokens = list(text.encode('utf-8'))

        # Apply merges in order
        for pair, idx in self.merges:
            tokens = self.merge(tokens, pair, idx)

        return tokens

    def decode(self, tokens):
        """Decode tokens back to text"""
        byte_list = []
        for token in tokens:
            if token in self.token_to_byte:
                byte_list.extend(self.token_to_byte[token])

        return bytes(byte_list).decode('utf-8', errors='replace')

# Train BPE
print("Training BPE encoder...")
bpe = BytePairEncoder(vocab_size=VOCAB_SIZE)
bpe.train(text, verbose=True)

# Test encoder/decoder
test_strings = [
    "Hello world!",
    "  Multiple   spaces  ",
    "New\nlines\nwork",
    "Special chars: !@#$%^&*()",
    "Numbers: 12345",
    "Mixed: Hello 123 world!\n"
]

print("\n--- Encoder/Decoder Test ---")
for test_str in test_strings:
    encoded = bpe.encode(test_str)
    decoded = bpe.decode(encoded)
    match = "✓" if test_str == decoded else "✗"
    print(f"{match} '{test_str}' -> {len(encoded)} tokens -> '{decoded}'")

Training BPE encoder...


Training BPE:   0%|          | 1/1792 [00:01<48:02,  1.61s/it]

Merge 0: (101, 32) -> 256, compression: 1.03x


Training BPE:   6%|▌         | 101/1792 [02:19<31:46,  1.13s/it]

Merge 100: (97, 98) -> 356, compression: 1.74x


Training BPE:  11%|█         | 201/1792 [04:10<26:22,  1.01it/s]

Merge 200: (331, 32) -> 456, compression: 2.05x


Training BPE:  17%|█▋        | 301/1792 [05:51<23:52,  1.04it/s]

Merge 300: (119, 295) -> 556, compression: 2.26x


Training BPE:  22%|██▏       | 401/1792 [07:27<24:14,  1.05s/it]

Merge 400: (568, 548) -> 656, compression: 2.44x


Training BPE:  28%|██▊       | 501/1792 [08:58<18:41,  1.15it/s]

Merge 500: (609, 261) -> 756, compression: 2.59x


Training BPE:  34%|███▎      | 601/1792 [10:27<16:27,  1.21it/s]

Merge 600: (419, 104) -> 856, compression: 2.72x


Training BPE:  39%|███▉      | 701/1792 [11:53<15:10,  1.20it/s]

Merge 700: (624, 341) -> 956, compression: 2.84x


Training BPE:  45%|████▍     | 801/1792 [13:17<16:13,  1.02it/s]

Merge 800: (102, 442) -> 1056, compression: 2.95x


Training BPE:  50%|█████     | 901/1792 [14:39<11:15,  1.32it/s]

Merge 900: (644, 416) -> 1156, compression: 3.05x


Training BPE:  56%|█████▌    | 1001/1792 [16:01<12:54,  1.02it/s]

Merge 1000: (102, 260) -> 1256, compression: 3.15x


Training BPE:  61%|██████▏   | 1101/1792 [17:19<08:44,  1.32it/s]

Merge 1100: (796, 1272) -> 1356, compression: 3.23x


Training BPE:  67%|██████▋   | 1201/1792 [18:38<07:01,  1.40it/s]

Merge 1200: (98, 114) -> 1456, compression: 3.32x


Training BPE:  73%|███████▎  | 1301/1792 [19:55<05:47,  1.41it/s]

Merge 1300: (962, 295) -> 1556, compression: 3.39x


Training BPE:  78%|███████▊  | 1401/1792 [21:12<05:41,  1.14it/s]

Merge 1400: (101, 1215) -> 1656, compression: 3.47x


Training BPE:  84%|████████▍ | 1501/1792 [22:28<04:15,  1.14it/s]

Merge 1500: (1149, 288) -> 1756, compression: 3.54x


Training BPE:  89%|████████▉ | 1601/1792 [23:42<02:35,  1.23it/s]

Merge 1600: (1224, 627) -> 1856, compression: 3.61x


Training BPE:  95%|█████████▍| 1701/1792 [24:56<01:01,  1.48it/s]

Merge 1700: (39, 80) -> 1956, compression: 3.67x


Training BPE: 100%|██████████| 1792/1792 [26:03<00:00,  1.15it/s]

Final vocab size: 2048
Compression ratio: 3.72x

--- Encoder/Decoder Test ---
✓ 'Hello world!' -> 5 tokens -> 'Hello world!'
✓ '  Multiple   spaces  ' -> 13 tokens -> '  Multiple   spaces  '
✓ 'New
lines
work' -> 7 tokens -> 'New
lines
work'
✓ 'Special chars: !@#$%^&*()' -> 16 tokens -> 'Special chars: !@#$%^&*()'
✓ 'Numbers: 12345' -> 9 tokens -> 'Numbers: 12345'
✓ 'Mixed: Hello 123 world!
' -> 14 tokens -> 'Mixed: Hello 123 world!
'





In [17]:
def encode_with_checkpoints(bpe, text):
    """Efficiently encode text with progress checkpoints"""

    # Optimized encoding using chunking
    chunk_size = 10000
    encoded_chunks = []
    total_chars = len(text)

    checkpoints = [100, 1000, 10000, 100000]
    next_checkpoint_idx = 0
    chars_processed = 0

    print(f"Encoding {total_chars:,} characters...")
    start_time = time.time()

    for i in range(0, total_chars, chunk_size):
        chunk = text[i:i+chunk_size]
        encoded_chunk = bpe.encode(chunk)
        encoded_chunks.extend(encoded_chunk)

        chars_processed += len(chunk)

        # Check for checkpoints
        while next_checkpoint_idx < len(checkpoints) and chars_processed >= checkpoints[next_checkpoint_idx]:
            cp = checkpoints[next_checkpoint_idx]
            elapsed = time.time() - start_time
            tokens_so_far = len(encoded_chunks)
            compression = cp / tokens_so_far if tokens_so_far > 0 else 0
            print(f"Checkpoint {cp:,} chars: {tokens_so_far:,} tokens, "
                  f"compression: {compression:.2f}x, time: {elapsed:.1f}s")
            next_checkpoint_idx += 1

        # After 100k, checkpoint every 100k
        if chars_processed >= 100000 and chars_processed % 100000 == 0:
            elapsed = time.time() - start_time
            tokens_so_far = len(encoded_chunks)
            compression = chars_processed / tokens_so_far if tokens_so_far > 0 else 0
            print(f"Checkpoint {chars_processed:,} chars: {tokens_so_far:,} tokens, "
                  f"compression: {compression:.2f}x, time: {elapsed:.1f}s")

    total_time = time.time() - start_time
    print(f"\nEncoding complete: {len(encoded_chunks):,} tokens in {total_time:.1f}s")
    print(f"Final compression ratio: {total_chars / len(encoded_chunks):.2f}x")

    return encoded_chunks

# Encode the full text
encoded_text = encode_with_checkpoints(bpe, text)

# Convert to tensor and prepare data splits
data = torch.tensor(encoded_text, dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"\nTrain tokens: {len(train_data):,}")
print(f"Val tokens: {len(val_data):,}")

# Save encoder and data for future use
with open('bpe_encoder.pkl', 'wb') as f:
    pickle.dump(bpe, f)
print("Saved encoder to bpe_encoder.pkl")

Encoding 2,804,543 characters...
Checkpoint 100 chars: 2,453 tokens, compression: 0.04x, time: 1.6s
Checkpoint 1,000 chars: 2,453 tokens, compression: 0.41x, time: 1.6s
Checkpoint 10,000 chars: 2,453 tokens, compression: 4.08x, time: 1.6s
Checkpoint 100,000 chars: 24,200 tokens, compression: 4.13x, time: 17.8s
Checkpoint 100,000 chars: 24,200 tokens, compression: 4.13x, time: 17.8s
Checkpoint 200,000 chars: 49,897 tokens, compression: 4.01x, time: 36.8s
Checkpoint 300,000 chars: 75,706 tokens, compression: 3.96x, time: 54.5s
Checkpoint 400,000 chars: 102,988 tokens, compression: 3.88x, time: 74.1s
Checkpoint 500,000 chars: 127,903 tokens, compression: 3.91x, time: 91.4s
Checkpoint 600,000 chars: 154,788 tokens, compression: 3.88x, time: 110.4s
Checkpoint 700,000 chars: 180,873 tokens, compression: 3.87x, time: 128.5s
Checkpoint 800,000 chars: 212,471 tokens, compression: 3.77x, time: 150.3s
Checkpoint 900,000 chars: 240,353 tokens, compression: 3.74x, time: 169.0s
Checkpoint 1,000,000 

In [20]:
# Data loading function
def get_batch(split):
   data = train_data if split == 'train' else val_data
   ix = torch.randint(len(data) - block_size, (batch_size,))
   x = torch.stack([data[i:i+block_size] for i in ix])
   y = torch.stack([data[i+1:i+block_size+1] for i in ix])
   x, y = x.to(device), y.to(device)
   return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters=50):
   out = {}
   model.eval()
   for split in ['train', 'val']:
       losses = torch.zeros(eval_iters)
       for k in range(eval_iters):
           X, Y = get_batch(split)
           logits, loss = model(X, Y)
           losses[k] = loss.item()
       out[split] = losses.mean()
   model.train()
   return out

# Override hyperparameters for speed
batch_size = 64  # Increased
block_size = 32  # Decreased significantly
max_iters = 2000  # Reduced
eval_interval = 200  # Less frequent
learning_rate = 1e-3  # More aggressive
n_embd = 64  # Smaller
n_head = 4  # Fewer heads
n_layer = 4  # Fewer layers
dropout = 0.0  # No dropout

# Model components - simplified
class Head(nn.Module):
   def __init__(self, head_size):
       super().__init__()
       self.key = nn.Linear(n_embd, head_size, bias=False)
       self.query = nn.Linear(n_embd, head_size, bias=False)
       self.value = nn.Linear(n_embd, head_size, bias=False)
       self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

   def forward(self, x):
       B,T,C = x.shape
       k = self.key(x)
       q = self.query(x)
       wei = q @ k.transpose(-2,-1) * C**-0.5
       wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
       wei = F.softmax(wei, dim=-1)
       v = self.value(x)
       out = wei @ v
       return out

class MultiHeadAttention(nn.Module):
   def __init__(self, num_heads, head_size):
       super().__init__()
       self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
       self.proj = nn.Linear(n_embd, n_embd)

   def forward(self, x):
       out = torch.cat([h(x) for h in self.heads], dim=-1)
       out = self.proj(out)
       return out

class FeedForward(nn.Module):
   def __init__(self, n_embd):
       super().__init__()
       self.net = nn.Sequential(
           nn.Linear(n_embd, 4 * n_embd),
           nn.ReLU(),  # ReLU instead of GELU
           nn.Linear(4 * n_embd, n_embd),
       )

   def forward(self, x):
       return self.net(x)

class Block(nn.Module):
   def __init__(self, n_embd, n_head):
       super().__init__()
       head_size = n_embd // n_head
       self.sa = MultiHeadAttention(n_head, head_size)
       self.ffwd = FeedForward(n_embd)
       self.ln1 = nn.LayerNorm(n_embd)
       self.ln2 = nn.LayerNorm(n_embd)

   def forward(self, x):
       x = x + self.sa(self.ln1(x))
       x = x + self.ffwd(self.ln2(x))
       return x

class GPTLanguageModel(nn.Module):
   def __init__(self, vocab_size):
       super().__init__()
       self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
       self.position_embedding_table = nn.Embedding(block_size, n_embd)
       self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
       self.ln_f = nn.LayerNorm(n_embd)
       self.lm_head = nn.Linear(n_embd, vocab_size)

   def forward(self, idx, targets=None):
       B, T = idx.shape
       tok_emb = self.token_embedding_table(idx)
       pos_emb = self.position_embedding_table(torch.arange(T, device=device))
       x = tok_emb + pos_emb
       x = self.blocks(x)
       x = self.ln_f(x)
       logits = self.lm_head(x)

       if targets is None:
           loss = None
       else:
           B, T, C = logits.shape
           logits = logits.view(B*T, C)
           targets = targets.view(B*T)
           loss = F.cross_entropy(logits, targets)

       return logits, loss

   @torch.no_grad()
   def generate(self, idx, max_new_tokens, temperature=1.0):
       for _ in range(max_new_tokens):
           idx_cond = idx[:, -block_size:]
           logits, _ = self(idx_cond)
           logits = logits[:, -1, :] / temperature
           probs = F.softmax(logits, dim=-1)
           idx_next = torch.multinomial(probs, num_samples=1)
           idx = torch.cat((idx, idx_next), dim=1)
       return idx

# Initialize model
model = GPTLanguageModel(VOCAB_SIZE)
model = model.to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

# Training with compile if available
if hasattr(torch, 'compile'):
   print("Compiling model with torch.compile()...")
   model = torch.compile(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("\nStarting training...")
for iter in range(max_iters):

   # Evaluation and generation at checkpoints
   if iter % eval_interval == 0 or iter == max_iters - 1:
       losses = estimate_loss(model, eval_iters=20)  # Fewer eval iters
       print(f"\nStep {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

       # Generate sample
       model.eval()
       context = torch.zeros((1, 1), dtype=torch.long, device=device)
       generated = model.generate(context, max_new_tokens=100, temperature=0.8)
       generated_text = bpe.decode(generated[0].tolist())
       print(f"Generated (100 tokens):\n{generated_text}\n")
       model.train()

   # Training step
   xb, yb = get_batch('train')
   logits, loss = model(xb, yb)
   optimizer.zero_grad(set_to_none=True)
   loss.backward()
   optimizer.step()

print("\nTraining complete!")

# Final generation
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500, temperature=0.8)
final_text = bpe.decode(generated[0].tolist())
print(f"\nFinal generation (500 tokens):\n{final_text}")

# Save model
torch.save(model.state_dict(), 'gpt_bpe_model.pth')
print("\nModel saved to gpt_bpe_model.pth")

Model parameters: 0.47M
Compiling model with torch.compile()...

Starting training...


W0906 23:21:21.181000 411 torch/utils/cpp_extension.py:118] [0/0] No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'



Step 0: train loss 7.8015, val loss 7.7982
Generated (100 tokens):
 modelther ate the ���gresQsimple plaus to . Aorii still 8ding ilosifexample, give prioriworkappethen norm≤justif attitu�scious reflec they are onsymbecause well theways  this hypothesalso assreason concertorserlogably resulin that but not mo�also ably see understandingorin a "by a erirbelieving nem regXhe ardlow>ongevidence' and �cesstill experiment�/stem(4physices that 198normrowturamestancform ���tic  attituard


Step 200: train loss 6.8871, val loss 6.9342
Generated (100 tokens):
 such as provid
ese plactical  albe in b0otical sometimes ed sociancto generalizepec common oute, dtiome of ong ost ically is , the onfac. Sadfulsemporl�in the clearastivpcompet. Sks termy,  at least reaslogical sayto uc. For t Fno q  that implic the snot t sense necessary 'ingis er e of ofbelief alist beertermis setiknow . Kight sts we 'Iorowreflec0st


Step 400: train loss 5.7515, val loss 5.9361
Generated (100 tokens):
 such for a truth

In [26]:
# Override hyperparameters for speed
batch_size = 64  # Increased
block_size = 64  # Decreased significantly
max_iters = 5000  # Reduced
eval_interval = 500  # Less frequent
learning_rate = 1e-3  # More aggressive
n_embd = 64  # Smaller
n_head = 4  # Fewer heads
n_layer = 4  # Fewer layers
dropout = 0.2  # No dropout

# Initialize model
model = GPTLanguageModel(VOCAB_SIZE)
model = model.to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

# Training with compile if available
if hasattr(torch, 'compile'):
   print("Compiling model with torch.compile()...")
   model = torch.compile(model)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("\nStarting training...")
for iter in range(max_iters):

   # Evaluation and generation at checkpoints
   if iter % eval_interval == 0 or iter == max_iters - 1:
       losses = estimate_loss(model, eval_iters=20)  # Fewer eval iters
       print(f"\nStep {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

       # Generate sample
       model.eval()
       context = torch.zeros((1, 1), dtype=torch.long, device=device)
       generated = model.generate(context, max_new_tokens=64, temperature=0.8)
       generated_text = bpe.decode(generated[0].tolist())
       print(f"Generated (64 tokens):\n{generated_text}\n")
       model.train()

   # Training step
   xb, yb = get_batch('train')
   logits, loss = model(xb, yb)
   optimizer.zero_grad(set_to_none=True)
   loss.backward()
   optimizer.step()

print("\nTraining complete!")

# Final generation
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500, temperature=0.8)
final_text = bpe.decode(generated[0].tolist())
print(f"\nFinal generation (500 tokens):\n{final_text}")

# Save model
torch.save(model.state_dict(), 'gpt_bpe_2.pth')
print("\nModel saved to gpt_bpe_2.pth")

Model parameters: 0.47M
Compiling model with torch.compile()...

Starting training...

Step 0: train loss 7.7861, val loss 7.7871
Generated (64 tokens):
 ing that rainone can ste
could ancific�builhephysical Of upposine far knowledgBlogic discus. I �over suchyme�x ginVedi, a implicisticepenspecial . This understanditselflogicgramno situalitying putcourwhether complope of the cribWilliamworlds ically ordinary if the believe that Qcomppremis:
(finsignific


Step 500: train loss 5.1494, val loss 5.3956
Generated (64 tokens):
 mes ps to be lumink knows per do not bedence. such a ' and analogo knows s' would fability in place, 'S sains to be mey. On a world in Sutis-building subt to they are to justified true interpretationasever know that 


Step 1000: train loss 4.6432, val loss 4.9700
Generated (64 tokens):
 diin a position to know that that if the argument ages.' The defaw in philosophy do not know whether the since the conjunction of sing the relevant mattersing, and Kripke ofference y

In [29]:
# Sampling/completion script
print("Loading model for inference...")
model = GPTLanguageModel(VOCAB_SIZE)

# Load state dict and handle potential _orig_mod prefix from torch.compile
state_dict = torch.load('gpt_bpe_2.pth')
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith('_orig_mod.'):
        new_state_dict[k[len('_orig_mod.'):]] = v
    else:
        new_state_dict[k] = v

model.load_state_dict(new_state_dict)
model = model.to(device)
model.eval()

# Load BPE encoder
import pickle
with open('bpe_encoder.pkl', 'rb') as f:
    bpe = pickle.load(f)

# Prefill prompt - EDIT THIS
prompt = "Knowledge is "

# Encode prompt
tokens = bpe.encode(prompt)
context = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

print(f"Prompt: {prompt}")
print(f"Tokens: {tokens[:20]}..." if len(tokens) > 20 else f"Tokens: {tokens}")
print("-" * 50)

# Generate with different temperatures
temperatures = [0.5, 0.8, 1.0]

for temp in temperatures:
    print(f"\nTemperature {temp}:")
    generated = model.generate(context, max_new_tokens=200, temperature=temp)
    generated_text = bpe.decode(generated[0].tolist())
    print(generated_text)
    print("-" * 50)

# Interactive mode
print("\n=== Interactive Mode ===")
while True:
    user_prompt = input("\nEnter prompt (or 'quit'): ")
    if user_prompt.lower() == 'quit':
        break

    tokens = bpe.encode(user_prompt)
    context = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

    temp = float(input("Temperature (0.1-2.0): ") or "0.8")
    max_tokens = int(input("Max tokens (default 100): ") or "100")

    generated = model.generate(context, max_new_tokens=max_tokens, temperature=temp)
    generated_text = bpe.decode(generated[0].tolist())
    print("\n" + generated_text)

Loading model for inference...
Prompt: Knowledge is 
Tokens: [1965, 437, 403, 276]
--------------------------------------------------

Temperature 0.5:
Knowledge is to prove the logical sand natural science of the suppositions in terms of knowledge and beliefs and distribution to the best explanatory of accessible from narrow and bility to see how much a cite, but the Oxford in the Oxford 202, by Oxford POSIONOSIT entails ONIT obtains' (19983: 4): 113): 40–3004), and Machery (7) No' to believe p. Does ω +1 inchcreally that if I do not know that I know p, then I know the conjunct p, I believe p, for if I believe p then I believe p then you believe p, I believe p, then I k is born + k how I ket p n+1 haelset p (see also section 8) 
--------------------------------------------------

Temperature 0.8:
Knowledge is scurate that two chappen is a futime. They know it, my bick is the premise of a postulated in a way as a of the counterfactual conditionals. The degree of an assertions. For on
=

In [30]:
# Continue training from saved checkpoint
print("Loading saved model...")
model = GPTLanguageModel(VOCAB_SIZE)

# Load state dict and handle potential _orig_mod prefix from torch.compile
state_dict = torch.load('gpt_bpe_2.pth')
new_state_dict = {}
for k, v in state_dict.items():
   if k.startswith('_orig_mod.'):
       new_state_dict[k[len('_orig_mod.'):]] = v
   else:
       new_state_dict[k] = v

model.load_state_dict(new_state_dict)
model = model.to(device)
print(f"Model loaded: {sum(p.numel() for p in model.parameters())/1e6:.2f}M params")

# Load BPE encoder
import pickle
with open('bpe_encoder.pkl', 'rb') as f:
   bpe = pickle.load(f)

# Compile if available
if hasattr(torch, 'compile'):
   print("Compiling model with torch.compile()...")
   model = torch.compile(model)

# Resume training with same learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)  # Same LR as before
additional_iters = 5000

print("Resuming training...")
for iter in range(additional_iters):
   if iter % 100 == 0:
       losses = estimate_loss(model, eval_iters=20)
       print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

       # Generate sample
       model.eval()
       context = torch.zeros((1, 1), dtype=torch.long, device=device)
       generated = model.generate(context, max_new_tokens=64, temperature=0.5)
       generated_text = bpe.decode(generated[0].tolist())
       print(f"Generated: {generated_text[:200]}...\n")
       model.train()

   xb, yb = get_batch('train')
   logits, loss = model(xb, yb)
   optimizer.zero_grad(set_to_none=True)
   loss.backward()
   optimizer.step()

# Save updated model
torch.save(model.state_dict(), 'SimWilliamson.pth')
print("Updated model saved as SimWilliamson.pth!")

Loading saved model...
Model loaded: 0.47M params
Compiling model with torch.compile()...
Resuming training...
Step 0: train loss 3.8395, val loss 4.5512
Generated:  ority to judge of preferred to observer in its negation. They are not just the playing of the relevant proposition that the bargain, the result has not been cattribute of them as not to do so
(1). T...

Step 100: train loss 3.8572, val loss 4.5095
Generated:  or any of the sub-body of understanding of the general heuristic for knowledge of the relevant to the intension of knowing is a mental state, that we are not in one's evidence. The hypothesis transpa...

Step 200: train loss 3.8189, val loss 4.4971
Generated:  or the problem of reductiools to apply with some fiers in philosophy as fruit's seen, as a good case for one must sometimes into the central oness of contesting our intuitions about philosophical soc...

Step 300: train loss 3.8161, val loss 4.5017
Generated:  or a more direct of the thought or of our own mind a