In [1]:
import re
from random import Random

import pandas as pd
import torch
import torch.nn as nn
from faker import Faker
from torch.nn import functional as F
from tqdm.notebook import tqdm

from misc_gpt import GPT

In [2]:
block_size = 100

In [3]:
fake = Faker()

In [4]:
rng = Random(742)

In [5]:
def wrap(s):
    return s + " " * (block_size - len(s))

In [6]:
def get_line():
    a = rng.randint(10, 100_000)
    b = int(2 ** (rng.random() * 19))
    return f" {a} + {b} == {a + b}"

In [7]:
def get_line3():
    a = rng.randint(10, 100_000)
    b = int(2 ** (rng.random() * 19))
    return f" {bin(a)[2:]} + {bin(b)[2:]} == {bin(a + b)[2:]}"

In [8]:
def get_line4():
    a = rng.randint(10, 100_000)
    b = int(2 ** (rng.random() * 19))
    return f" {hex(a)[2:]} + {hex(b)[2:]} == {hex(a + b)[2:]}"

In [9]:
def eval_fun3(s):
    try:
        o = re.sub(" (\d)", r" 0b\1", s)
        return str(eval(o))
    except Exception as e:
        return type(e).__name__

In [10]:
def eval_fun3_subtract(s):
    try:
        o = re.sub(" (\d)", r" 0b\1", s).split("=")[0].replace("+", "-")
        return str(eval(o))
    except Exception as e:
        return type(e).__name__

In [11]:
def eval_fun(s):
    try:
        return str(eval(s))
    except Exception as e:
        return type(e).__name__

In [12]:
TAGS = ["div", "p", "span"]

In [13]:
def get_line2():
    w1 = fake.word().lower()
    w2 = fake.word().lower()
    tags = [rng.choice(TAGS), rng.choice(TAGS)]
    return "".join(f" <{t}>{w}</{t}>" for w, t in zip(sorted([w1, w2]), tags))

In [14]:
def eval_fun2(s):
    return "-".join(re.findall(f"<({'|'.join(TAGS)})>[a-z]+</\\1>", s))

In [15]:
current_eval_fun = eval_fun
current_get_line = get_line

In [16]:
lines = 1_000_000

In [17]:
complete_raw_input = "".join(wrap(current_get_line()) for _ in range(lines))

In [18]:
for i in range(4):
    pr = complete_raw_input[i * block_size : (i + 1) * block_size]
    print(pr)
    print(current_eval_fun(pr))
    print("\n")

 54418 + 346608 == 401026                                                                           
True


 17991 + 279221 == 297212                                                                           
True


 15795 + 13394 == 29189                                                                             
True


 77916 + 90203 == 168119                                                                            
True




In [19]:
chars = sorted(list(set(complete_raw_input)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [
    stoi[c] for c in s
]  # encoder: take a string, output a list of integers
decode = lambda l: "".join(
    [itos[i] for i in l]
)  # decoder: take a list of integers, output a string

In [20]:
torch.manual_seed(742)

data = torch.tensor(encode(complete_raw_input), dtype=torch.long)
n = (
    int(0.9 * (len(data) // block_size)) * block_size
)  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [21]:
batch_size = 64
max_iters = 2000
eval_interval = 100
learning_rate = 3e-4

device = "cuda" if torch.cuda.is_available() else "cpu"

eval_iters = 200
n_head = 3
n_embd = n_head * 16
n_layer = 2
dropout = 0.2

In [22]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) // block_size - 1, (batch_size,)) * block_size
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [23]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [24]:
model = GPT(vocab_size, n_embd, block_size, n_head, n_layer, dropout, device)
m = model.to(device)
print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")

0.062413 M parameters


In [25]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [26]:
for iter in range(max_iters * 2):

    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(
            f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
        )

    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 2.5940, val loss 2.5940
step 100: train loss 0.7243, val loss 0.7235
step 200: train loss 0.4542, val loss 0.4548
step 300: train loss 0.3856, val loss 0.3860
step 400: train loss 0.3608, val loss 0.3609
step 500: train loss 0.3499, val loss 0.3487
step 600: train loss 0.3423, val loss 0.3416
step 700: train loss 0.3375, val loss 0.3370
step 800: train loss 0.3353, val loss 0.3353
step 900: train loss 0.3325, val loss 0.3320
step 1000: train loss 0.3309, val loss 0.3311
step 1100: train loss 0.3298, val loss 0.3303
step 1200: train loss 0.3282, val loss 0.3287
step 1300: train loss 0.3286, val loss 0.3288
step 1400: train loss 0.3276, val loss 0.3275
step 1500: train loss 0.3266, val loss 0.3260
step 1600: train loss 0.3263, val loss 0.3260
step 1700: train loss 0.3241, val loss 0.3238
step 1800: train loss 0.3235, val loss 0.3228
step 1900: train loss 0.3221, val loss 0.3224
step 1999: train loss 0.3207, val loss 0.3218
step 2000: train loss 0.3207, val loss 0.3213


In [27]:
def get_eval_df(prompt=" ", eval_n=1_000):
    eval_recs = []
    for _ in tqdm(range(eval_n)):
        context = torch.tensor(encode(prompt), dtype=torch.long, device=device).reshape(
            1, -1
        )
        res = decode(
            m.generate(context, max_new_tokens=block_size - len(prompt))[0].tolist()
        )
        eval_recs.append(
            {
                "train_example": res in complete_raw_input,
                "result": current_eval_fun(res),
                "res": res,
            }
        )
    return pd.DataFrame(eval_recs)

In [28]:
def piv_evdf(df):
    return df.pivot_table(
        index="result", columns="train_example", values="res", aggfunc="count"
    )

In [29]:
eval_df = get_eval_df()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [31]:
piv_evdf(eval_df)

train_example,False
result,Unnamed: 1_level_1
False,949
SyntaxError,50
True,1
