In [66]:
# Autoimport wherept.py:
%load_ext autoreload
%aimport wherept


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb

torch.manual_seed(42)

<torch._C.Generator at 0x10f46b170>

In [3]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlage[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
run = wandb.init(project="woher", job_type="transformer-train")

In [5]:
dataset = run.use_artifact("woher/cleaned-cities:latest").get("clean")
df_raw = dataset.get_dataframe()
df_raw.head(5)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Unnamed: 0,name,asciiname,latitude,longitude,country_code
0,Soldeu,Soldeu,42.57688,1.66769,AD
1,El Tarter,El Tarter,42.57952,1.65362,AD
2,Sant Julià de Lòria,Sant Julia de Loria,42.46372,1.49129,AD
3,Pas de la Casa,Pas de la Casa,42.54277,1.73361,AD
4,Ordino,Ordino,42.55623,1.53319,AD


In [26]:
# hyperparameters
TARGET_COL = "asciiname"
START_CHAR = "<"
END_CHAR = ">"
PADDING_CHAR = "#"
BATCH_SIZE = 16
BLOCK_SIZE = 8
N_EMBED = 128
N_HEADS = 4
N_LAYER = 2
DROPOUT = 0.2

In [27]:
df = df_raw.copy()

df[TARGET_COL] = START_CHAR + df[TARGET_COL] + END_CHAR
df["target_len"] = df[TARGET_COL].apply(len)

max_len = max([len(city) for city in df[TARGET_COL].values])
df[TARGET_COL] = df[TARGET_COL].str.pad(max_len, side="right", fillchar=PADDING_CHAR)

chars = sorted(list(set("".join(df[TARGET_COL].values))))
vocab_len = len(chars)
print("Vocabulary length:", vocab_len)
print("Vocabulary:", "".join(chars))

df.head(5)

Vocabulary length: 61
Vocabulary:  #'-.1<>ABCDEFGHIJKLMNOPQRSTUVWXYZ`abcdefghijklmnopqrstuvwxyz


Unnamed: 0,name,asciiname,latitude,longitude,country_code,target_len
0,Soldeu,<Soldeu>######################################...,42.57688,1.66769,AD,8
1,El Tarter,<El Tarter>###################################...,42.57952,1.65362,AD,11
2,Sant Julià de Lòria,<Sant Julia de Loria>#########################...,42.46372,1.49129,AD,21
3,Pas de la Casa,<Pas de la Casa>##############################...,42.54277,1.73361,AD,16
4,Ordino,<Ordino>######################################...,42.55623,1.53319,AD,8


# Tokenize

In [28]:
# Generate a mapping from character to index and vice versa
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

encode = lambda x: [char_to_idx[char] for char in x]
decode = lambda x: "".join([idx_to_char[idx] for idx in x])

test_sample = df[TARGET_COL].values[0]
print("Encoded:", encode(test_sample))
print("Decoded:", decode(encode(test_sample)))

START_TOKEN = encode(START_CHAR)[0]
END_TOKEN = encode(END_CHAR)[0]
PADDING_TOKEN = encode(PADDING_CHAR)[0]

Encoded: [6, 26, 49, 46, 38, 39, 55, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded: <Soldeu>##################################################


In [29]:
df["tokenized"] = df[TARGET_COL].apply(encode)
df.head(2)

Unnamed: 0,name,asciiname,latitude,longitude,country_code,target_len,tokenized
0,Soldeu,<Soldeu>######################################...,42.57688,1.66769,AD,8,"[6, 26, 49, 46, 38, 39, 55, 7, 1, 1, 1, 1, 1, ..."
1,El Tarter,<El Tarter>###################################...,42.57952,1.65362,AD,11,"[6, 12, 46, 0, 27, 35, 52, 54, 39, 52, 7, 1, 1..."


In [30]:
train_df = df.sample(frac=0.9, random_state=42)
val_df = df.drop(train_df.index)
print("Train size:", len(train_df))

Train size: 173012


In [31]:
def get_batch(split, batch_size=BATCH_SIZE):
    if split == "train":
        df = train_df
    elif split == "val":
        df = val_df
    x = []
    y = []
    sample_idx = torch.randint(0, len(df), (batch_size,))
    for sidx in sample_idx:
        target_len = df.iloc[int(sidx)]["target_len"]
        idx = torch.randint(0, target_len - 1, (1,)).int()
        
        x_tensor = torch.tensor(df.iloc[int(sidx)]["tokenized"][idx:idx+BLOCK_SIZE])
        y_tensor = torch.tensor(df.iloc[int(sidx)]["tokenized"][idx+1:idx+BLOCK_SIZE+1])
        x.append(x_tensor)
        y.append(y_tensor)
        
    x = torch.stack(x)
    y = torch.stack(y)
    return x, y

xb, yb = get_batch("train", 4)

display(xb)
display(yb)

tensor([[52, 56, 43, 48, 49,  7,  1,  1],
        [39, 52, 52, 59,  7,  1,  1,  1],
        [35, 48, 38,  7,  1,  1,  1,  1],
        [39, 52, 60, 35, 41, 35,  7,  1]])

tensor([[56, 43, 48, 49,  7,  1,  1,  1],
        [52, 52, 59,  7,  1,  1,  1,  1],
        [48, 38,  7,  1,  1,  1,  1,  1],
        [52, 60, 35, 41, 35,  7,  1,  1]])

In [32]:
class MaskedTensor:
    def __init__(self, tensor, mask):
        self.tensor = tensor
        self.mask = mask


class CausalSelfAttentionHead(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(N_EMBED, head_size, bias=False)
        self.key = nn.Linear(N_EMBED, head_size, bias=False)
        self.value = nn.Linear(N_EMBED, head_size, bias=False)
        self.dropout = nn.Dropout(DROPOUT)

        self.register_buffer("tril", torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))

    def forward(self, x, padding_mask=None):
        B, T, C = x.shape # (batch_size, seq_len, n_embed)

        query = self.query(x) # (B, T, C)
        key = self.key(x) # (B, T, C)

        # Compute attention scores:
        wei = query @ key.transpose(-2, -1) * C**(-0.5) # (B, T, C) @ (B, C, T) = (B, T, T)
        # Mask out future tokens:
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf")) # (B, T, T)
        # Mask out padding tokens:
        if padding_mask is not None:
           wei = wei.masked_fill(padding_mask.unsqueeze(1).expand(-1, T, -1), float("-1e9"))
        
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x) # (B, T, C)
        return wei @ v # (B, T, T) @ (B, T, C) = (B, T, C)
    
class MultiHeadAttention(nn.Module):

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([CausalSelfAttentionHead(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(N_EMBED, N_EMBED)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x, padding_mask=None):
        out = torch.cat([h(x, padding_mask) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(DROPOUT),
        )

    def forward(self, x):
        return self.net(x)
    
class TransformerBlock(nn.Module):

    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x: MaskedTensor):
        x.tensor = self.sa(self.ln1(x.tensor), x.mask)
        x.tensor = self.ffwd(self.ln2(x.tensor))
        return x

class WherePT(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_len, N_EMBED)
        self.position_embedding = nn.Embedding(BLOCK_SIZE, N_EMBED)

        self.blocks = nn.Sequential(*[TransformerBlock(N_EMBED, N_HEADS) for _ in range(N_LAYER)])
        self.ln_final = nn.LayerNorm(N_EMBED)
        self.lm_head = nn.Linear(N_EMBED, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        padding_mask = (idx == PADDING_TOKEN) # (batch_size, seq_len)

        tok_embeds = self.token_embedding(idx) # (batch_size, seq_len, n_embed)
        pos_embeds = self.position_embedding(torch.arange(T)) # (seq_len, n_embed)
        x = MaskedTensor(tok_embeds + pos_embeds, padding_mask) # (batch_size, seq_len, n_embed)
        x = self.blocks(x)
        x.tensor = self.ln_final(x.tensor)
        logits = self.lm_head(x.tensor) # (batch_size, seq_len, vocab_len)  

        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(-1, vocab_len), targets.view(-1))

        return logits, loss
    
    def generate(self, idx, max_new_tokens=10):
        # idx: (batch_size, seq_len)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -BLOCK_SIZE:] # (batch_size, block_size)
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)

            if next_token == END_TOKEN:
                break
        return idx
    

model = WherePT()
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

idx = torch.tensor([START_TOKEN]).unsqueeze(0)
output = model.generate(idx, max_len)[0].tolist()
print(decode(output))



0.412733 M parameters
<m>


In [68]:
wherept_config = wherept.WherePTConfig(
    61,
    N_EMBED,
    N_HEADS,
    N_LAYER,
    BLOCK_SIZE,
    DROPOUT
)
model = wherept.WherePT(wherept_config)
model

AttributeError: 'int' object has no attribute 'n_embed'

In [43]:
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [44]:
EVAL_ITERS = 10

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [45]:
MAX_ITERS = 1000
for iter in range(MAX_ITERS):
    if iter % EVAL_ITERS == 0 or iter == MAX_ITERS - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    optimizer.zero_grad()
    xb, yb = get_batch("train", BATCH_SIZE)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()


step 0: train loss 1.8173, val loss 1.7555
step 10: train loss 1.7403, val loss 1.8589
step 20: train loss 1.8814, val loss 1.8569
step 30: train loss 1.9221, val loss 1.9322
step 40: train loss 1.9767, val loss 1.7983
step 50: train loss 1.8991, val loss 1.9242
step 60: train loss 1.9432, val loss 1.8237
step 70: train loss 1.9388, val loss 1.8126
step 80: train loss 1.9666, val loss 1.9387
step 90: train loss 1.9629, val loss 1.9853
step 100: train loss 1.6688, val loss 1.8583
step 110: train loss 1.8294, val loss 1.7157
step 120: train loss 1.8661, val loss 1.7022
step 130: train loss 1.8663, val loss 1.7875
step 140: train loss 1.9394, val loss 1.9591
step 150: train loss 1.7888, val loss 1.8217
step 160: train loss 1.7633, val loss 1.8663
step 170: train loss 1.8800, val loss 1.9446
step 180: train loss 1.8300, val loss 1.8497
step 190: train loss 1.8393, val loss 1.8635
step 200: train loss 1.8111, val loss 1.6579
step 210: train loss 1.8804, val loss 1.7635
step 220: train loss 

In [61]:
idx = torch.tensor([START_TOKEN]).unsqueeze(0)
#idx = torch.tensor(encode("<Stock")).unsqueeze(0)

output = model.generate(idx, max_len)[0].tolist()
print(decode(output))

<Sicla Dhuuardlnnrecswosl>


In [516]:
wandb.run.finish()

