In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
from datasets import GeonamesDataset
import polars as pl

In [50]:
geonames = GeonamesDataset("./data/cities500.txt.gz", max_len=14)

In [51]:
geonames.df.sample(10)

sequence,feature code,country code,population
str,str,str,i64
"""Badarganj""","""PPL""","""BD""",32600
"""Songmai""","""PPLA3""","""CN""",0
"""Fenghua""","""PPLA3""","""CN""",76653
"""Dschang""","""PPL""","""CM""",99582
"""Demirtas""","""PPLA3""","""TR""",6702
"""Komyshuvakha""","""PPL""","""UA""",5211
"""Bareqet""","""PPL""","""IL""",2082
"""Yerevan""","""PPLC""","""AM""",1093485
"""Kilmacow""","""PPL""","""IE""",647
"""Antonimina""","""PPLA3""","""IT""",506


In [5]:
df = geonames.df
alphabet = "".join(
    set("".join(df.get_column("sequence").str.split("").explode().to_list()))
)

In [6]:
from utils import Tokenizer

t = Tokenizer(
    alphabet=alphabet,
    max_len=16,
)

In [7]:
X = t.encode(df)

In [8]:
import torch

total_samples = X.size(0)

# Define the proportions for train, test, and validation sets
train_ratio = 0.8
test_ratio = 0.1
val_ratio = 0.1

# Calculate the number of samples for each set
num_train = int(total_samples * train_ratio)
num_test = int(total_samples * test_ratio)
num_val = total_samples - num_train - num_test

# Generate random indices
indices = torch.randperm(total_samples)

# Split the indices into train, test, and validation sets
train_indices = indices[:num_train]
test_indices = indices[num_train : num_train + num_test]
val_indices = indices[num_train + num_test :]

# Create the train, test, and validation sets
X_train = X[train_indices]
X_test = X[test_indices]
X_val = X[val_indices]

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Validation set size:", X_val.shape)

Train set size: torch.Size([101200, 16])
Test set size: torch.Size([12650, 16])
Validation set size: torch.Size([12651, 16])


In [9]:
from torch.utils.data import TensorDataset, DataLoader

train = TensorDataset(X_train)
test = TensorDataset(X_test)
val = TensorDataset(X_val)

# Model initialization

Let's train a model that would predict a single missing letter in the word.

In [10]:
from torchtyping import TensorType

T = TensorType


class RandomMasker:
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer
        self.start_token: int = tokenizer.stoi[tokenizer.start_token]
        self.end_token: int = tokenizer.stoi[tokenizer.end_token]
        self.pad_token: int = tokenizer.stoi[tokenizer.pad_token]
        self.mask_token: int = tokenizer.stoi[tokenizer.mask_token]

    def add_mask(self, x: T["b", "max_L", torch.long], p: float = 0.1):  # noqa: F821
        where = (x != self.start_token) & (x != self.end_token) & (x != self.pad_token)
        mask = (torch.randint_like(where.long(), low=0, high=100) < p * 100) & where
        x[mask] = self.mask_token
        return x

In [11]:
from torch import nn
from model import PositionalEncoding


class Model(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        vocab_size: int,
        n_tokens: int,
        hidden_dim: int,
        dropout: float = 0.4,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.n_tokens = n_tokens
        self.hidden_dim = hidden_dim

        self.pe = PositionalEncoding(d_embed=embed_dim, max_L=n_tokens)
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.layers = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.LayerNorm((n_tokens, hidden_dim)),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.LayerNorm((n_tokens, hidden_dim)),
            nn.Linear(hidden_dim, vocab_size),
            nn.ReLU(),
        )

    def forward(self, x: T["b", "max_L"]):  # noqa: F821
        xe = self.emb(x)
        pe = self.pe(x)
        xe += pe
        return self.layers(xe)

# Training loop

In [33]:
embed_dim = 128
vocab_size = len(t.stoi)
n_tokens = t.max_len
hidden_dim = 64

model = Model(
    embed_dim=embed_dim,
    vocab_size=vocab_size,
    n_tokens=n_tokens,
    hidden_dim=hidden_dim,
)

sum((p.numel() for p in model.parameters()))

27320

In [34]:
model.to("cuda"), X_train.to("cuda"), X_test.to("cuda")

(Model(
   (pe): PositionalEncoding()
   (emb): Embedding(56, 128)
   (layers): Sequential(
     (0): Linear(in_features=128, out_features=64, bias=True)
     (1): ReLU()
     (2): Dropout(p=0.4, inplace=False)
     (3): LayerNorm((16, 64), eps=1e-05, elementwise_affine=True)
     (4): Linear(in_features=64, out_features=64, bias=True)
     (5): ReLU()
     (6): Dropout(p=0.4, inplace=False)
     (7): LayerNorm((16, 64), eps=1e-05, elementwise_affine=True)
     (8): Linear(in_features=64, out_features=56, bias=True)
     (9): ReLU()
   )
 ),
 tensor([[ 1, 19, 44,  ...,  3,  3,  3],
         [ 1, 23,  9,  ...,  3,  3,  3],
         [ 1, 25, 13,  ...,  3,  3,  3],
         ...,
         [ 1, 31, 13,  ...,  3,  3,  3],
         [ 1, 23, 39,  ...,  3,  3,  3],
         [ 1, 46, 13,  ...,  3,  3,  3]], device='cuda:0', dtype=torch.uint8),
 tensor([[ 1, 33, 39,  ...,  3,  3,  3],
         [ 1, 19, 38,  ...,  3,  3,  3],
         [ 1, 12, 13,  ...,  3,  3,  3],
         ...,
         [ 1, 52,

In [35]:
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm.notebook import tqdm
import torch.optim.lr_scheduler as lr_scheduler

batch_size = 16384
num_epochs = 1000
optimizer = AdamW(model.parameters(), lr=0.2)

scheduler = lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.99,
)
masker = RandomMasker(t)
p_masker = 0.2

for epoch in tqdm(range(num_epochs)):
    # get into training mode
    model.train()
    losses = []
    for batch_num in range(X_train.shape[0] // batch_size):
        optimizer.zero_grad()

        # initialize proper `x`
        ix = torch.randint(0, X_train.shape[0], size=(batch_size,))
        y = X_train[ix].long().to("cuda")  # true values
        x = masker.add_mask(y, p=p_masker).to("cuda")  # noisy values

        # predict demasked tokens
        y_pred = model(x).swapaxes(-1, -2).to("cuda")

        # let's focus our loss on masked tokens only
        is_masked = y_pred == t.stoi[t.mask_token]

        true_labels = (
            F.one_hot(y, num_classes=vocab_size).swapaxes(-1, -2).float().to("cuda")
        )
        true_labels[~is_masked] = y_pred[~is_masked]
        loss = F.cross_entropy(y_pred, y)

        # do backprop
        losses.append(loss.item())
        loss.backward()
    optimizer.step()
    scheduler.step()

    # validation mode
    model.eval()
    with torch.no_grad():
        y_test = X_test.long().to("cuda")
        x_test = masker.add_mask(y_test, p=p_masker).to("cuda")

        y_pred_test = model(x_test).swapaxes(-1, -2).to("cuda")
        test_loss = F.cross_entropy(y_pred_test, y_test)

        if epoch % 10 == 0:
            print(
                f"{epoch=}, train_loss={torch.tensor(losses).mean().item():2.4f}, val_loss={test_loss.item():2.4f}"
            )

  0%|          | 0/1000 [00:00<?, ?it/s]

epoch=0, train_loss=4.0291, val_loss=2.3900
epoch=10, train_loss=1.5841, val_loss=1.2169
epoch=20, train_loss=0.8291, val_loss=0.5130
epoch=30, train_loss=0.4642, val_loss=0.2159
epoch=40, train_loss=0.2525, val_loss=0.0474
epoch=50, train_loss=0.1551, val_loss=0.0291
epoch=60, train_loss=0.1086, val_loss=0.0188
epoch=70, train_loss=0.0844, val_loss=0.0156
epoch=80, train_loss=0.0702, val_loss=0.0134
epoch=90, train_loss=0.0618, val_loss=0.0151
epoch=100, train_loss=0.0551, val_loss=0.0132
epoch=110, train_loss=0.0508, val_loss=0.0138
epoch=120, train_loss=0.0474, val_loss=0.0138
epoch=130, train_loss=0.0442, val_loss=0.0135
epoch=140, train_loss=0.0422, val_loss=0.0130
epoch=150, train_loss=0.0400, val_loss=0.0129
epoch=160, train_loss=0.0390, val_loss=0.0128
epoch=170, train_loss=0.0375, val_loss=0.0118
epoch=180, train_loss=0.0370, val_loss=0.0119
epoch=190, train_loss=0.0351, val_loss=0.0121
epoch=200, train_loss=0.0346, val_loss=0.0121
epoch=210, train_loss=0.0343, val_loss=0.0119

In [36]:
true_labels.shape

torch.Size([16384, 56, 16])

In [37]:
y_pred.shape

torch.Size([16384, 56, 16])

In [38]:
y_pred.shape

torch.Size([16384, 56, 16])

In [39]:
true_labels.shape

torch.Size([16384, 56, 16])

In [40]:
model.eval()

Model(
  (pe): PositionalEncoding()
  (emb): Embedding(56, 128)
  (layers): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): LayerNorm((16, 64), eps=1e-05, elementwise_affine=True)
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.4, inplace=False)
    (7): LayerNorm((16, 64), eps=1e-05, elementwise_affine=True)
    (8): Linear(in_features=64, out_features=56, bias=True)
    (9): ReLU()
  )
)

In [41]:
xval = X_val.clone()

In [42]:
t.decode(X_val[:10])

['Janeng',
 'Vovchynets',
 'Melton',
 'Zhirnovsk',
 'Fahren',
 'Schiesheim',
 'Hodonin',
 'Bellevue',
 'Buenavista',
 'Zhuangshi']

In [46]:
masked_X_val = masker.add_mask(X_val, p=0.05)
t.decode(masked_X_val[:10])

['Ja#eng',
 'Vovchynet#',
 'Melton',
 'Zhirno#sk',
 'Fahr#n',
 'Schiesheim',
 'Hodonin',
 'Bellevue',
 'Bu#navist#',
 'Zh#angshi']

In [47]:
t.decode(model(masked_X_val.long().to("cuda")).argmax(axis=-1))[:10]

['Ja#eng',
 'Vovchynet#',
 'Melton',
 'Zhirno#sk',
 'Fahr#n',
 'Schiesheim',
 'Hodonin',
 'Bellevue',
 'Bu#navist#',
 'Zh#angshi']