In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
from datasets import GeonamesDataset
import polars as pl

In [14]:
geonames = GeonamesDataset("./data/cities500.txt.gz", max_len=14)

In [15]:
geonames.df.sample(10)

sequence,feature code,country code,population
str,str,str,i64
"""Heidelberg""","""PPLX""","""AU""",7360
"""Antagnod""","""PPLA3""","""IT""",242
"""Dunshang""","""PPLA4""","""CN""",0
"""Taiping""","""PPLA4""","""CN""",0
"""Schela""","""PPL""","""RO""",590
"""Apastepeque""","""PPL""","""SV""",5785
"""Monsireigne""","""PPL""","""FR""",795
"""Wonfurt""","""PPLA4""","""DE""",1955
"""Newstead""","""PPLX""","""AU""",4719
"""Mondoteko""","""PPLA4""","""ID""",0


In [16]:
df = geonames.df
alphabet = "".join(
    set("".join(df.get_column("sequence").str.split("").explode().to_list()))
)

In [17]:
from utils import Tokenizer

t = Tokenizer(
    alphabet=alphabet,
    max_len=16,
)

In [23]:
X = t.encode(df)

In [24]:
import torch

total_samples = X.size(0)

# Define the proportions for train, test, and validation sets
train_ratio = 0.8
test_ratio = 0.1
val_ratio = 0.1

# Calculate the number of samples for each set
num_train = int(total_samples * train_ratio)
num_test = int(total_samples * test_ratio)
num_val = total_samples - num_train - num_test

# Generate random indices
indices = torch.randperm(total_samples)

# Split the indices into train, test, and validation sets
train_indices = indices[:num_train]
test_indices = indices[num_train : num_train + num_test]
val_indices = indices[num_train + num_test :]

# Create the train, test, and validation sets
X_train = X[train_indices]
X_test = X[test_indices]
X_val = X[val_indices]

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Validation set size:", X_val.shape)

Train set size: torch.Size([101200, 16])
Test set size: torch.Size([12650, 16])
Validation set size: torch.Size([12651, 16])


In [28]:
from torch.utils.data import TensorDataset, DataLoader

train = TensorDataset(X_train)
test = TensorDataset(X_test)
val = TensorDataset(X_val)

In [37]:
for i, batch in enumerate(DataLoader(train, batch_size=64)):
    break

In [52]:
x = batch[0].float()

In [54]:
from torch import nn

In [104]:
import math


class PositionalEncoding(nn.Module):
    def __init__(
        self,
        d_model: int,
        d_output: int,
        dropout: float = 0.1,
        max_len: int = 5000,
    ):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_output = d_output
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = self.pe[: x.size(0)].flatten().unsqueeze(-1)
        x = x.repeat(
            1,
            self.d_output,
        )
        return self.dropout(x)

In [105]:
class MLPWithTime(nn.Module):
    def __init__(
        self,
        token_embed: nn.Module,
        time_embed: nn.Module,
        hidden: nn.Module,
        output: nn.Module,
    ):
        super().__init__()
        self.token_embed: nn.Module = token_embed
        self.time_embed: nn.Module = time_embed
        self.hidden: nn.Module = hidden
        self.output: nn.Module = output

    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        return self.output(self.hidden(self.token_embed(x) + self.time_embed(t)))

In [107]:
token_dim = 16
word_embeding_dim = time_embeding_dim = 128
intermediate_dim = 32
dropout = 0.3
max_T = 100

Input = nn.Sequential(
    nn.Linear(token_dim, word_embeding_dim),
    nn.ReLU(),
    nn.Dropout(dropout),
)
TimeInput = PositionalEncoding(
    d_model=1,
    d_output=time_embeding_dim,
    max_len=max_T,
)
Hidden = nn.Sequential(
    nn.Linear(word_embeding_dim, intermediate_dim),
    nn.ReLU(),
    nn.Dropout(dropout),
    nn.Linear(intermediate_dim, intermediate_dim),
    nn.ReLU(),
    nn.Dropout(dropout),
    nn.Linear(intermediate_dim, intermediate_dim),
    nn.ReLU(),
    nn.Dropout(dropout),
)
Output = nn.Sequential(
    nn.Linear(intermediate_dim, word_embeding_dim),
    nn.Softmax(dim=0),
)

model = MLPWithTime(
    time_embed=TimeInput,
    token_embed=Input,
    hidden=Hidden,
    output=Output,
)

In [108]:
t = torch.randint(0, max_T + 1, size=(x.shape[0],))

In [109]:
model(x, t)

tensor([[0.0061, 0.0159, 0.0159,  ..., 0.0122, 0.0218, 0.0234],
        [0.0381, 0.0244, 0.0093,  ..., 0.0098, 0.0127, 0.0131],
        [0.0175, 0.0066, 0.0223,  ..., 0.0239, 0.0097, 0.0118],
        ...,
        [0.0095, 0.0159, 0.0075,  ..., 0.0103, 0.0121, 0.0091],
        [0.0096, 0.0032, 0.0071,  ..., 0.0066, 0.0242, 0.0058],
        [0.0203, 0.0081, 0.0180,  ..., 0.0060, 0.0274, 0.0078]],
       grad_fn=<SoftmaxBackward0>)