In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import trange
from numba import njit, prange

import torch
from torch import nn


from data import addition_dataset

source = "../source"
sys.path.append(source)

from preprocessing import OneHot, Encoding
from compilation import Compiler, ScalarTracker, ActivationTracker
from data_analysis.automata import to_automaton_history
from visualization.animation import SliderAnimation
from visualization.activations import ActivationsAnimation
from visualization.automata import AutomatonAnimation
from visualization.epochs import EpochAnimation

import publication

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [2]:
symbols = [str(n) for n in range(10)] + ["+", " "]
onehot = OneHot(symbols)


class SequenceEncoder(Encoding):
    def __init__(self, single_encoder: Encoding):
        self._decoding = lambda x: x
        self.single_encoder = single_encoder

    # @property
    # def symbols(self) -> list:
    #     return None

    # def _update_decoding(self, encoding):
    #     pass

    def __call__(self, data):
        encoded = [self.single_encoder(char) for char in data]
        encoded = encoded[::-1]  # decoded in reverse for better computation
        return encoded

    def decode(self, enc_data):
        enc_data = enc_data.to("cpu").detach()
        decoded = [self.single_encoder.decode(vec) for vec in enc_data]
        decoded = "".join(decoded)
        decoded = decoded[::-1]
        return decoded


encoder = SequenceEncoder(onehot)

In [3]:
## Generate data

full_length = 100
max_train_length = 12
test_length = 13

test_data = addition_dataset(
    device, encoder, n_datapoints=100, seq_len=[test_length], full_length=full_length
)

In [4]:
## Define model


class Model(nn.Module):
    def __init__(self, device, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.lstm_encoder = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.lstm_decoder = nn.LSTM(
            input_size=hidden_size * 2,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size * 2, output_size)

        self.to(device)

    def forward(self, x):
        x, _ = self.lstm_encoder(x)
        x, _ = self.lstm_decoder(x)
        x = self.linear(x)
        return x


model = Model(
    device,
    input_size=len(symbols),
    hidden_size=100,
    num_layers=1,
    output_size=len(symbols),
)

In [5]:
def predict(input, model):
    out = model(input)
    pred = torch.argmax(out, axis=1).int()
    string = "".join([symbols[index] for index in pred])

    string = string[::-1]
    return string


def score(dataset, model):
    count = 0
    n_datapoints = len(dataset)
    for n in prange(n_datapoints):
        x, y = dataset[n]
        target = encoder.decode(y)
        prediction = predict(x, model)
        if target == prediction:
            count += 1
    acc = count / len(dataset)
    return acc

In [6]:
# model = torch.load("model.pt")

In [7]:
## Train

optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
# train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

n_epochs = 1000
for epoch in range(n_epochs):
    # Generate data
    train_data = addition_dataset(
        device,
        encoder,
        n_datapoints=1000,
        seq_len=[x for x in range(3, max_train_length) if x != test_length],
        full_length=full_length,
    )
    train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

    # Train
    model.train()
    for X_batch, y_batch in train_loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    if epoch % 50 != 0:
        continue
    model.eval()
    with torch.no_grad():
        y_pred = model(train_data.tensors[0])
        train_rmse = np.sqrt(loss_fn(y_pred, train_data.tensors[1]).cpu())
        y_pred = model(test_data.tensors[0])
        test_rmse = np.sqrt(loss_fn(y_pred, test_data.tensors[1]).cpu())
        train_score = score(torch.utils.data.Subset(train_data, np.arange(100)), model)
        test_score = score(test_data, model)
    print(
        f"Epoch {epoch}: train RMSE {train_rmse:.4f}, test RMSE {test_rmse:.4f}, train score {train_score:.2f}, test score {test_score:.2f}"
    )

Epoch 0: train RMSE 0.0765, test RMSE 0.1061, train score 0.00, test score 0.00
Epoch 50: train RMSE 0.0500, test RMSE 0.0728, train score 0.02, test score 0.02
Epoch 100: train RMSE 0.0421, test RMSE 0.0635, train score 0.12, test score 0.05
Epoch 150: train RMSE 0.0387, test RMSE 0.0599, train score 0.31, test score 0.07
Epoch 200: train RMSE 0.0363, test RMSE 0.0575, train score 0.40, test score 0.07
Epoch 250: train RMSE 0.0342, test RMSE 0.0570, train score 0.41, test score 0.09
Epoch 300: train RMSE 0.0320, test RMSE 0.0522, train score 0.50, test score 0.09
Epoch 350: train RMSE 0.0296, test RMSE 0.0524, train score 0.61, test score 0.12
Epoch 400: train RMSE 0.0290, test RMSE 0.0500, train score 0.63, test score 0.16
Epoch 450: train RMSE 0.0283, test RMSE 0.0491, train score 0.56, test score 0.17
Epoch 500: train RMSE 0.0269, test RMSE 0.0478, train score 0.64, test score 0.26
Epoch 550: train RMSE 0.0266, test RMSE 0.0485, train score 0.74, test score 0.25
Epoch 600: train RM

In [8]:
torch.save(model, "model.pt")

In [34]:
index = 5

print(f"input: {encoder.decode(train_data[index][0])}")
print(f"target: {encoder.decode(train_data[index][1])}")
print(f"prediction: {predict(train_data[index][0], model)}")

input: 5+5                                                                                                 
target: 10                                                                                                  
prediction: 10                                                                                                  


In [13]:
index = 1

print(f"input: {encoder.decode(test_data[index][0])}")
print(f"target: {encoder.decode(test_data[index][1])}")
print(f"prediction: {predict(test_data[index][0], model)}")

input: 97018+3037372                                                                                       
target: 3134390                                                                                             
prediction: 3305550                                                                                             


In [56]:
example = "10000000+4000000"
example = example + " " * (full_length - len(example))

print(f"input: {example}")
encoded = torch.tensor(encoder(example), device=device, dtype=torch.float32)
print(f"prediction: {predict(torch.tensor(encoded), model)}")

input: 10000000+4000000                                                                                    
prediction: 14002227                                                                                            


  print(f"prediction: {predict(torch.tensor(encoded), model)}")


In [85]:
int_a_len = 50
10 ** (int_a_len - 1)

10000000000000000000000000000000000000000000000000

In [83]:
np.random.randint(10 ** (int_a_len - 1), 10 ** (int_a_len))

ValueError: high is out of bounds for int64

In [107]:
import random

random.getrandbits(1, 500)

TypeError: Random.getrandbits() takes exactly one argument (2 given)

In [77]:
## Check for large sequence length generalization

# Evaluate model generalization for sequences of varying lengths
N = 20
val_scores = []
for n in trange(3, N + 1):
    val_data = addition_dataset(
        device, encoder, n_datapoints=10, seq_len=[n], full_length=full_length
    )
    val_score = score(val_data, model)
    val_scores.append(val_score)

# Visualize validation error for varying sequence lengths
publication.set_color_mixed()
fig = plt.figure(figsize=(6, 4))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax.bar(np.arange(3, N + 1), val_scores, color="skyblue")
ax.set_xlabel("Sequence lengths")
ax.set_ylabel("Validation error")
ax.set_title("Model Generalization Across Sequence Lengths")
ax.set_yticks(np.arange(0, 1, 0.1))
publication.pub_show()

 64%|██████▍   | 18/28 [00:01<00:01,  9.73it/s]


ValueError: low >= high