In [19]:
from model import ModelParams, load_model, greedy_predict, tokens_to_text
from hooks import HookPoint, register_decoder_hook
from data import generate_dataset_pairs
from nesymres.architectures.data import tokenize
from nesymres.dataset.generator import Generator
import torch
import re
import sympy
import numpy as np
from typing import Dict

device = "cpu" # NOTE: change to cuda if your GPU can handle it

In [20]:
model = load_model(device=device)

Lightning automatically upgraded your loaded checkpoint from v1.3.3 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../res/100m.ckpt`


In [26]:
np.random.seed(42) # 42

complexity_dataset = generate_dataset_pairs("complexity-bias", 10_000, 1, model.model_cfg, model.eq_cfg, second_dataset_sample_rate=200)

print("Ground truth function")
display(sympy.sympify(complexity_dataset["equations"][0][0]))

complexity_dataset.keys(), complexity_dataset["X0"].shape, complexity_dataset["X1"].shape

Ground truth function


x_3 - sin(exp(cos(x_3)))*sin(sin(cos(x_3)))/cos(x_1) - Abs(x_3)

(dict_keys(['equations', 'X0', 'y0', 'X1', 'y1']),
 torch.Size([1, 10000, 3]),
 torch.Size([1, 50, 3]))

In [27]:
def equation_to_tokens(eq: str, eq_cfg: Dict):
    eq_sympy_prefix = Generator.sympy_to_prefix(sympy.sympify(eq))
    return tokenize(eq_sympy_prefix, eq_cfg["word2id"])

tokenized_eqs = torch.zeros(len(complexity_dataset["equations"]), model.model_cfg.architecture.length_eq, dtype=torch.long)

for i, eq in enumerate(complexity_dataset["equations"]):
    tokenized = torch.tensor(equation_to_tokens(eq[0], model.eq_cfg))
    tokenized_eqs[i, :tokenized.shape[-1]] = tokenized
tokenized_eqs

tensor([[ 1,  9,  6,  9, 18, 27,  7,  6, 18, 27, 18, 19, 12,  4, 27, 18, 20, 16,
         12,  6, 20, 20, 12,  6,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0]])

In [28]:
def compute_mean_complexity_disparity(model: ModelParams, X: torch.Tensor, y: torch.Tensor, tokenized_eqs: torch.Tensor):
    COMPLEX_OPS = ['acos', 'asin', 'atan', 'cos', 'cosh', 'coth', 'exp', 'ln', 'pow', 'sin', 'sinh', 'sqrt', 'tan', 'tanh']
    SIMPLE_OPS = ['add', 'div', 'mul']

    complex_ids = torch.tensor([model.eq_cfg["word2id"][word] for word in COMPLEX_OPS], dtype=torch.long)
    simple_ids = torch.tensor([model.eq_cfg["word2id"][word] for word in SIMPLE_OPS], dtype=torch.long)

    def disparity(probs: torch.Tensor) -> float:
        """ Computes how much the given logit probabilities tend to a complex operator over a non-complex operator. """
        complex_prob = probs[:, complex_ids].sum(dim=-1)
        simple_prob = probs[:, simple_ids].sum(dim=-1)
        return torch.mean(complex_prob - simple_prob).item()

    disparities = []

    # initial token prediction, this initializes the sequence and caches the encoder embedding (saves computation time).
    probs, _, enc_embed = greedy_predict(model.model, model.params_fit, X, y)
    disparities.append(disparity(probs))

    # to test the model, we use the ground truth sequence at each position
    for i in range(2, tokenized_eqs.shape[-1]):
        seq = tokenized_eqs.clone()
        seq[:, i:] = 0
        probs, _, _ = greedy_predict(model.model, model.params_fit, enc_embed=enc_embed, sequence=seq)
        disparities.append(disparity(probs))

    return np.mean(disparities)

bias0 = compute_mean_complexity_disparity(model, complexity_dataset["X0"], complexity_dataset["y0"], tokenized_eqs)
bias1 = compute_mean_complexity_disparity(model, complexity_dataset["X1"], complexity_dataset["y1"], tokenized_eqs)

bias0 - bias1

-0.2921965662959739

In [29]:
def display_autocomplete(model, X, y):
    # initial token prediction, this initializes the sequence and caches the encoder embedding (saves computation time).
    _, seq, enc_embed = greedy_predict(model.model, model.params_fit, X, y)

    # repeatedly predict next token greedily
    for _ in range(30):
        seq = greedy_predict(model.model, model.params_fit, enc_embed=enc_embed, sequence=seq)[1]

    # this should result in (roughly) the correct equation
    greedy_pred = tokens_to_text(seq, model.params_fit)

    print("Greedy predicted equation:")
    for eq in greedy_pred:
        display(sympy.sympify(eq))

print("Ground truth function")
display(sympy.sympify(complexity_dataset["equations"][0][0]))

display_autocomplete(model, complexity_dataset["X0"], complexity_dataset["y0"])
display_autocomplete(model, complexity_dataset["X1"], complexity_dataset["y1"])

Ground truth function


x_3 - sin(exp(cos(x_3)))*sin(sin(cos(x_3)))/cos(x_1) - Abs(x_3)

Greedy predicted equation:


c*tan(c*x_2 + x_1) + x_3 - Abs(c + x_3)

Greedy predicted equation:


c*tan(x_1)/x_1 + x_3 - Abs(x_3)