In [1]:
import os, sys
from tqdm import trange, tqdm
from IPython.utils import io
import itertools

import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from numba import njit

import torch
from torch import nn
from torch.utils.data import TensorDataset, ConcatDataset
import wandb

# Add the sibling source directory to the module search path. The bare-name
# imports that follow (data, preprocessing, compilation, models, ...) are
# presumably resolved from there -- verify against the repository layout.
# NOTE(review): the path is relative, so it depends on the notebook's working directory.
source = "../source"
sys.path.append(source)

from data import fun_data, grid_data
from preprocessing import Direct, Encoding, OneHot
from compilation import Compiler, Tracker, ScalarTracker, ActivationTracker
from activations import get_activations
from data_analysis.automata import to_automaton_history
from data_analysis.visualization.animation import SliderAnimation
from data_analysis.visualization.activations import (
    ActivationsAnimation,
    FunctionAnimation,
    PointAnimation,
)
from data_analysis.visualization.automata import AutomatonAnimation
from data_analysis.visualization.epochs import EpochAnimation
import data_analysis.visualization.publication as publication
import simulate
import two_points

import models as models
from models import MLP, CNN, ResNet

# Detect CUDA and report the result.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU available" if is_cuda else "GPU not available")

# The detection above is informational only: everything below runs on the CPU,
# because the device is unconditionally reset here.
device = torch.device("cpu")

GPU available


In [2]:
## Load settings
settings = "default"

# Look up the row labelled `settings` in the space-separated settings table and
# unpack its columns positionally. The order below must match the file's column order.
settings_row = pd.read_csv("model_settings/2 points.txt", sep=" ", header=0).loc[
    settings
]
(
    model_type,
    nonlinearity,
    gain,
    lr,
    P,
    L,
    n_epochs,
    hidden_layer,
    dx2,
    dy2,
    in_dim,
    out_dim,
) = settings_row.to_numpy()

# The table stores the model class by name; resolve it against the `models` module.
model_type = getattr(models, model_type)

# Resolve the nonlinearity name: "none" disables it, "discontinuous" selects the
# custom function from `simulate`, anything else is looked up in torch.nn.functional.
if nonlinearity == "none":
    nonlinearity = None
elif nonlinearity == "discontinuous":
    nonlinearity = simulate.Discontinuous.apply
else:
    nonlinearity = getattr(torch.nn.functional, nonlinearity)

# Stride used when sub-sampling tracked epochs (traces are filtered with `Epoch % mod == 0`).
mod = 1
# Optional rescaling: train `factor` times longer with a proportionally smaller step.
# factor = 4
# n_epochs = int(factor * n_epochs)
# lr = lr / factor

In [3]:
## Generate data

input_dim, output_dim = 1, 1

# Alternative: build the same kind of dataset through the project helper.
# data, encoding = two_points.data_set(dx2, dy2, input_dim, output_dim, device)

# Two points "A" and "B". Every input coordinate of A is -1 and of B is
# -1 + sqrt(dx2); every output coordinate of A is 0.6 and of B is 0.6 + sqrt(dy2).
# Dividing by sqrt(dim) makes the squared A-B distance equal dx2 (inputs) and
# dy2 (outputs) regardless of the dimension.
input_rows = [[-1] * input_dim, [-1 + np.sqrt(dx2)] * input_dim]
output_rows = [[0.6] * output_dim, [0.6 + np.sqrt(dy2)] * output_dim]
inputs = np.array(input_rows) / np.sqrt(input_dim)
outputs = np.array(output_rows) / np.sqrt(output_dim)
names = ["A", "B"]

# Torch copies of the arrays, cast to float32 and moved to the selected device.
inputs_tensor = torch.from_numpy(inputs.astype(np.float32)).to(device)
outputs_tensor = torch.from_numpy(outputs.astype(np.float32)).to(device)
data = TensorDataset(inputs_tensor, outputs_tensor)

# Map each point name to its input vector.
encoding = Encoding(dict(zip(names, inputs)))

# The same two-point dataset is used for training and validation.
train_datasets = [data]
val_dataset = [data]

# Validation set first, so it is dataset index 0 in the tracked list.
tracked_datasets = val_dataset + train_datasets

In [None]:
losses, Ps, Ls = [], [], []

# Sweep-wide overrides of the values loaded from the settings file.
L = 5
hidden_layer = 2
n_epochs = 10000

N_REPEATS = 30  # independent training runs per width
MAX_TRAIN_LOSS = 1e-2  # runs whose final tracked loss exceeds this are discarded


def _activations_by_epoch(trace, name):
    """Return one array per epoch with the tracked activations for input `name`.

    Only rows of dataset 0 are used. The trace is indexed positionally as
    (epoch, dataset, input name), judging by the `.loc` lookup below -- confirm
    against the tracker implementation.
    """
    return [
        np.array(frame.loc[epoch, 0, name])
        for epoch, frame in trace.query("Dataset == 0").groupby("Epoch")
    ]


for P in range(1, 30, 1):
    print(f"Number of units per layer: {P}")
    # Initialisation scale and step size are both rescaled with the width.
    gain = 4 / P ** (1 / 2)
    learning_rate = lr * (np.sqrt(P) / 10)

    losses_this_P = []
    for _ in range(N_REPEATS):
        ## Instantiate model
        model = model_type(
            encoding=encoding,
            input_size=input_dim,
            output_size=output_dim,
            hidden_dim=P,
            n_hid_layers=L,
            device=device,
            init_std=gain,
            non_linearity=nonlinearity,
        )

        ## Setup compiler
        criterion = lambda x, y: 0.5 * nn.functional.mse_loss(x, y)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        compiler = Compiler(model, criterion, optimizer)
        compiler.trackers = {
            "loss": ScalarTracker(lambda: compiler.validation(tracked_datasets)),
            "hidden": ActivationTracker(
                model,
                lambda inputs: model(inputs)[1][hidden_layer],
                datasets=tracked_datasets,
            ),
            "output": ActivationTracker(
                model, lambda inputs: model(inputs)[0], datasets=tracked_datasets
            ),
        }

        ## Training run (its console output is captured and discarded)
        with io.capture_output():
            compiler.training_run(
                train_datasets,
                tracked_datasets,
                n_epochs=n_epochs,
                batch_size=100,
            )

        # Skip runs that did not converge; they would distort the fit below.
        train_loss = compiler.trackers["loss"].get_entry(-1)[0][0]
        if train_loss > MAX_TRAIN_LOSS:
            continue

        # Keep every `mod`-th tracked epoch. `query` already returns a new frame,
        # so no defensive copy is needed.
        query = f"Epoch % {mod} == 0"
        data_hid = compiler.trackers["hidden"].get_trace().query(query)
        data_output = compiler.trackers["output"].get_trace().query(query)

        h_A = _activations_by_epoch(data_hid, "A")
        h_B = _activations_by_epoch(data_hid, "B")
        y_A = _activations_by_epoch(data_output, "A")
        y_B = _activations_by_epoch(data_output, "B")

        # Squared target separation, recomputed from the actual targets. It is kept
        # in a local name so the module-level `dy2` used to build the dataset is
        # not overwritten by this cell.
        y_true_A, y_true_B = outputs[0], outputs[1]
        dy2_targets = np.sum((y_true_B - y_true_A) ** 2)

        # Per-epoch squared A-B separation in the hidden layer and at the output.
        h2 = np.array([np.sum((a - b) ** 2) for a, b in zip(h_A, h_B)])
        y2 = np.array([np.sum((a - b) ** 2) for a, b in zip(y_A, y_B)])
        # Per-epoch y2 minus the overlap between target and predicted A-B differences.
        w = np.array(
            [
                y2_t - np.dot(y_true_A - y_true_B, a - b)
                for y2_t, a, b in zip(y2, y_A, y_B)
            ]
        )

        ## Fit effective learning rates
        eta_h_opt, eta_y_opt, fit_loss = simulate.optimize_eta(
            h2, y2, w, dx2, dy2_targets, guesses=np.logspace(-6, 2, 100)
        )
        losses_this_P.append(fit_loss)

    # Average over converged runs. With none, record NaN explicitly instead of
    # relying on np.mean([]) (which yields NaN plus a RuntimeWarning).
    if losses_this_P:
        loss = np.mean(losses_this_P)
    else:
        print(f"  No converged run for P={P}; recording NaN")
        loss = np.nan

    losses.append(loss)
    Ps.append(P)
    Ls.append(L)

Number of units per layer: 1
Loss: 1.9249316765035442


In [None]:
fig = plt.figure(figsize=(4, 3))

# 1-based index of the first width to show; converted to a 0-based slice start.
N_start = 1
first_shown = N_start - 1

publication.set_color_mixed(1)
plt.plot(Ps[first_shown:], losses[first_shown:])
plt.xlabel("Number of hidden units per layer")
plt.ylabel("Fit loss")
# Anchor the y-axis at zero; the upper limit stays automatic.
plt.ylim(bottom=0)
publication.plt_show(save_path="plots/2_points/architecture/loss_vs_units")