In [1]:
import benchmarks
import optimizers
import torch

import matplotlib.pyplot as plt
import numpy as np
import gpytorch

In [2]:
data_benchmark = benchmarks.DataModelBenchmark(metric_index=3)
func = data_benchmark._raw_func_with_model_scale # (z, m, x)

Instantiating benchmark with y=metric Wikipedia Cross Entropy


In [3]:
def get_random_points(k):
    rd_prop = np.random.dirichlet(np.ones(5), size=k)
    rd_scale = np.random.choice([2, 15], size=k)
    rd_timestep = np.random.choice(np.arange(1, 197), size=k)
    rd_x = np.concatenate([rd_prop, rd_scale[:, None], rd_timestep[:, None]], axis=1)
    return rd_x


In [4]:
# sample k random points from scale 2 and scale 15
k = 10

# x_shape: [5 categories, 1 scale, 1 timestep]
rd_prop = np.random.dirichlet(np.ones(5), size=k)
rd_scale = np.random.choice([2, 15], size=k)
rd_timestep = np.random.choice(np.arange(1, 197), size=k)

train_x = []
train_y = []
for i in range(k):
    for s in range(1, rd_timestep[i] + 1):
        train_x.append(np.concatenate([rd_prop[i], [rd_scale[i], s]]))
        train_y.append(func(s, rd_scale[i], rd_prop[i]))

train_x = torch.tensor(train_x)
train_y = torch.tensor(train_y)

  train_x = torch.tensor(train_x)


In [12]:
import warnings
from linear_operator.utils.warnings import NumericalWarning
from collections import defaultdict
warnings.filterwarnings('ignore', category=NumericalWarning)

# Train initial GP
import gpytorch
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(noise=torch.ones(k) * 0.00)
model = ExactGPModel(train_x, train_y, likelihood)
print(f"lengthscale: {model.covar_module.base_kernel.lengthscale.item()}")
print(f"mean: {model.mean_module.constant.item()}")

training_iter = 50

# Find optimal model hyperparameters
model.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

losses = []
for i in range(training_iter):
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(train_x)
    # Calc loss and backprop gradients
    loss = -mll(output, train_y)
    loss.backward()
    losses.append(loss.item())
    optimizer.step()
    print(f"Iter {i+1}/{training_iter} - Loss: {loss.item()}")

print(f"Final lengthscale: {model.covar_module.base_kernel.lengthscale.item()}")
print(f"Final mean: {model.mean_module.constant.item()}")

lengthscale: 0.6931471824645996
mean: 0.0
Iter 1/50 - Loss: 0.7713402651153743
Iter 2/50 - Loss: 0.581669325560806
Iter 3/50 - Loss: 0.3926415176653024
Iter 4/50 - Loss: 0.26682625379292263
Iter 5/50 - Loss: 0.13959355535759105
Iter 6/50 - Loss: 0.015600876507603448




Iter 7/50 - Loss: -0.12080163348815856
Iter 8/50 - Loss: -0.2774048277440356
Iter 9/50 - Loss: -0.36915128804477465
Iter 10/50 - Loss: -0.5175718273593863
Iter 11/50 - Loss: -0.6032017767305119
Iter 12/50 - Loss: -0.7254832337767504
Iter 13/50 - Loss: -0.855117562762108
Iter 14/50 - Loss: -0.970593663343508
Iter 15/50 - Loss: -1.0906515175297968
Iter 16/50 - Loss: -1.1386108075972488
Iter 17/50 - Loss: -1.173632932083773
Iter 18/50 - Loss: -1.1429543300388234
Iter 19/50 - Loss: -1.1535128738171572
Iter 20/50 - Loss: -0.9879633367158034
Iter 21/50 - Loss: -0.9011597700587985
Iter 22/50 - Loss: -0.806759350342691
Iter 23/50 - Loss: -0.7766735974103187
Iter 24/50 - Loss: -0.8493836479621969
Iter 25/50 - Loss: -0.9557860749419838
Iter 26/50 - Loss: -1.0679850247563418
Iter 27/50 - Loss: -1.139211012340212
Iter 28/50 - Loss: -1.1937439444525433
Iter 29/50 - Loss: -1.1903507949407277
Iter 30/50 - Loss: -1.1654235053928725
Iter 31/50 - Loss: -1.2217320044624076
Iter 32/50 - Loss: -1.249109464

In [13]:
print(f"Final lengthscale: {model.covar_module.base_kernel.lengthscale}")
print(f"Final noise: {model.covar_module.base_kernel.lengthscale}")

Final lengthscale: tensor([[1.8431]], grad_fn=<SoftplusBackward0>)
Final noise: tensor([[1.8431]], grad_fn=<SoftplusBackward0>)


In [14]:
flops = {
    100: 1,
    2: 2090524455 / 161264981936,
    6: 5211827866 / 161264981936,
    15: 12069704997 / 161264981936,
    30: 23823782173 / 161264981936,
    50: 34933622501 / 161264981936,
    70: 48105020743 / 161264981936,
}
print(flops)

{100: 1, 2: 0.012963288309111339, 6: 0.032318410379188076, 15: 0.07484392986066876, 30: 0.147730659731539, 50: 0.21662249349870538, 70: 0.29829799480020447}


In [15]:
# num of steps in 20M
c2 = np.sum(np.sum(rd_timestep[rd_scale == 2])) * flops[2]
c15 = np.sum(np.sum(rd_timestep[rd_scale == 15])) * flops[15]

print(f"Cost of 20M runs: {c2}")
print(f"Cost of 150M runs: {c15}")

Cost of 20M runs: 8.633550013868152
Cost of 150M runs: 25.746311872070056


In [16]:
def compute_ei(model, likelihood, train_y, test_x, with_grad=False):
    # EI = (\mu - f(xbest)) * \Phi(Z) + \phi(Z). Z = (\mu - f(xbest)) / \sigma
    model.eval()
    likelihood.eval()

    if with_grad:
        m_out = model(test_x)
        observed_pred = likelihood(m_out)
    else:
        with torch.no_grad():
            m_out = model(test_x)
            observed_pred = likelihood(m_out)
    mean = observed_pred.mean

    # Compute EI
    y_best = train_y.min()
    delta = y_best - mean
    sigma = observed_pred.variance.sqrt()
    t1 = delta * torch.distributions.Normal(0, 1).cdf(delta / sigma)
    t2 = sigma * torch.exp(torch.distributions.Normal(0, 1).log_prob(delta / sigma))
    ei = t1 + t2
    return ei


In [21]:
from scipy.optimize import minimize

x0 = get_random_points(1)
x0 = torch.tensor(x0, dtype=torch.float64)

def ei_to_minimize(x):
    x = torch.tensor(x).reshape(1, -1)
    return -compute_ei(model, likelihood, train_y, x).detach().numpy().item()

def ei_grad(x):
    x = torch.tensor(x, dtype=torch.double).reshape(1, -1)
    x.requires_grad = True
    ei = -compute_ei(model, likelihood, train_y, x, with_grad=True)
    ei.backward()
    return x.grad.numpy()


x0 = get_random_points(1)
print(f"Starting points: {x0}")
print(f"Initia EI: {ei_to_minimize(x0)}")

res = minimize(ei_to_minimize, x0=x0.squeeze(0), bounds=[(None, None)] * 5 + [(2, 100)] + [(1, 197)], method='L-BFGS-B', jac=ei_grad)
print(res.x)
print(res.fun)

Starting points: [[3.82926912e-01 2.31461603e-01 2.59254894e-01 2.74900514e-02
  9.88665399e-02 1.50000000e+01 5.90000000e+01]]
Initia EI: -0.004804460251097709
[ 1.12176733  0.59067136  0.70754594 -0.69714324 -0.72284139 15.
 56.82537527]
-0.04273536578538418


In [37]:
import warnings
from linear_operator.utils.warnings import NumericalWarning
from collections import defaultdict
warnings.filterwarnings('ignore', category=NumericalWarning)

# TODO: Limit search space to within each scale.
# TODO: Take gradient over step as well but discretize at the end
from tqdm import tqdm
from scipy.optimize import minimize

num_revealed = 2
num_search_per_fid = 10

for i in tqdm(range(num_revealed), desc="Revealing labels"):
    # Find the point with the highest EI
    def ei_to_minimize(x, scale, timestep):
        x = torch.tensor(x, dtype=torch.double).reshape(1, -1)
        x = torch.exp(x)
        x = x / torch.sum(x)
        x = torch.cat([x, torch.tensor([scale, timestep], dtype=torch.double).reshape(1, -1)], dim=1)

        return -compute_ei(model, likelihood, train_y, x).detach().numpy().item()

    def ei_grad(x, scale, timestep):
        x = torch.tensor(x, dtype=torch.double).reshape(1, -1)
        x = torch.exp(x)
        x = x / torch.sum(x)
        x = torch.cat([x, torch.tensor([scale, timestep], dtype=torch.double).reshape(1, -1)], dim=1)

        x.requires_grad = True
        ei = -compute_ei(model, likelihood, train_y, x, with_grad=True)
        ei.backward()
        grad = x.grad[0, :5].squeeze(0).numpy()

        return grad

    ei_results = defaultdict(defaultdict)
    sampled_points = []
    for scale in tqdm([2, 6, 15, 30, 50, 70, 100], desc="Scales", position=1, leave=False):
        for timestep in [60, 120, 197]:
            x0s = get_random_points(num_search_per_fid)

            # Bound x to sum to 1
            def sum_constraint(x):
                return np.sum(x) - 1  # Will equal 0 when sum is 1
            constraints = ({'type': 'eq', 'fun': sum_constraint})
            results = list(
                map(
                    lambda x0:
                    minimize(
                        lambda x: ei_to_minimize(x, scale, timestep), x0=x0[:5], bounds=[(0, 1)] * 5, method='L-BFGS-B', jac=lambda x: ei_grad(x, scale, timestep),
                        constraints=constraints
                    ), x0s
                )
            )

            # Filter results not successful and not satisfying constraints
            results = list(filter(lambda res: res.success and np.isclose(sum_constraint(res.x), 0.0), results))
            print(f"Remaining results {len(results)}")
            if len(results) == 0:
                print(f"Scale {scale}, timestep {timestep} has no successful optimizing results")
                continue

            best_result = max(results, key=lambda x: -x.fun)
            cur_x = best_result.x
            print(f"Constraint: {sum_constraint(cur_x)}. Success: {best_result.success}")
            ei_results[scale][timestep] = (cur_x, -best_result.fun)

    scaled_ei_results = defaultdict(defaultdict)
    max_scaled_ei = -1

    for scale in ei_results.keys():
        for timestep in ei_results[scale].keys():
            scaled_ei_results[scale][timestep] = (
                ei_results[scale][timestep][0],
                ei_results[scale][timestep][1] / flops[scale] / timestep
            )
            scaled_ei = scaled_ei_results[scale][timestep][1]
            if scaled_ei > max_scaled_ei:
                max_scaled_ei = scaled_ei

                best_scale = scale
                best_timestep = timestep
                best_x = ei_results[scale][timestep][0]

    print(scaled_ei_results)
    print(f"Best ei: {max_scaled_ei},\n\tscale: {best_scale},\n\ttimestep: {best_timestep},\n\tx: {best_x}")
    sampled_points.append((best_scale, best_timestep, best_x))

    # Reveal the values of the point with the highest EI
    # cur_x = best_result.x
    # print(f"Revealing {cur_x}")
    # cur_y = func(z, cur_x)
    # chosen_x.append(cur_x.item())
    # opt_ys.append(cur_y.item())

    # Update the model with the new point
    new_x = torch.tensor([np.concatenate((best_x, np.array([best_scale, t]))) for t in range(1, best_timestep + 1)])
    new_y = torch.tensor([func(t, best_scale, best_x) for t in range(1, best_timestep + 1)])
    train_x = torch.cat([train_x, new_x], dim=0)
    train_y = torch.cat([train_y, new_y], dim=0)

    model.set_train_data(train_x, train_y, strict=False)

  minimize(


Remaining results 10
Constraint: 0.0. Success: True
Remaining results 10
Constraint: -2.220446049250313e-16. Success: True




Remaining results 10
Constraint: 0.0. Success: True
Remaining results 0


In [27]:
sampled_points

[(6, 60, array([1., 0., 0., 0., 0.]))]

In [36]:
np.isclose(1, 1.00001)

True