# IMGP - Supervised Learning - 1D Manifold

## Preamble

This notebook provides an example of how to perform Gaussian Process Regression on a 1D manifold. In this example we consider a supervised learning scenario, i.e., the number of labeled data points equals the number of points sampled from the underlying manifold. Note that the configuration below sets `num_train = 0.1`, so only that fraction of the sampled points is actually labeled in the run shown here.

In [1]:
import gc
import math

import torch
import gpytorch
import numpy as np

%matplotlib widget
import matplotlib.pyplot as plt

from manifold_gp.kernels.riemann_matern_kernel import RiemannMaternKernel
from manifold_gp.models import RiemannGP, VanillaGP
from manifold_gp.utils import rmnist_dataset, vanilla_train, manifold_informed_train, test_model



## Dataset

In [2]:
# Fraction of the sampled points that receive a label (used as
# int(num_train * number_of_sampled_points) when the training subset is drawn).
num_train = 0.1

# Dataset switches forwarded to `rmnist_dataset`.
scaling = True
single_digit = True
regenerate = False

# Standardization switches for the inputs (x) and the targets (y).
normalize_x = False
normalize_y = True

In [3]:
# Load the dataset; the switches come from the configuration cell.
sampled_x, sampled_y, test_x, test_y = rmnist_dataset(scaling=scaling, single_digit=single_digit, regenerate=regenerate)

# Run on the GPU when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Move every tensor to the selected device; inputs are flattened from dim 1 onward.
sampled_x = sampled_x.contiguous().to(device).flatten(start_dim=1)
sampled_y = sampled_y.contiguous().to(device)
test_x = test_x.contiguous().to(device).flatten(start_dim=1)
test_y = test_y.contiguous().to(device)

# Reproducibly draw a random subset of the sampled points to act as labeled data.
# `train_idx` is a boolean mask over the sampled points (True = labeled).
torch.manual_seed(1337)
num_sampled = sampled_x.shape[0]
labeled = torch.randperm(num_sampled)[:int(num_train*num_sampled)]
train_idx = torch.zeros(num_sampled).scatter_(0, labeled, 1).bool()
train_x, train_y = sampled_x[train_idx], sampled_y[train_idx]

# Standardize the inputs in place with statistics of all sampled points;
# the 1e-6 offset keeps the division finite where the std is zero.
if normalize_x:
    mu_x = sampled_x.mean(dim=-2, keepdim=True)
    std_x = sampled_x.std(dim=-2, keepdim=True) + 1e-6
    for inputs in (sampled_x, train_x, test_x):
        inputs.sub_(mu_x).div_(std_x)

# Standardize the targets in place with statistics of the labeled targets only.
if normalize_y:
    mu_y, std_y = train_y.mean(), train_y.std()
    for targets in (sampled_y, train_y, test_y):
        targets.sub_(mu_y).div_(std_y)

Loading SRMNIST


In [None]:
# Derive a lower bound and a Gamma prior for the kernel's graph bandwidth from
# the nearest-neighbor structure of the sampled points.
# NOTE(review): the module path of NearestNeighbors is assumed from the package
# layout — confirm before relying on this cell.
from manifold_gp.utils.nearest_neighbors import NearestNeighbors

knn = NearestNeighbors(sampled_x, nlist=1)
# First element of the search result, without its first column
# (presumably each point matched with itself — TODO confirm).
edge_values = knn.search(sampled_x, 10)[0][:, 1:]

# Lower bound: largest first-neighbor value divided by -4*ln(1e-4), then sqrt.
# Presumably `edge_values` holds squared distances (a sqrt is taken below), so
# this is the bandwidth at which exp(-d^2 / (4 * eps^2)) equals 1e-4 for the
# most isolated point — TODO confirm against the kernel definition.
graphbandwidth_min = edge_values[:,0].max().div(-4*math.log(1e-4)).sqrt()
graphbandwidth_constraint = gpytorch.constraints.GreaterThan(graphbandwidth_min)

# Middle order statistic (over points) of the mean square-rooted edge value.
median = edge_values.sqrt().mean(dim=1).sort()[0][int(round(edge_values.shape[0]*0.50))]
# Gamma prior whose mode, (concentration - 1) / rate, sits at `median`.
gamma_rate = 4*median/(median-graphbandwidth_min)**2
gamma_concentration = gamma_rate * median + 1
graphbandwidth_prior = gpytorch.priors.GammaPrior(gamma_concentration, gamma_rate)
# NOTE(review): the kernel construction in the Model section does not receive
# `graphbandwidth_constraint` / `graphbandwidth_prior` — confirm how they are
# meant to be consumed.

# Release the neighbor index and intermediate tensor before building the models.
del knn, edge_values
gc.collect()
torch.cuda.empty_cache()

## Model

In [4]:
%%capture
model_vanilla = VanillaGP(
    train_x, 
    train_y, 
    gpytorch.likelihoods.GaussianLikelihood(), 
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
).to(device)

hypers_vanilla = {
    'likelihood.noise_covar.noise': 1e-2,
    'covar_module.base_kernel.lengthscale': 1.0,
    'covar_module.outputscale': 1.0,
}
model_vanilla.initialize(**hypers_vanilla)

In [7]:
%%capture
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_constraint=gpytorch.constraints.GreaterThan(1e-8),
)

kernel = gpytorch.kernels.ScaleKernel(
    RiemannMaternKernel(
        nu=1,
        x=sampled_x,
        nearest_neighbors=10,
        laplacian_normalization="randomwalk",
        num_modes=50,
        bump_scale=10.0,
        bump_decay=1.0,
    )
)

model = RiemannGP(train_x, train_y, likelihood, kernel, train_idx).to(device)

hypers = {
    'likelihood.noise_covar.noise': 1e-2,
    'covar_module.base_kernel.graphbandwidth': 0.5,
    'covar_module.base_kernel.lengthscale': 1.0,
    'covar_module.outputscale': 1.0,
}
model.initialize(**hypers)

## Train

In [8]:
# Adam over all parameters of the geometric model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-1, weight_decay=0.0)

# Halve the learning rate once the monitored value has not improved by a
# relative 1e-3 for 50 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=50,
    threshold=1e-3,
    threshold_mode='rel',
    cooldown=0,
    min_lr=0,
    eps=1e-8,
    verbose=True,
)

# NOTE(review): the recorded run of this cell stopped with NotPSDError while
# the graph bandwidth was shrinking — confirm whether a lower bound on the
# bandwidth is supposed to be active here.
loss = manifold_informed_train(
    model,
    optimizer,
    max_iter=100,
    tolerance=1e-2,
    update_norm=100,
    num_rand_vec=100,
    max_cholesky=2000,
    cg_tolerance=1e-2,
    cg_max_iter=1000,
    scheduler=scheduler,
    verbose=True,
)



Iteration: 0, Loss: 1388.073, Noise Variance: 0.010, Signal Variance: 1.010, Lengthscale: 1.000, Graphbandwidth: 0.500
Iteration: 1, Loss: 1377.730, Noise Variance: 0.009, Signal Variance: 1.075, Lengthscale: 1.064, Graphbandwidth: 0.462
Iteration: 2, Loss: 1364.752, Noise Variance: 0.009, Signal Variance: 1.029, Lengthscale: 1.130, Graphbandwidth: 0.430
Iteration: 3, Loss: 1351.413, Noise Variance: 0.009, Signal Variance: 0.979, Lengthscale: 1.198, Graphbandwidth: 0.399
Iteration: 4, Loss: 1338.276, Noise Variance: 0.008, Signal Variance: 0.935, Lengthscale: 1.268, Graphbandwidth: 0.369
Iteration: 5, Loss: 1325.429, Noise Variance: 0.008, Signal Variance: 0.909, Lengthscale: 1.339, Graphbandwidth: 0.340
Iteration: 6, Loss: 1312.837, Noise Variance: 0.007, Signal Variance: 0.902, Lengthscale: 1.411, Graphbandwidth: 0.312
Iteration: 7, Loss: 1300.907, Noise Variance: 0.007, Signal Variance: 0.909, Lengthscale: 1.485, Graphbandwidth: 0.286
Iteration: 8, Loss: 1290.189, Noise Variance: 0.



NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04.

In [8]:
# Adam over all parameters of the baseline model.
optimizer_vanilla = torch.optim.Adam(model_vanilla.parameters(), lr=1e-1, weight_decay=0.0)

# NOTE(review): this scheduler is constructed but the training call below is
# given `scheduler=None` — confirm whether that is intentional.
scheduler_vanilla = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_vanilla,
    mode='min',
    factor=0.5,
    patience=200,
    threshold=1e-3,
    threshold_mode='rel',
    cooldown=0,
    min_lr=0,
    eps=1e-8,
    verbose=False,
)

loss = vanilla_train(
    model_vanilla,
    optimizer_vanilla,
    max_iter=500,
    max_cholesky=2000,
    tolerance=1e-2,
    cg_tolerance=1e-2,
    cg_max_iter=1000,
    scheduler=None,
    verbose=True,
)



Iteration: 0, Loss: 7.235, Noise Variance: 0.105, Signal Variance: 1.032, Lengthscale: 0.938
Iteration: 1, Loss: 5.819, Noise Variance: 0.110, Signal Variance: 1.063, Lengthscale: 0.879
Iteration: 2, Loss: 4.722, Noise Variance: 0.116, Signal Variance: 1.093, Lengthscale: 0.824
Iteration: 3, Loss: 3.875, Noise Variance: 0.121, Signal Variance: 1.123, Lengthscale: 0.772
Iteration: 4, Loss: 3.219, Noise Variance: 0.127, Signal Variance: 1.151, Lengthscale: 0.724
Iteration: 5, Loss: 2.711, Noise Variance: 0.132, Signal Variance: 1.178, Lengthscale: 0.680
Iteration: 6, Loss: 2.316, Noise Variance: 0.138, Signal Variance: 1.203, Lengthscale: 0.639
Iteration: 7, Loss: 2.010, Noise Variance: 0.143, Signal Variance: 1.227, Lengthscale: 0.602
Iteration: 8, Loss: 1.775, Noise Variance: 0.148, Signal Variance: 1.250, Lengthscale: 0.568
Iteration: 9, Loss: 1.594, Noise Variance: 0.153, Signal Variance: 1.271, Lengthscale: 0.537
Iteration: 10, Loss: 1.456, Noise Variance: 0.157, Signal Variance: 1.

## Evaluation

In [6]:
# Score the geometric model on the test split (RMSE and negative log-likelihood).
rmse, nll = test_model(
    model,
    test_x,
    test_y,
    noisy_test=True,
    max_cholesky=2000,
    cg_tolerance=1e-2,
    cg_iterations=1000,
)
for label, value in (("RMSE Geometric: ", rmse), ("NLL Geometric: ", nll)):
    print(label, value)

RMSE Geometric:  tensor(0.3881, device='cuda:0')
NLL Geometric:  tensor(-3.2201, device='cuda:0')


In [9]:
# Score the baseline model on the test split (RMSE and negative log-likelihood).
rmse_vanilla, nll_vanilla = test_model(
    model_vanilla,
    test_x,
    test_y,
    noisy_test=True,
    max_cholesky=2000,
    cg_tolerance=1e-2,
    cg_iterations=1000,
)
for label, value in (("RMSE Vanilla: ", rmse_vanilla), ("NLL Vanilla: ", nll_vanilla)):
    print(label, value)

RMSE Vanilla:  tensor(0.9982, device='cuda:0')
NLL Vanilla:  tensor(-3.0384, device='cuda:0')
