# IMGP - Semisupervised Learning - 1D Manifold

## Preamble

This notebook provides an example of how to perform Gaussian Process Regression on a 1D manifold. In this example we consider a semi-supervised learning scenario, namely only a fraction of the points sampled from the underlying manifold is labeled (the fraction is set by `num_train`, here 10%), while all sampled points are used to build the kernel.

In [1]:
%load_ext autoreload
%autoreload 2
    
import os
import sys
import math
import torch
import gpytorch
import numpy as np

sys.path.insert(0, os.path.abspath(os.path.join('../')))
from manifold_gp.kernels.riemann_matern_kernel import RiemannMaternKernel
from manifold_gp.models import RiemannGP, VanillaGP
from manifold_gp.utils import rmnist_dataset, vanilla_train, manifold_informed_train, test_model, NearestNeighbors

## Dataset

In [2]:
# Fraction of the sampled points that receive a label (consumed as int(num_train * N)).
num_train = 0.1

# Options forwarded to rmnist_dataset.
scaling = True
single_digit = True
regenerate = False
shuffle = False

# Standardization switches for the inputs and the targets.
normalize_x = False
normalize_y = True

# Optional constraint / prior on the kernel's graph bandwidth.
graphbandwidth_constraint = False
graphbandwidth_prior = False

# Checkpoint handling for both models: load, train, save.
load_manifold_model = False
load_vanilla_model = False
train_manifold_model = True
train_vanilla_model = True
save_manifold_model = False
save_vanilla_model = False

In [3]:
# Load the dataset; the third and sixth returned values are not used in this notebook.
sampled_x, sampled_y, _, test_x, test_y, _ = rmnist_dataset(
    scaling=scaling,
    single_digit=single_digit,
    regenerate=regenerate,
    shuffle=shuffle,
)

# Choose the labeled subset: seed the global RNG, draw a permutation of all sample
# positions and flag the first int(num_train * N) of them in a boolean mask.
torch.manual_seed(1337)
num_sampled = sampled_x.shape[0]
labeled_positions = torch.randperm(num_sampled)[:int(num_train*num_sampled)]
train_idx = torch.zeros(num_sampled).scatter_(0, labeled_positions, 1).bool()

# Disabled alternative (kept as a note): load the pre-generated arrays
# '../datasets/srmnist_{train,test}_{x,y}.npy', shuffle the samples with seed 1337 and
# label the first int(num_train * N) of them instead of a random subset.

# Move everything to the compute device; inputs are flattened to one row per sample.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
sampled_x = sampled_x.contiguous().to(device).flatten(start_dim=1)
sampled_y = sampled_y.contiguous().to(device)
train_x = sampled_x[train_idx]
train_y = sampled_y[train_idx]
test_x = test_x.contiguous().to(device).flatten(start_dim=1)
test_y = test_y.contiguous().to(device)

if normalize_x:
    # Per-feature statistics over all sampled points (labeled and unlabeled).
    mu_x = sampled_x.mean(dim=-2, keepdim=True)
    std_x = sampled_x.std(dim=-2, keepdim=True) + 1e-6
    for features in (sampled_x, train_x, test_x):
        features.sub_(mu_x).div_(std_x)
if normalize_y:
    # Target statistics come from the labeled subset only.
    mu_y = train_y.mean()
    std_y = train_y.std()
    for targets in (sampled_y, train_y, test_y):
        targets.sub_(mu_y).div_(std_y)

Generating SRMNIST


2024-07-31 18:50:12.390640: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 18:50:12.614725: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Statistics of the nearest-neighbour graph, needed only when a bandwidth constraint
# and/or prior is requested.
if any((graphbandwidth_constraint, graphbandwidth_prior)):
    knn = NearestNeighbors(sampled_x, nlist=1)
    # First element of the 10-neighbour search result, with the first column dropped
    # (presumably the self-match — TODO confirm against NearestNeighbors.search).
    search_result = knn.search(sampled_x, 10)
    edge_values = search_result[0][:, 1:]

    # Lower bound for the bandwidth, derived from the largest first-column edge value.
    largest_first_edge = edge_values[:,0].max()
    graphbandwidth_min = largest_first_edge.div(-4*math.log(1e-4)).sqrt()

    # Middle entry of the sorted per-point mean of the square-rooted edge values.
    sorted_mean_edges = edge_values.sqrt().mean(dim=1).sort()[0]
    median = sorted_mean_edges[int(round(edge_values.shape[0]*0.50))]

    # Gamma prior parameters expressed through the median and the lower bound.
    gamma_rate = 4*median/(median-graphbandwidth_min)**2
    gamma_concentration = gamma_rate * median + 1

    # Release the search structure and every intermediate that references its results.
    del knn, search_result, edge_values, largest_first_edge, sorted_mean_edges

## Model

In [5]:
%%capture
# Initial hyperparameters of the Euclidean baseline.
hypers_vanilla = {
    'likelihood.noise_covar.noise': 1e-2,
    'covar_module.base_kernel.lengthscale': 1.0,
    'covar_module.outputscale': 1.0,
}

# Baseline GP on the labeled points: Gaussian likelihood with a scaled Matern-5/2 kernel
# (a scaled RBF kernel was the alternative considered here).
vanilla_likelihood = gpytorch.likelihoods.GaussianLikelihood()
vanilla_kernel = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5))
model_vanilla = VanillaGP(train_x, train_y, vanilla_likelihood, vanilla_kernel).to(device)

model_vanilla.initialize(**hypers_vanilla)

In [6]:
%%capture
# Gaussian likelihood whose noise is constrained to stay above 1e-8.
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_constraint=gpytorch.constraints.GreaterThan(1e-8),
)

# Optional bandwidth constraint / prior; both rely on the graph statistics
# (graphbandwidth_min, gamma_concentration, gamma_rate) computed when the flags are set.
bandwidth_constraint = None
if graphbandwidth_constraint:
    bandwidth_constraint = gpytorch.constraints.GreaterThan(graphbandwidth_min)

bandwidth_prior = None
if graphbandwidth_prior:
    bandwidth_prior = gpytorch.priors.GammaPrior(gamma_concentration, gamma_rate)

# The manifold kernel is built on ALL sampled points, labeled or not.
riemann_kernel = RiemannMaternKernel(
    nu=2,
    x=sampled_x,
    nearest_neighbors=50,
    laplacian_normalization="randomwalk",
    num_modes=100,
    bump_scale=10.0,
    bump_decay=0.01,
    graphbandwidth_constraint=bandwidth_constraint,
    graphbandwidth_prior=bandwidth_prior,
)
kernel = gpytorch.kernels.ScaleKernel(riemann_kernel)

# train_idx tells the model which of the sampled points carry labels.
model = RiemannGP(train_x, train_y, likelihood, kernel, train_idx).to(device)

# Start the graph bandwidth from a prior draw when a prior is in use, otherwise from 0.5.
if graphbandwidth_prior:
    initial_graphbandwidth = kernel.base_kernel.graphbandwidth_prior.sample()
else:
    initial_graphbandwidth = 0.5

hypers = {
    'likelihood.noise_covar.noise': 1e-2,
    'covar_module.base_kernel.graphbandwidth': initial_graphbandwidth,
    'covar_module.base_kernel.lengthscale': 1.0,
    'covar_module.outputscale': 1.0,
}
model.initialize(**hypers)

## Train

In [7]:
# Hard-coded hyperparameter presets. Their provenance is not recorded in this notebook
# (presumably the result of an earlier optimization run — TODO confirm).
#
# They are applied only when training is skipped. Applying them unconditionally would
# silently replace the values set through model.initialize and change the starting
# point of the optimization on a top-to-bottom run; the recorded training log starts
# from lengthscale 1.000 / graphbandwidth 0.500, i.e. without these presets.
if not train_manifold_model:
    kernel.outputscale = 0.2433
    likelihood.noise = 0.0026
    kernel.base_kernel.graphbandwidth = 0.2020
    kernel.base_kernel.lengthscale = 1.8958

In [7]:
# Checkpoint location of the manifold model; shared by the load and the save step.
manifold_model_path = ('../models/srmnist_manifold_semisupervised.pth' if single_digit
                       else '../models/rmnist_manifold_semisupervised.pth')

if load_manifold_model:
    # map_location remaps the stored tensors onto the active device, so a checkpoint
    # written on a GPU machine can still be loaded when only the CPU is available.
    model.load_state_dict(torch.load(manifold_model_path, map_location=device))

if train_manifold_model:
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=0.0)
    # Halve the learning rate when the loss has not improved (relative threshold 1e-3)
    # for 50 scheduler steps.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=50, threshold=1e-3, threshold_mode='rel',
                                                           cooldown=0, min_lr=0, eps=1e-8)
    loss = manifold_informed_train(model, optimizer, max_iter=100, tolerance=1e-2, update_norm=None, num_rand_vec=100,
                                   max_cholesky=1000, cg_tolerance=1e-2, cg_max_iter=1000, scheduler=scheduler, verbose=True)

if save_manifold_model:
    torch.save(model.state_dict(), manifold_model_path)

Iteration: 0, Loss: 1791.816, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 1.985,	Lengthscale: 1.000, Graphbandwidth: 0.500
Iteration: 1, Loss: 1758.000, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 1.976,	Lengthscale: 1.006, Graphbandwidth: 0.504
Iteration: 2, Loss: 1722.714, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 1.968,	Lengthscale: 1.013, Graphbandwidth: 0.508
Iteration: 3, Loss: 1689.639, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 1.959,	Lengthscale: 1.019, Graphbandwidth: 0.512
Iteration: 4, Loss: 1658.169, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 1.950,	Lengthscale: 1.025, Graphbandwidth: 0.516
Iteration: 5, Loss: 1629.954, Lr: 0.01,	Noise Variance: 0.011,	Signal Variance: 1.942,	Lengthscale: 1.032, Graphbandwidth: 0.520
Iteration: 6, Loss: 1603.465, Lr: 0.01,	Noise Variance: 0.011,	Signal Variance: 1.934,	Lengthscale: 1.038, Graphbandwidth: 0.524
Iteration: 7, Loss: 1578.016, Lr: 0.01,	Noise Variance: 0.011,	Signal Variance: 1.925,	Lengthscal

In [8]:
# Checkpoint location of the vanilla model; shared by the load and the save step.
vanilla_model_path = ('../models/srmnist_vanilla_semisupervised.pth' if single_digit
                      else '../models/rmnist_vanilla_semisupervised.pth')

if load_vanilla_model:
    # map_location remaps the stored tensors onto the active device, so a checkpoint
    # written on a GPU machine can still be loaded when only the CPU is available.
    model_vanilla.load_state_dict(torch.load(vanilla_model_path, map_location=device))

if train_vanilla_model:
    optimizer_vanilla = torch.optim.Adam(model_vanilla.parameters(), lr=1e-2, weight_decay=0.0)
    # NOTE(review): this scheduler is constructed but not handed to vanilla_train
    # (scheduler=None below), so the learning rate stays fixed. Confirm whether that is
    # intentional; the manifold model is trained with its scheduler enabled.
    scheduler_vanilla = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_vanilla, mode='min', factor=0.5, patience=200, threshold=1e-3,
                                                                   threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-8)
    loss = vanilla_train(model_vanilla, optimizer_vanilla, max_iter=100, max_cholesky=1000, tolerance=1e-2,
                         cg_tolerance=1e-2, cg_max_iter=1000, scheduler=None, verbose=True)

if save_vanilla_model:
    torch.save(model_vanilla.state_dict(), vanilla_model_path)

Iteration: 0, Loss: 0.143, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 0.994,	Lengthscale: 1.006
Iteration: 1, Loss: 0.122, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 0.987,	Lengthscale: 1.013
Iteration: 2, Loss: 0.096, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 0.981,	Lengthscale: 1.019
Iteration: 3, Loss: 0.083, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 0.975,	Lengthscale: 1.025
Iteration: 4, Loss: 0.110, Lr: 0.01,	Noise Variance: 0.010,	Signal Variance: 0.969,	Lengthscale: 1.032
Iteration: 5, Loss: 0.076, Lr: 0.01,	Noise Variance: 0.009,	Signal Variance: 0.963,	Lengthscale: 1.038
Iteration: 6, Loss: 0.073, Lr: 0.01,	Noise Variance: 0.009,	Signal Variance: 0.956,	Lengthscale: 1.045
Iteration: 7, Loss: 0.065, Lr: 0.01,	Noise Variance: 0.009,	Signal Variance: 0.950,	Lengthscale: 1.051
Iteration: 8, Loss: 0.081, Lr: 0.01,	Noise Variance: 0.009,	Signal Variance: 0.944,	Lengthscale: 1.058
Iteration: 9, Loss: 0.077, Lr: 0.01,	Noise Variance: 0.009,	Signal Varian

## Evaluation

In [9]:
# Score the Euclidean baseline on the held-out test set.
rmse_vanilla, nll_vanilla = test_model(
    model_vanilla,
    test_x,
    test_y,
    noisy_test=True,
    base_model=None,
    max_cholesky=1000,
    cg_tolerance=1e-2,
    cg_iterations=1000,
)
print("RMSE Vanilla: ", rmse_vanilla)
print("NLL Vanilla: ", nll_vanilla)

RMSE Vanilla:  tensor(0.0666, device='cuda:0')
NLL Vanilla:  tensor(-0.8721, device='cuda:0')


In [10]:
# Score the manifold-informed model on the held-out test set; the trained vanilla
# model is supplied as base_model.
rmse, nll = test_model(
    model,
    test_x,
    test_y,
    noisy_test=True,
    base_model=model_vanilla,
    max_cholesky=1000,
    cg_tolerance=1e-2,
    cg_iterations=1000,
)
print("RMSE Geometric: ", rmse)
print("NLL Geometric: ", nll)

RMSE Geometric:  tensor(0.0191, device='cuda:0')
NLL Geometric:  tensor(-1.2322, device='cuda:0')
