# Benchmark Manifold GP Semi-Supervised Learning

## Preamble

This notebook benchmarks Gaussian Process Regression with a manifold (Riemannian Matérn) kernel against a standard Euclidean Matérn GP on a real-world dataset. In this example we consider a semi-supervised learning scenario, namely only a subset (25%) of the points sampled from the underlying manifold is labeled; all sampled points, labeled or not, are used as the nodes from which the manifold kernel is built.

In [1]:
import torch
import gpytorch
import numpy as np
import os
import scipy.spatial as ss
from scipy.io import loadmat
from time import time
from manifold_gp.kernels.riemann_matern_kernel import RiemannMaternKernel
from manifold_gp.models.riemann_gp import RiemannGP
from manifold_gp.models.vanilla_gp import VanillaGP
from gpytorch.priors import NormalPrior, GammaPrior

## Dataset Preprocessing

### Load & Settings

In [2]:
# --- Experiment settings ------------------------------------------------------
# Available datasets: bike, buzz_tomshardware, buzz_twitter, ctslices, elevators,
# protein, song, mnist, mnist_single
dataset = 'ctslices'
samples_split = 0.5          # fraction of rows used as sampled points; the rest is the test set
preprocess = True            # deduplicate and reshuffle the sampled points
normalize_features = False   # standardize the inputs
normalize_labels = True      # standardize the targets
sample_dist = False          # plot the nearest-neighbour distance histogram
train_dist = False           # not read by any of the visible cells
# torch.manual_seed(1337)    # left disabled; note the shuffle below uses NumPy's global RNG

# --- Load and split -----------------------------------------------------------
# Each row is one observation: every column but the last is a feature, the last
# column is the regression target. Rows are shuffled before splitting.
data = np.load('../datasets/' + dataset + '.npy')
row_order = np.random.permutation(data.shape[0])
data = data[row_order]
num_samples = int(samples_split * len(data))

sampled_rows, test_rows = data[:num_samples], data[num_samples:]
sampled_x, sampled_y = sampled_rows[:, :-1], sampled_rows[:, -1]
test_x, test_y = test_rows[:, :-1], test_rows[:, -1]

# x = x - x.min(0)[0]
# x = 2 * (x/ x.max(0)[0]) - 1
# mu, std = x.mean(axis=0), x.std(axis=0)
# x = (x - mu)/(std+ 1e-6)

In [3]:
if preprocess:
    # Drop coincident feature rows. np.unique returns the unique rows in sorted
    # order together with the index of each row's first occurrence, which is
    # used to pick the matching label.
    sampled_x, id_unique = np.unique(sampled_x, return_index=True, axis=0)
    sampled_y = sampled_y[id_unique]

    # Disabled experiment, kept for reference:
    # # cut between x% and y% percentile of distances
    # num_avg = 1
    # p_start, p_end = 0.1, 0.9
    # num_samples = sampled_x.shape[0]
    # import faiss
    # res = faiss.StandardGpuResources()
    # knn = faiss.GpuIndexIVFFlat(res, sampled_x.shape[1], 1, faiss.METRIC_L2)
    # knn.train(sampled_x)
    # knn.add(sampled_x)
    # v = np.sqrt(knn.search(sampled_x, num_avg+1)[0][:,1:])
    # idx = np.argsort(v.mean(axis=1))
    # idx = np.delete(idx, np.arange(int(num_samples*p_start),int(num_samples*p_end)))
    # sampled_x = np.delete(sampled_x, idx, axis=0)
    # sampled_y = np.delete(sampled_y, idx)
    # del knn

    # Undo the sorted order introduced by np.unique: shuffle features and
    # labels with one shared permutation.
    idx = np.random.permutation(len(sampled_x))
    sampled_x, sampled_y = sampled_x[idx], sampled_y[idx]

# Number of sampled points (after the optional deduplication).
m = sampled_x.shape[0]

In [4]:
if sample_dist:
    # Optional diagnostic: histogram of the distance from every sampled point to
    # its nearest neighbour. faiss (GPU build) and matplotlib are imported here
    # so they are only required when the diagnostic is switched on.
    import faiss
    import matplotlib.pyplot as plt

    res = faiss.StandardGpuResources()
    knn = faiss.GpuIndexIVFFlat(res, sampled_x.shape[1], 1, faiss.METRIC_L2)
    knn.train(sampled_x)
    knn.add(sampled_x)

    # Ask for 2 neighbours and drop the first column, which is presumably the
    # query point itself; the square root assumes faiss returns squared L2
    # distances for METRIC_L2.
    v, i = knn.search(sampled_x, 2)
    v = np.sqrt(v[:, 1:]).mean(axis=1)

    # density=False plots raw counts per bin, so the y-axis is a count, not a
    # probability. (Sorting the values first is unnecessary for a histogram.)
    plt.hist(v, density=False, bins=100)
    plt.ylabel('Count')
    plt.xlabel('Data')

### Trainset & Testset

In [5]:
# The labeled subset is the first 25% of the sampled points; `train_idx` holds
# the positions of those points inside `sampled_x`.
train_split = int(0.25 * m)
train_idx = torch.arange(0, train_split)
train_x, train_y = sampled_x[:train_split], sampled_y[:train_split]

# Convert to float32 tensors.
# NOTE: `train_x` is a NumPy view of `sampled_x`, torch.from_numpy shares memory
# with its source array, and .float() returns the same tensor when the data is
# already float32. The tensors below may therefore alias each other (and the
# arrays loaded from disk), which is why the normalization that follows is done
# out of place rather than with sub_/div_.
sampled_x = torch.from_numpy(sampled_x).float()
train_x, train_y = torch.from_numpy(train_x).float(), torch.from_numpy(train_y).float()
test_x, test_y = torch.from_numpy(test_x).float(), torch.from_numpy(test_y).float()

if normalize_features:
    # Feature statistics come from all sampled points (labeled and unlabeled).
    mu_x = sampled_x.mean(dim=-2, keepdim=True)
    std_x = sampled_x.std(dim=-2, keepdim=True) + 1e-6  # guard against constant columns
    # Out-of-place on purpose: an in-place update of `sampled_x` would also
    # modify an aliased `train_x`, which would then be normalized twice.
    sampled_x = (sampled_x - mu_x) / std_x
    train_x = (train_x - mu_x) / std_x
    test_x = (test_x - mu_x) / std_x

if normalize_labels:
    # Label statistics come from the labeled training points only and are
    # applied unchanged to the test labels.
    mu_y, std_y = train_y.mean(), train_y.std()
    train_y = (train_y - mu_y) / std_y
    test_y = (test_y - mu_y) / std_y

### Move Data to Device

In [6]:
# Use the GPU when one is available, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Make every tensor contiguous in memory, then transfer it to the chosen device.
sampled_x = sampled_x.contiguous().to(device)
train_idx = train_idx.contiguous().to(device)
train_x = train_x.contiguous().to(device)
train_y = train_y.contiguous().to(device)
test_x = test_x.contiguous().to(device)
test_y = test_y.contiguous().to(device)

## Vanilla Model

In [7]:
%%capture
model_vanilla = VanillaGP(
    train_x, 
    train_y, 
    gpytorch.likelihoods.GaussianLikelihood(), 
    gpytorch.kernels.ScaleKernel(
        # gpytorch.kernels.RBFKernel()
        gpytorch.kernels.MaternKernel(nu=2.5)
    ) # gpytorch.kernels.RBFKernel(), gpytorch.kernels.RFFKernel(100)
).to(device)
model_vanilla_name = dataset + '_vanilla'

In [8]:
# Fit the baseline: 200 iterations at learning rate 0.1, logging every iteration.
model_vanilla.vanilla_train(
    lr=1e-1,
    iter=200,
    verbose=True,
)

Iteration: 0, Loss: 1.381, Noise Variance: 0.833, Signal Variance: 0.833, Lengthscale: 0.693
Iteration: 1, Loss: 1.360, Noise Variance: 0.803, Signal Variance: 0.803, Lengthscale: 0.744
Iteration: 2, Loss: 1.340, Noise Variance: 0.773, Signal Variance: 0.774, Lengthscale: 0.798
Iteration: 3, Loss: 1.319, Noise Variance: 0.745, Signal Variance: 0.745, Lengthscale: 0.855
Iteration: 4, Loss: 1.297, Noise Variance: 0.716, Signal Variance: 0.719, Lengthscale: 0.914
Iteration: 5, Loss: 1.276, Noise Variance: 0.689, Signal Variance: 0.694, Lengthscale: 0.975
Iteration: 6, Loss: 1.253, Noise Variance: 0.662, Signal Variance: 0.673, Lengthscale: 1.040
Iteration: 7, Loss: 1.229, Noise Variance: 0.636, Signal Variance: 0.657, Lengthscale: 1.107
Iteration: 8, Loss: 1.201, Noise Variance: 0.610, Signal Variance: 0.645, Lengthscale: 1.176
Iteration: 9, Loss: 1.170, Noise Variance: 0.585, Signal Variance: 0.639, Lengthscale: 1.249
Iteration: 10, Loss: 1.138, Noise Variance: 0.560, Signal Variance: 0.

In [9]:
# torch.save(model_vanilla.state_dict(), '../outputs/models/' + model_vanilla_name + '.pth')
# model_vanilla.load_state_dict(torch.load('../outputs/models/' + model_vanilla_name + '.pth'))

## Model

In [10]:
%%capture
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_constraint=gpytorch.constraints.GreaterThan(1e-8),
    noise_prior=None  # NormalPrior(torch.tensor([0.0]).to(device),  torch.tensor([1/9]).sqrt().to(device))
)

kernel = gpytorch.kernels.ScaleKernel(
    RiemannMaternKernel(
        nu=3,
        nodes=sampled_x,
        neighbors=10,
        operator="randomwalk",
        method="exact",
        modes=1000,
        ball_scale=10.0,
        ball_decay=0.01,
        prior_bandwidth=False,
    ),
    outputscale_prior=None # NormalPrior(torch.tensor([1.0]).to(device),  torch.tensor([1/9]).sqrt().to(device))
)

model = RiemannGP(train_x, train_y, likelihood, kernel, train_idx).to(device)
model_name = dataset+'_semisup_nu'+str(int(kernel.base_kernel.nu.item()))+'_k'+str(kernel.base_kernel.neighbors)

## Train

In [11]:
%%capture
hypers = {
    'likelihood.noise_covar.noise': 1e-2,
    'covar_module.base_kernel.epsilon': 1.5, # kernel.base_kernel.epsilon_prior.sample()
    'covar_module.base_kernel.lengthscale': 10.0,
    'covar_module.outputscale': 1.0,
}
model.initialize(**hypers)

In [12]:
# Run the model's manifold-informed training routine for 100 iterations.
model.manifold_informed_train(
    lr=1e-1,
    iter=100,
    decay_step_size=100,
    decay_magnitude=1.0,
    norm_step_size=100,
    norm_rand_vec=100,
    verbose=True,
    save=False,
)

# Report the hyperparameters reached at the end of this stage.
print(
    "NoiseVar: ", likelihood.noise.item(),
    "SignalVar: ", kernel.outputscale.item(),
    "Lengthscale: ", kernel.base_kernel.lengthscale.item(),
    "Epsilon", kernel.base_kernel.epsilon.item(),
)

# Hand cached, currently unused GPU memory back to the driver.
torch.cuda.empty_cache()

Iter: 0, Loss: 2762.475, NoiseVar: 0.010, SignalVar: 0.00106, Lengthscale: 10.000, Epsilon: 1.500
Iter: 1, Loss: 1975.751, NoiseVar: 0.009, SignalVar: 0.00096, Lengthscale: 9.900, Epsilon: 1.423
Iter: 2, Loss: 1153.589, NoiseVar: 0.008, SignalVar: 0.00087, Lengthscale: 9.801, Epsilon: 1.348
Iter: 3, Loss: 572.540, NoiseVar: 0.007, SignalVar: 0.00079, Lengthscale: 9.703, Epsilon: 1.275
Iter: 4, Loss: -99.045, NoiseVar: 0.007, SignalVar: 0.00073, Lengthscale: 9.609, Epsilon: 1.205
Iter: 5, Loss: -774.453, NoiseVar: 0.006, SignalVar: 0.00067, Lengthscale: 9.523, Epsilon: 1.136
Iter: 6, Loss: -1716.743, NoiseVar: 0.006, SignalVar: 0.00062, Lengthscale: 9.459, Epsilon: 1.069
Iter: 7, Loss: -2853.274, NoiseVar: 0.007, SignalVar: 0.00057, Lengthscale: 9.436, Epsilon: 1.003
Iter: 8, Loss: -3789.977, NoiseVar: 0.007, SignalVar: 0.00053, Lengthscale: 9.457, Epsilon: 0.940
Iter: 9, Loss: -5309.148, NoiseVar: 0.008, SignalVar: 0.00053, Lengthscale: 9.504, Epsilon: 0.897
Iter: 10, Loss: -4460.978, 

In [13]:
# torch.save(model.state_dict(), '../outputs/models/'+model_name+'.pth')
# model.load_state_dict(torch.load('../outputs/models/'+model_name+'.pth'))

In [14]:
# Report the current hyperparameters of the manifold model.
print(
    "NoiseVar: ", likelihood.noise.item(),
    "SignalVar: ", kernel.outputscale.item(),
    "Lengthscale: ", kernel.base_kernel.lengthscale.item(),
    "Epsilon", kernel.base_kernel.epsilon.item(),
)

NoiseVar:  0.02284744195640087 SignalVar:  0.19280467927455902 Lengthscale:  12.972372055053711 Epsilon 1.3252463340759277


### Extract EigenPairs

In [15]:
# Reconfigure the manifold kernel before regenerating its eigenpairs: the kernel
# was constructed with method="exact" and modes=1000; here the method is switched
# to 'lanczos' and the number of modes is raised to 2000.
kernel.base_kernel.method = 'lanczos'
kernel.base_kernel.modes = 2000
# Presumably recomputes and stores the eigenpairs using the two settings above,
# so both assignments must precede this call — TODO confirm against RiemannMaternKernel.
kernel.base_kernel.generate_eigenpairs()

lanczos


### Train with Fixed number of Eigenfunctions

In [16]:
# Second training stage, run after the eigenpairs were regenerated:
# 200 iterations at learning rate 0.1, logging every iteration.
model.vanilla_train(
    lr=1e-1,
    iter=200,
    verbose=True,
)

# Report the hyperparameters reached at the end of this stage.
print(
    "NoiseVar: ", likelihood.noise.item(),
    "SignalVar: ", kernel.outputscale.item(),
    "Lengthscale: ", kernel.base_kernel.lengthscale.item(),
    "Epsilon", kernel.base_kernel.epsilon.item(),
)

Iter: 0, Loss: -0.070, NoiseVar: 0.023, SignalVar: 0.19280, Lengthscale: 12.972, Epsilon: 1.325
Iter: 1, Loss: -0.113, NoiseVar: 0.021, SignalVar: 0.21108, Lengthscale: 13.072, Epsilon: 1.325
Iter: 2, Loss: -0.154, NoiseVar: 0.019, SignalVar: 0.23079, Lengthscale: 13.172, Epsilon: 1.325
Iter: 3, Loss: -0.191, NoiseVar: 0.017, SignalVar: 0.25194, Lengthscale: 13.271, Epsilon: 1.325
Iter: 4, Loss: -0.227, NoiseVar: 0.015, SignalVar: 0.27453, Lengthscale: 13.370, Epsilon: 1.325
Iter: 5, Loss: -0.260, NoiseVar: 0.014, SignalVar: 0.29853, Lengthscale: 13.468, Epsilon: 1.325
Iter: 6, Loss: -0.292, NoiseVar: 0.013, SignalVar: 0.32392, Lengthscale: 13.565, Epsilon: 1.325
Iter: 7, Loss: -0.322, NoiseVar: 0.011, SignalVar: 0.35068, Lengthscale: 13.661, Epsilon: 1.325
Iter: 8, Loss: -0.351, NoiseVar: 0.010, SignalVar: 0.37880, Lengthscale: 13.756, Epsilon: 1.325
Iter: 9, Loss: -0.380, NoiseVar: 0.009, SignalVar: 0.40827, Lengthscale: 13.849, Epsilon: 1.325
Iter: 10, Loss: -0.408, NoiseVar: 0.009,

In [17]:
# torch.save(model.state_dict(), '../outputs/models/'+model_name+'.pth')
# model.load_state_dict(torch.load('../outputs/models/'+model_name+'.pth'))

## Evaluation

In [18]:
%%capture
likelihood.eval()
model.eval()
model_vanilla.likelihood.eval()
model_vanilla.eval()
torch.cuda.empty_cache()
model.vanilla_model = model_vanilla


## Metrics

In [19]:
# Evaluate the posterior at the test inputs once, without tracking gradients.
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.settings.cg_tolerance(10000):
    # The return value is discarded: presumably the call caches its results on
    # `model` (the metric cells read model.posterior_vanilla / model.posterior_manifold
    # and call model.mean(...)) — TODO confirm against RiemannGP.
    model.posterior(test_x, noise=True)
    # One minus the kernel's bump function at the test inputs; the hybrid metric
    # cell uses it to weight the vanilla covariance.
    bump_scale = 1-kernel.base_kernel.bump_function(test_x)

### Vanilla

In [20]:
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.settings.cg_tolerance(10000):
    # Residuals of the vanilla predictive mean and the matching predictive covariance.
    error = test_y - model.mean("vanilla")
    covar = model.posterior_vanilla.lazy_covariance_matrix.evaluate_kernel()

    # Quadratic form error^T K^-1 error and log|K| from a single call.
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=error.unsqueeze(-1), logdet=True)

    num_test = error.size(-1)
    rmse = error.square().mean().sqrt()
    # Gaussian negative log-likelihood, averaged over the test points.
    nll = 0.5 * (inv_quad + logdet + num_test * np.log(2 * np.pi)) / num_test

torch.cuda.empty_cache()
print("RMSE: ", rmse)
print("NLL: ", nll)

RMSE:  tensor(0.1152, device='cuda:0')
NLL:  tensor(-1.3309, device='cuda:0')


### Manifold

In [21]:
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.settings.cg_tolerance(10000):
    # Residuals of the manifold predictive mean and the matching predictive covariance.
    error = test_y - model.mean("manifold")
    covar = model.posterior_manifold.lazy_covariance_matrix.evaluate_kernel()

    # Quadratic form error^T K^-1 error and log|K| from a single call.
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=error.unsqueeze(-1), logdet=True)

    num_test = error.size(-1)
    rmse = error.square().mean().sqrt()
    # Gaussian negative log-likelihood, averaged over the test points.
    nll = 0.5 * (inv_quad + logdet + num_test * np.log(2 * np.pi)) / num_test

torch.cuda.empty_cache()
print("RMSE: ", rmse)
print("NLL: ", nll)

RMSE:  tensor(1.2250, device='cuda:0')
NLL:  tensor(-1.2051, device='cuda:0')


### Hybrid

In [22]:
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.settings.cg_tolerance(10000):
    # Residuals of the hybrid predictive mean.
    error = test_y - model.mean("hybrid")

    # Hybrid covariance = manifold covariance + bump-weighted vanilla covariance.
    # The sum MUST be parenthesised: without the parentheses the assignment ends
    # on its first line and the line starting with "+" becomes a separate
    # expression whose result is thrown away, so `covar` would silently be the
    # manifold covariance alone.
    # The lazy covariance is kept on the left of the elementwise product so the
    # multiplication is handled by the lazy matrix type.
    # NOTE: torch.outer builds a dense (n_test x n_test) weight matrix.
    bump_weights = torch.outer(bump_scale, bump_scale)
    covar = (
        model.posterior_manifold.lazy_covariance_matrix.evaluate_kernel()
        + model.posterior_vanilla.lazy_covariance_matrix.evaluate_kernel() * bump_weights
    )

    # Quadratic form error^T K^-1 error and log|K| from a single call.
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=error.unsqueeze(-1), logdet=True)
    rmse = (error.square().mean()).sqrt()
    # Gaussian negative log-likelihood, averaged over the test points.
    nll = 0.5 * sum([inv_quad, logdet, error.size(-1) * np.log(2 * np.pi)]) / error.size(-1)
torch.cuda.empty_cache()
print("RMSE: ", rmse)
print("NLL: ", nll)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.67 GiB. GPU 0 has a total capacty of 23.64 GiB of which 2.00 GiB is free. Including non-PyTorch memory, this process has 21.42 GiB memory in use. Of the allocated memory 17.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF