This notebook walks you through how to set up a Gaussian process (GP) from any given bank of spectra.

The GPs come from the python package `george` and we "train" them using the package `emcee`. 

Once the GP is trained, we export it as a pickle object to then use with PTA data.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

from __future__ import division
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import sys,os,glob,h5py,time
import scipy.signal as ssig
import scipy.interpolate as interp

import scipy.linalg as sl
import scipy.special as ss
import scipy.constants as sc
import scipy.misc as scmisc
import scipy.integrate as si

import george
import george.kernels as kernels
import emcee, corner, pickle

In [None]:
import warnings
import matplotlib as mpl
# Silence noisy numpy floating-point warnings (e.g. log10 of zero, overflow)
np.seterr(divide='ignore', invalid='ignore', over='ignore')
warnings.filterwarnings("ignore", category=UserWarning)

# Plotting settings
# Reset to the default style FIRST (avoids dark backgrounds from a dark VS Code theme).
# style.use('default') restores matplotlib's default rcParams, so it has to run
# before the customizations below -- calling it afterwards silently undoes them.
mpl.style.use('default')
mpl.rc('font', **{'family': 'serif', 'sans-serif': ['Times'], 'size': 15})
mpl.rc('lines', solid_capstyle='round')
mpl.rc('mathtext', fontset='cm')
plt.rcParams.update({'grid.alpha': 0.5})

# Load Spectra

    The first step is to load the bank of spectra. 
    Make sure to double-check the dimensionality of the parameter space, and get the parameter limits.

In [None]:
# Start with Spectra from Luke
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the file.
fname = "/Users/lzkelley/programs/nanograv/ng15yr_astro_interp/Illustris_TestSpectra_N100_Obs20yr__mm13.hdf5"
# Open read-only; the handle stays open because later cells read datasets from it lazily.
spectra = h5py.File(fname, mode='r')

In [None]:
# Names of the datasets stored in the spectra file
list(spectra.keys())

In [None]:
# Dimensions of the 'gwb' dataset; later cells index it as [sample, frequency, realization]
spectra['gwb'].shape

In [None]:
# Print the (min, max) range covered by each model parameter, one line per parameter.
for _par_name in ('eccs_mu', 'hard_gamma', 'mm13_amp', 'mm13_slope', 'tdelay'):
    print(np.min(spectra[_par_name]), np.max(spectra[_par_name]))

## Compute the mean and std from all spectra realizations
    At each point in parameter space, we need to find the mean value and the standard deviation from all of the realizations that we have.

In [None]:
## NOTE - Only need to train GP on number of frequencies in PTA analysis !
NFREQS = 6

# Work in h_c^2: square the strain in the lowest NFREQS frequency bins.
gwb_spectra = np.square(spectra['gwb'][:, :NFREQS, :])

# Floor vanishing entries at h_c^2 = 1e-40 (i.e. h_c = 1e-20) so that log10 stays finite.
low_ind = np.where(gwb_spectra < 1e-40)
gwb_spectra[low_ind] = 1e-40

# log10 of the h_c^2 averaged over the realizations (last axis)
mean = np.log10(gwb_spectra.mean(axis=-1))

# Smooth Mean Spectra
## NOTE FOR LUKE - HOW MUCH SMOOTHING DO WE WANT TO DO ?
# smooth_mean = ssig.savgol_filter(mean, 7, 3)

# Scatter of log10(h_c^2) across the realizations
err = np.log10(gwb_spectra).std(axis=-1)

if np.isnan(err).any():
    print('Got a NAN issue')

In [None]:
## Example plot for the first point in parameter space (index 0): every realization
## of h_c^2, their mean, and the +/- 1 standard-deviation band.

freqs_plot = spectra['freqs'][:NFREQS]
# Read the realizations from disk once instead of once per plotted line.
h2_reals = spectra['gwb'][0, :NFREQS, :]**2      # indexed as [frequency, realization]
num_reals = h2_reals.shape[-1]                   # taken from the data, not hard-coded

fig, ax = plt.subplots()
for ii in range(num_reals):
    # Label a single line so the legend shows one entry for all realizations
    # (no need to over-plot realization 0 a second time).
    line_label = '{} Spectra'.format(num_reals) if ii == 0 else None
    ax.loglog(freqs_plot, h2_reals[:, ii], color='C0', alpha=0.3, zorder=0, label=line_label)

ax.loglog(freqs_plot, 10**mean[0], color='C1', label='Mean')
# ax.loglog(freqs_plot, 10**smooth_mean[0], color='C3', label='Smoothed Mean')
ax.fill_between(freqs_plot, 10**(mean[0]-err[0]), 10**(mean[0]+err[0]), color='C1', alpha=0.5)
ax.legend(loc=2)
ax.set_xlabel(r'GW Frequency [yr$^{-1}$]')
ax.set_ylabel(r'$h_{c}^{2}$')
plt.show()

# Train GP

    The next step is to set up the GP class.
    Things to note:
        - need to make sure that the GP has the same dimensionality as the parameter space from the spectra.
        - the GPs work better when they are trained on zero-mean data, so it's very important that we remove the mean values for the spectra at each frequency, BUT these values HAVE TO BE SAVED, because they are required to extract meaningful information back out of the GP once it is trained!

In [None]:
# Define a GP class containing the kernel parameter priors and a log-likelihood

class gaussproc(object):
    """Training data, kernel-parameter priors and log-posterior for a single GP.

    The kernel is ``a * ExpSquaredKernel(tau)``. The sampled parameter vector ``p``
    holds the natural logs of the kernel parameters: ``p[0] = ln(a)`` and
    ``p[1:] = ln(tau)``, with one ``tau`` entry per input dimension.

    Parameters
    ----------
    x : array_like, shape (n_samples, n_dim) or (n_samples,)
        Locations of the training points in parameter space.
    y : array_like, shape (n_samples,)
        Training values at those locations.
    yerr : array_like or None
        Uncertainties on ``y``; when None the default of ``george.GP.compute`` is used.

    Attributes
    ----------
    pmin, pmax : ndarray, shape (n_dim + 1,)
        Bounds of the uniform prior on ``p``.
    """

    def __init__(self, x, y, yerr=None):

        self.x = x
        self.y = y
        self.yerr = yerr

        # The number of GP parameters is one more than the number of spectra parameters
        # (one amplitude + one length scale per dimension), so derive it from `x`
        # instead of hard-coding it.
        x_shape = np.shape(x)
        n_dim = x_shape[1] if len(x_shape) > 1 else 1
        n_kpars = n_dim + 1
        self.pmax = np.full(n_kpars, 20.0)    # sampling ranges
        self.pmin = np.full(n_kpars, -20.0)
        self.emcee_flatchain = None
        self.emcee_flatlnprob = None
        self.emcee_kernel_map = None

    def lnprior(self, p):
        """Log of the uniform prior: a constant inside [pmin, pmax], -inf outside."""
        p = np.asarray(p, dtype=float)

        if np.all(p <= self.pmax) and np.all(p >= self.pmin):
            return np.sum(np.log(1.0 / (self.pmax - self.pmin)))

        return -np.inf

    def lnlike(self, p):
        """GP log-likelihood of the training data for log kernel parameters `p`.

        Returns -inf when the covariance matrix cannot be factorized or the
        likelihood evaluates to NaN.
        """
        p = np.asarray(p, dtype=float)

        # Update the kernel and compute the lnlikelihood.
        a, tau = np.exp(p[0]), np.exp(p[1:])

        try:
            gp = george.GP(a * kernels.ExpSquaredKernel(tau, ndim=len(tau)))
            #gp = george.GP(a * kernels.Matern32Kernel(tau))
            if self.yerr is None:
                # Let george apply its own default instead of passing None through.
                gp.compute(self.x)
            else:
                gp.compute(self.x, self.yerr)
            lnlike = gp.lnlikelihood(self.y, quiet=True)
        except np.linalg.LinAlgError:
            return -np.inf

        # A NaN probability would make the sampler fail; treat it as "impossible".
        if np.isnan(lnlike):
            return -np.inf

        return lnlike

    def lnprob(self, p):
        """Log-posterior (prior + likelihood) used as the target density for emcee."""
        log_prior = self.lnprior(p)
        # Skip the expensive likelihood (and avoid -inf + nan) outside the prior support.
        if not np.isfinite(log_prior):
            return -np.inf
        return log_prior + self.lnlike(p)

In [None]:
# `yobs` is only created in the next cell (as a copy of `mean`), so inspect `mean`
# here; this keeps the notebook runnable top-to-bottom on a fresh kernel.
mean.shape

In [None]:
## Load in the spectra data!

# The "y" data are the means and errors for the spectra at each point in parameter space
# yobs = smooth_mean.copy()
yobs = mean.copy()
yerr = err.copy()
GP_freqs = spectra['freqs'][:NFREQS].copy()

## Find mean in each frequency bin (remove it before analyzing with the GP) ##
# This allows the GPs to oscillate around zero, where they are better behaved.
yobs_mean = np.mean(yobs, axis=0)
# MAKE SURE TO SAVE THESE VALUES - THE GP IS USELESS WITHOUT THEM !
np.save('./Luke_Spectra_MEANS.npy', yobs_mean)

yobs -= yobs_mean[None, :]

# Diagnostic plot of the mean-subtracted training targets.
# `yobs` is a log10 offset, so 10**yobs is the ratio to the per-frequency mean and is
# directly comparable with the 10**(+/- err) band (squaring a zero-mean log quantity
# has no meaning on logarithmic axes).
xx = GP_freqs
for ii in range(yobs.shape[0]):
    # label one line only, so a legend would show a single entry
    plt.loglog(xx, 10**yobs[ii], color='C0', alpha=0.3, zorder=0,
               label='Mean-subtracted spectra' if ii == 0 else None)

plt.fill_between(xx, 10**(-err[0]), 10**(err[0]), color='C1', alpha=0.5)
plt.xlabel(r'GW Frequency [yr$^{-1}$]')
plt.ylabel(r'$h_{c}^{2}$ / mean')

plt.show()

In [None]:
## The "x" data are the actual parameter values:
## one row per point in parameter space, one column per model parameter.
PARAM_NAMES = ['eccs_mu', 'hard_gamma', 'mm13_amp', 'mm13_slope', 'tdelay']

# Size the array from the data rather than hard-coding (120, 5), so that a spectra
# bank with a different number of samples or parameters works unchanged.
xobs = np.column_stack([spectra[name][:] for name in PARAM_NAMES]).astype(float)

# Every parameter-space point needs a matching row of training targets.
assert xobs.shape[0] == yobs.shape[0], "xobs and yobs disagree on the number of samples"

In [None]:
# Instantiate a list of GP kernels and models [one for each frequency]

# Dimensionality of the parameter space, taken from the training inputs.
n_params = xobs.shape[1]

gp_george = []
k = []

for freq_ind in range(len(GP_freqs)):

    gp_george.append(gaussproc(xobs, yobs[:, freq_ind], yerr[:, freq_ind]))
    k.append(1.0 * kernels.ExpSquaredKernel([2.0] * n_params, ndim=n_params))

# Number of kernel parameters; identical for every frequency, so read it once.
num_kpars = len(k[0])
print(num_kpars)

In [None]:
# Sample the posterior distribution of the kernel parameters
# to find MAP value for each frequency.

# THIS WILL TAKE A WHILE... (~ 1 min per frequency)

# Sampler settings are the same for every frequency.
nwalkers, ndim = 36, num_kpars

sampler = []
for freq_ind in range(len(GP_freqs)):
    t_start = time.time()

    # Set up the sampler.
    sampler.append(emcee.EnsembleSampler(nwalkers, ndim, gp_george[freq_ind].lnprob))

    # Initialize the walkers in a tight ball around ln(1) = 0 for every kernel
    # parameter (sized by ndim rather than a hard-coded 6-element vector).
    p0 = [np.zeros(ndim) + 1e-4 * np.random.randn(ndim)
          for i in range(nwalkers)]

    print(freq_ind, "Running burn-in")
    p0, lnp, _ = sampler[freq_ind].run_mcmc(p0, 750)
    sampler[freq_ind].reset()

    # Restart all walkers from the best point found so far.
    print(freq_ind, "Running second burn-in")
    p = p0[np.argmax(lnp)]
    p0 = [p + 1e-8 * np.random.randn(ndim) for i in range(nwalkers)]
    p0, _, _ = sampler[freq_ind].run_mcmc(p0, 750)
    sampler[freq_ind].reset()

    print(freq_ind, "Running production")
    p0, _, _ = sampler[freq_ind].run_mcmc(p0, 1500)

    print('Completed in {} min'.format((time.time()-t_start)/60.) , '\n')

In [None]:
## Corner plot of the kernel-parameter posterior for one frequency bin.
## Change `ind` to inspect a different frequency.

ind = 0

posterior_samples = sampler[ind].flatchain
fig = corner.corner(posterior_samples, bins=50)
plt.show()

In [None]:
## Populate the GP class with the details of the kernel
## MAP values for each frequency.

for ii in range(len(GP_freqs)):

    gp_george[ii].chain = None
    # NOTE: do not assign `gp_george[ii].lnprob = None` here -- that shadows the
    # `lnprob` method on the instance, so the sampler cell can no longer be re-run.

    # Highest-posterior sample of the chain = MAP estimate of the (log) kernel parameters
    map_index = np.argmax(sampler[ii].flatlnprobability)
    gp_george[ii].kernel_map = sampler[ii].flatchain[map_index]
    #print(ii, gp_george[ii].kernel_map)

    # add-in mean yobs (freq) values
    gp_george[ii].mean_spectra = yobs_mean[ii]

In [None]:
## Save the trained GP as a pickle to be used with PTA data!
# The context manager guarantees that the file is flushed and closed.
with open("LukeSpectra_GP_120nodes_20yr_30freqs.pkl", "wb") as pkl_file:
    pickle.dump(gp_george, pkl_file)

# Testing the GP
    The following is some example code looking at how to extract predictions from the GP and test it against the input data.

In [None]:
# Read the trained GPs back from disk. Unpickling needs the `gaussproc` class
# to be defined in this session. Only unpickle files that you trust.
with open("LukeSpectra_GP_120nodes_20yr_30freqs.pkl", "rb") as pkl_file:
    gp_george = pickle.load(pkl_file)

In [None]:
## Set-up GP predictions ##
# If you are running this part of the code separately from the section above,
# you will need to re-define the GP class from above for this step to work!

gp = []
# One frequency per trained GP, f_k = k / T with T = 20 yr, expressed in Hz.
# The count comes from the loaded list, so it always matches the number of GPs
# that were actually trained (a hard-coded 30 overruns a shorter list).
GP_freqs = np.arange(1., len(gp_george) + 1.) / (20*365.25*86400.)

for ii in range(len(GP_freqs)):
    # kernel_map stores the natural log of the kernel parameters
    gp_kparams = np.exp(gp_george[ii].kernel_map)

    gp.append(george.GP(gp_kparams[0] * \
            george.kernels.ExpSquaredKernel(gp_kparams[1:],ndim=len(gp_kparams[1:])) ) )

    # Factorize the covariance of the training set so that predict() can be called.
    gp[ii].compute(gp_george[ii].x, gp_george[ii].yerr)

In [None]:
## Make a realization from the GP ##

#  A reminder of the spectra parameters:
# ['eccs_mu', 'hard_gamma', 'mm13_amp', 'mm13_slope', 'tdelay']
env_param = np.array([5.6249, -0.0807,  8.8394,  1.284 ,  5.9822])

# Column 0: predicted (mean-subtracted) log10 h_c^2; column 1: its 1-sigma uncertainty.
rho_pred = np.zeros((len(GP_freqs), 2))
for ii, freq in enumerate(GP_freqs):
    mu_pred, cov_pred = gp[ii].predict(gp_george[ii].y, [env_param])
    # A single query point gives one-element outputs; extract plain scalars.
    mu = np.ravel(mu_pred)[0]
    var = np.ravel(cov_pred)[0]
    if var < 0.0:
        # Numerical noise can push the predictive variance below zero;
        # fall back to a tiny non-negative error instead of taking sqrt of it.
        print('Negative predictive variance at frequency index {}'.format(ii))
        rho_pred[ii, 0], rho_pred[ii, 1] = mu, 1e-5 * abs(mu)
    else:
        rho_pred[ii, 0], rho_pred[ii, 1] = mu, np.sqrt(var)

## transforming from the zero-mean variable back to rho = log10(h_c^2)
## by adding the per-frequency means that were removed before training
rho = np.array([gp_george[ii].mean_spectra for ii in range(len(GP_freqs))]) + rho_pred[:,0]

hc = np.sqrt(10**rho)

In [None]:
## Making a plot ##

SEC_PER_YR = 365.25 * 86400.
n_plot_freqs = 30     # number of frequency bins of the raw spectra to show
node = 3              # index of the parameter-space point whose spectra are shown

# spectra['freqs'] is converted to Hz by dividing by the seconds per year
# (assumes the stored frequencies are in yr^-1 -- confirm against the spectra file).
freqs_hz = spectra['freqs'][:n_plot_freqs] / SEC_PER_YR
hc_reals = spectra['gwb'][node, :n_plot_freqs, :]     # indexed as [frequency, realization]

# the raw spectra #
for ii in range(hc_reals.shape[-1]):
    # label one line only -> a single legend entry for all realizations
    plt.loglog(freqs_hz, hc_reals[:, ii], color='C0', alpha=0.2, zorder=0,
               label='Original Spectra' if ii == 0 else None)

# the mean #
# `smooth_mean` is never defined (the smoothing step is commented out), so show the
# un-smoothed mean instead: h_c = sqrt(<h_c^2>) over the realizations.
plt.loglog(freqs_hz, np.sqrt(np.mean(hc_reals**2, axis=-1)), color='C1', label='Mean', lw=2)

# the GP realization #
# GP_freqs is already in Hz (k / 20 yr expressed in seconds), so no further conversion.
plt.loglog(GP_freqs, hc, color='C3', lw=2.5, label='GP')
plt.fill_between(GP_freqs, np.sqrt(10**(rho+rho_pred[:,1])), np.sqrt(10**(rho-rho_pred[:,1])), color='C3', alpha=0.5)


plt.xlabel('Observed GW Frequency [Hz]')
plt.xlim(1e-9,7e-8)
plt.ylabel(r'$h_{c} (f)$')
plt.ylim(1e-16, 1e-13)

plt.legend(loc=3)
#plt.savefig('./TrainedGP.pdf', bbox_inches='tight', dpi=500)