# Create and test the generalized Stochastic Sampling (gSS) model 

## In this notebook we do it "by hand", i.e. using granular interfaces such as the ``Keras`` functional interface

Here we create a ``hidim`` version of the model, with the ``Adam`` optimizer for the frequency bounds (a.k.a. scales) and linear regression for the outer (linear) weights.

In [None]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell) -- presumably so that edits to the local ``nnu`` package
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Uncomment to switch matplotlib to the interactive widget backend.
#%matplotlib widget

In [None]:
import os
from datetime import datetime
from functools import reduce
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from tensorflow import keras
from tqdm.keras import TqdmCallback


# our stuff
from nnu import points_generator as pgen
from nnu import laplace_kernel, fit_function_factory
from nnu import gss_kernels, gss_layer

# globals
# Print NumPy arrays with 3 digits of precision; suppress=False leaves
# scientific notation enabled for very small or very large values.
np.set_printoptions(precision =3, suppress=False)

In [None]:
# Select the floating-point precision used throughout the notebook.
# Keras receives the string name; TensorFlow and NumPy get the matching dtype
# objects (any value other than 'float32' falls back to 64-bit).
keras_dtype = 'float32'
tf.keras.backend.set_floatx(keras_dtype)
if keras_dtype == 'float32':
    tf_dtype, np_dtype = tf.float32, np.float32
else:
    tf_dtype, np_dtype = tf.float64, np.float64


## Create the function we want to fit

In [None]:
# Target function configuration (kernel type ``LpM``; presumably a mixture of
# Laplace-type kernels, judging by the names used here).
ndim = 2
laplace_mixture = fit_function_factory.KernelType.LpM
laplace_shift = 1
off_diag_correl = 0.0

# Per-coordinate standard deviations: keep the last ``ndim`` entries.
stds = [1.5, 1.0, 0.5][-ndim:]
covar_matr = laplace_kernel.simple_covar_matrix(stds, off_diag_correl)

# Three centres are listed in 3-D; only the first ``ndim`` coordinates are kept.
means = np.array([[1, 1, 0], [-1, -1, 0], [0.5, -0.5, 0]])[:, :ndim]
cov_multipliers = [0.5, 0.3, 0.1]
# NOTE(review): these weights sum to 1.1, not 1 -- confirm this is intended.
mix_weights = [0.4, 0.35, 0.35]

function_to_fit = fit_function_factory.generate_nd(
    laplace_mixture,
    covar_matr,
    shift=laplace_shift,
    means=means,
    cov_multipliers=cov_multipliers,
    mix_weights=mix_weights,
)


## Generate the learning set (inputX, inputY)


In [None]:

# Learning-set configuration.
points_type = "random"
input_seed = 1917
nsamples = 10000
sim_range = 4

# Only the first item returned by generate_points is used as the sample points.
inputX = pgen.generate_points(
    sim_range,
    nsamples,
    ndim,
    points_type,
    seed=input_seed,
)[0]
# Evaluate the target function at the sampled points.
inputY = function_to_fit(inputX)


## Generate nodes for our model


In [None]:

# Node (knot) placement settings.
nodes_seed = 2022
nodes_type = "random"
# nodes_type="regular"

# Spread the nodes over a range stretched by ``nsr_stretch`` relative to the
# sampling range of the learning set.
nsr_stretch = 1.2
nodes_sim_range = sim_range * nsr_stretch

nnodes = 200
nodes = pgen.generate_points(
    nodes_sim_range,
    nnodes,
    ndim,
    nodes_type,
    seed=nodes_seed,
    plot=0,
)[0]
# Re-read the node count from what was actually generated (presumably it can
# differ from the request for some layouts -- verify against pgen).
nnodes = len(nodes)

nnodes_per_dim = round(nnodes ** (1.0 / ndim))
global_scale = pgen.average_distance(nodes)

## Set up a generalized stochastic sampling (gSS) model
We use Keras's functional interface, where we perform regression on the outer coefficients for each guess of the inner ones.
Note that here we are setting up a ``hidim`` (see paper) version of the model, but this can be changed by setting ``scales_dim`` to another value of the ``gss_layer.ScalesDim`` enum.

In [None]:
# Build the training graph with the Keras functional API.
#
# The learning set (inputX, inputY) is baked into the graph as constants, so
# the Keras input is only a dummy. Three nested models are produced:
#   regr_model    -> regression coefficients (outer weights)
#   predict_model -> fitted y values at inputX
#   model         -> residual y - fit, which is trained towards zero
optimize_knots=False
optimize_scales=True
scales_dim=gss_layer.ScalesDim.OnePerKnot # this is for hidim flavour of the model
apply_final_aggr = False

# Initial scales are all ones: one row per knot (same shape as ``nodes``) for
# OnePerKnot, otherwise a single (1, ndim) row.
scales_init = np.ones((1,ndim)) if scales_dim != gss_layer.ScalesDim.OnePerKnot else np.ones_like(nodes)

print_model_summary = True

# Dummy input: the lambdas below ignore its values entirely.
fake_dim = 1
k_fake_input = keras.Input((fake_dim,), name = 'input', dtype=tf_dtype)

# "sideload" inputX -- a node that always returns inputX
xvals = keras.layers.Lambda(
    lambda _: tf.constant(inputX),
    input_dim = fake_dim,
    name='xpts'
)(k_fake_input)

# "sideload" inputY -- a node that always returns inputY
# (a trailing axis is added so it can serve as the right-hand side of the
# least-squares solve and be subtracted from the matmul result below)
yvals = keras.layers.Lambda(
    lambda _: tf.expand_dims(tf.constant(inputY),-1),
    input_dim = fake_dim,
    name='ypts'
)(k_fake_input)

# Construct  ProdKernelLayer for inputX. Here we have some trainable parameters that will later be optimized
# most typically scales. Nodes have been pre-set
# The per-coordinate kernel is a Lanczos kernel whose frequency bound is tied
# to the average node distance (global_scale).
prod_kernel = gss_kernels.lanczos_kernel(a=2.0, freq_bound=0.9/global_scale, tf_dtype = tf_dtype)
per_coord_kernels_at_input_x = gss_layer.ProdKernelLayer(
    input_dim = ndim,
    knots=nodes,
    scales=scales_init,
    optimize_knots=optimize_knots,
    optimize_scales=optimize_scales,
    scales_dim=scales_dim,
    activation = prod_kernel,
    name = 'prodkernel'
)(xvals)

# Apply coordinatewise product to the output of ProdKernelLayer to get actual kernels for inputX
# (the product runs over axis 2 -- presumably the coordinate axis of a
# (samples, knots, ndim) tensor; TODO confirm against ProdKernelLayer)
kernels_at_input_x = keras.layers.Lambda(
    lambda x: K.prod(x, axis=2),
    name='product'
)(per_coord_kernels_at_input_x)

# Regress inputY on the product kernels evaluated at inputX
# (least squares with an L2 regularizer of 0.01; the result is the outer weights)
regr_xy = keras.layers.Lambda(
    lambda xy: tf.linalg.lstsq(xy[0],xy[1],l2_regularizer=0.01),
    name = 'regr_xy'
)([kernels_at_input_x,yvals])

regr_model = keras.Model(
    inputs=k_fake_input, outputs=regr_xy, name="regr_xy_model")
regr_model.build(input_shape=(fake_dim,))
if print_model_summary:
    regr_model.summary()

# Call the regression model as a nested layer so that it can later be
# retrieved by name ('regr_xy_model') from the enclosing model.
regr_model_output = regr_model(k_fake_input)

# Now predict the values of y from the regression (in the optimizer, for fixed values of scales)
predict_y  = keras.layers.Lambda(
    lambda xc: tf.matmul(xc[0],xc[1]),
    name = 'predict_y'
)([kernels_at_input_x, regr_model_output])

# first build a model that predicts y so we can examine the results later
predict_model = keras.Model(inputs=k_fake_input, outputs=predict_y, name="predict_y_model")
predict_model.build(input_shape = (fake_dim,))

if print_model_summary:
    predict_model.summary()

# Same nesting trick: predict_model becomes the 'predict_y_model' layer of
# the final model.
predict_model_output = predict_model(k_fake_input)

# now add the residual
resid_xy  = keras.layers.Subtract(
    name = 'resid_xy'
)([yvals,predict_model_output])

# optionally sum up the squares inside the model
# (when enabled the model output is a single scalar instead of one residual
# per sample)
if apply_final_aggr:
    resid_xy = keras.layers.Lambda(
        lambda z:tf.reduce_sum(tf.square(z)),
        name = 'sumsq' 
    )(resid_xy)

# and create a model for the residual -- this is the one to optimize against zero
model = keras.Model(inputs=k_fake_input, outputs=resid_xy, name="fit_model")

model.build(input_shape = (fake_dim,))

if print_model_summary:
    model.summary()

# record starting point
# (the training cell restores these weights before each run)
init_weights = model.get_weights().copy()


## Set up and run the Adam optimizer for the frequency bounds (aka inner weights/scales)

In [None]:
# Settings for the Adam run over the inner weights (scales).
learn_rate = 0.1
n_epochs = 1  # in units of 100 epochs (multiplied by 100 when fitting)
batch_size = nsamples  # one batch covering the whole learning set

# Progress / logging switches.
use_tqdm = True
use_tb_callback = False

In [None]:
# Train the residual model with Adam and collect before/after statistics.

# Restore the recorded initial weights so that re-running this cell always
# starts the optimisation from the same point.
model.set_weights(init_weights)

# The model ignores the values fed to it (the data is baked in as constants),
# so inputs and targets are all zeros: the target "zero" is what the residual
# is driven towards. One row is used when the model aggregates the residual
# internally, otherwise batch_size rows.
output_dim = 1 if apply_final_aggr else batch_size
fake_x = np.zeros((output_dim, fake_dim), dtype=np_dtype)
fake_y = np.zeros(output_dim, dtype=np_dtype)

opt = keras.optimizers.Adam(learning_rate=learn_rate)
model.compile(loss="mean_squared_error", optimizer=opt, metrics=['mse','mae'])

callbacks = [TqdmCallback(verbose=0)] if use_tqdm else []
if use_tb_callback:
    # Build the TensorBoard callback only when it is requested: the log
    # directory is time-stamped so that each run gets its own folder.
    logdir = os.path.join("logs", datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    callbacks.append(tensorboard_callback)

# The timed section covers both evaluations, the fit and the final prediction.
start_time = timer()
stats_before = model.evaluate(fake_x, fake_y, batch_size= batch_size)
model.fit(fake_x, fake_y, epochs=int(100*n_epochs), batch_size=batch_size, 
    verbose=0, callbacks=callbacks)
stats_after = model.evaluate(fake_x, fake_y, batch_size= batch_size)
# Fitted y values at inputX, taken from the prediction sub-model.
fit = predict_model.predict(fake_x, batch_size= batch_size)
end_time = timer()
print(f'Time ellapsed = {end_time - start_time:.1f}')


In [None]:
# Report the fit quality on the learning set and plot actual vs fitted values.
#%matplotlib auto
%matplotlib inline
# Loss and metrics returned by model.evaluate before and after training.
print(f'stats before fit = {(stats_before)}') 
print(f'stats after fit =  {(stats_after)}')
# Relative L2 error of the fit on the learning set.
learn_error=np.linalg.norm(fit[:,0] - inputY)/np.linalg.norm(inputY)
# NOTE(review): the value printed as "r2" is 1 - relative L2 error, which is
# not the usual coefficient of determination -- confirm the intended metric.
print(f'r2 = {1 - learn_error:.4f}')
# A perfect fit would put all points on the diagonal.
plt.plot(fit[:,0], inputY, '.')
plt.title('learn: actual vs fit')
plt.show()

In [None]:
# %matplotlib auto
%matplotlib inline
plot_step = 1
ax = plt.axes(projection='3d')

ax.scatter(inputX[::plot_step,0], inputX[::plot_step,1],  inputY[::plot_step],
            cmap=cm.coolwarm, marker='.', alpha = 0.75, s=1, label = 'actual')
ax.scatter(inputX[::plot_step,0], inputX[::plot_step,1],  fit[::plot_step,0],
            cmap=cm.coolwarm, marker='.', alpha = 0.75, s = 1, label = 'fit')

plt.xlabel('x1')
plt.ylabel('x2')
plt.legend(loc = 'best')
plt.show()

## Create a model suitable for prediction on xs other than inputX

In [None]:
# Stand-alone prediction network that takes real x values as input:
#   1. 'prodkernel' -- our main layer, configured exactly like the one used
#      in the training graph
#   2. 'product'    -- coordinate-wise product over axis 2
#   3. 'final'      -- final aggregation: a bias-free linear Dense layer with
#      a single output
test_model = keras.Sequential([
    gss_layer.ProdKernelLayer(
        input_dim=ndim,
        knots=nodes,
        scales=scales_init,
        optimize_knots=optimize_knots,
        optimize_scales=optimize_scales,
        scales_dim=scales_dim,
        activation=prod_kernel,
        name='prodkernel',
    ),
    keras.layers.Lambda(lambda x: K.prod(x, axis=2), name='product'),
    keras.layers.Dense(1, activation='linear', name='final', use_bias=False),
])

test_model.build()


In [None]:
# Copy the trained parameters from the fitted model into test_model.

# Batch size for the regression sub-model: the leading dimension of the final
# layer's output shape, or 1 when that shape is empty.
output_shape = model.layers[-1].output_shape
output_size = 1 if output_shape == () else output_shape[0]

# Walk down the nested models by name to reach the trained kernel layer.
predict_model = model.get_layer('predict_y_model')
regr_model = predict_model.get_layer('regr_xy_model')

# Inner weights are read from the trained 'prodkernel' layer; outer weights
# are the regression coefficients produced by the regression sub-model.
inner_weights = regr_model.get_layer('prodkernel').get_weights()
outer_weights = regr_model.predict(fake_x, batch_size=output_size)

test_model.get_layer('prodkernel').set_weights(inner_weights)
test_model.get_layer('final').set_weights([outer_weights])

## Check that we recover the same outputs for model and test_model when using the same inputs

In [None]:
# Evaluate test_model on the learning inputs and plot its output against the
# fit obtained from the training graph; the points should lie on the diagonal
# if the weights were transferred correctly.
test_y = test_model.predict(inputX)
plt.plot(fit[:, 0], test_y, '.')
plt.show()


# Compare prediction vs actual for an independent set of points (test set)


In [None]:

testX = pgen.generate_points(
    sim_range, nsamples, ndim, points_type, seed=123)[0]
testY = function_to_fit(testX)

mc_error = 1-np.linalg.norm(inputY)/np.linalg.norm(testY)

testFit = test_model.predict(testX)

test_error = np.linalg.norm(testFit[:, 0] - testY)/np.linalg.norm(testY)
print(f'testing set r2 = {1 - test_error:.4f}')

%matplotlib inline
plt.plot(testFit[:, 0], testY, '.')
plt.title('test: actual vs fit')
plt.show()

print(f"mc error  :{mc_error:.4f}")
print(f"test error: {test_error:.4f}")
