/
dgp_rff.py
423 lines (340 loc) · 20.7 KB
/
dgp_rff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
## Copyright 2016 Kurt Cutajar, Edwin V. Bonilla, Pietro Michiardi, Maurizio Filippone
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
from __future__ import print_function
import tensorflow as tf
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from dataset import DataSet
import utils
import likelihoods
import time
def current_milli_time():
    """Return the current wall-clock time as an integer number of milliseconds.

    Used by DgpRff.learn() to measure per-batch training time and enforce
    the overall time budget (``duration`` minutes).
    """
    return int(round(time.time() * 1000))
class DgpRff(object):
    """Deep Gaussian Process approximated with Random Fourier Features (RFF).

    Each GP layer is replaced by a two-stage Bayesian linear model: a random
    feature expansion Phi = phi(X * Omega) followed by a linear map
    F = Phi * W.  Variational inference is carried out over W and, depending
    on ``learn_Omega``, over the spectral frequencies Omega as well.  The
    whole model is built as a single TensorFlow (1.x) graph at construction
    time; ``learn`` then runs stochastic minimization of the negative ELBO.
    """
    def __init__(self, likelihood_fun, num_examples, d_in, d_out, n_layers, n_rff, df, kernel_type, kernel_arccosine_degree, is_ard, local_reparam, feed_forward, q_Omega_fixed, theta_fixed, learn_Omega):
        """
        :param likelihood_fun: Likelihood function
        :param num_examples: total number of input samples
        :param d_in: Dimensionality of the input
        :param d_out: Dimensionality of the output
        :param n_layers: Number of hidden layers
        :param n_rff: Number of random features for each layer
        :param df: Number of GPs for each layer
        :param kernel_type: Kernel type: currently only random Fourier features for RBF and arccosine kernels are implemented
        :param kernel_arccosine_degree: degree parameter of the arccosine kernel
        :param is_ard: Whether the kernel is ARD or isotropic
        :param local_reparam: Whether to use the local reparameterization trick or not
        :param feed_forward: Whether the original inputs should be fed forward as input to each layer
        :param q_Omega_fixed: Number of initial iterations during which q(Omega) is kept fixed (0 disables)
        :param theta_fixed: Number of initial iterations during which covariance parameters are kept fixed (0 disables)
        :param learn_Omega: How to treat Omega - fixed (from the prior), optimized, or learned variationally
        """
        self.likelihood = likelihood_fun
        self.kernel_type = kernel_type
        self.is_ard = is_ard
        self.feed_forward = feed_forward
        self.q_Omega_fixed = q_Omega_fixed
        self.theta_fixed = theta_fixed
        ## Flags start True when a positive "keep fixed for this many
        ## iterations" budget is given; learn() flips them off once the
        ## iteration count exceeds the budget.
        self.q_Omega_fixed_flag = q_Omega_fixed > 0
        self.theta_fixed_flag = theta_fixed > 0
        self.learn_Omega = learn_Omega
        self.local_reparam = local_reparam
        self.arccosine_degree = kernel_arccosine_degree

        ## These are all scalars
        self.num_examples = num_examples
        self.nl = n_layers ## Number of hidden layers
        self.n_Omega = n_layers ## Number of weigh matrices is "Number of hidden layers"
        self.n_W = n_layers

        ## These are arrays to allow flexibility in the future
        self.n_rff = n_rff * np.ones(n_layers, dtype = np.int32)
        self.df = df * np.ones(n_layers, dtype=np.int32)

        ## Dimensionality of Omega matrices
        if self.feed_forward:
            ## Layers beyond the first also receive the original inputs,
            ## so their input dimensionality grows by d_in.
            self.d_in = np.concatenate([[d_in], self.df[:(n_layers - 1)] + d_in])
        else:
            self.d_in = np.concatenate([[d_in], self.df[:(n_layers - 1)]])
        self.d_out = self.n_rff

        ## Dimensionality of W matrices
        if self.kernel_type == "RBF":
            ## RBF features stack cos and sin, hence W sees twice n_rff inputs
            self.dhat_in = self.n_rff * 2
            self.dhat_out = np.concatenate([self.df[:-1], [d_out]])
        if self.kernel_type == "arccosine":
            self.dhat_in = self.n_rff
            self.dhat_out = np.concatenate([self.df[:-1], [d_out]])

        ## When Omega is learned variationally, define the right KL function and the way Omega are constructed
        if self.learn_Omega == "var_resampled":
            self.get_kl = self.get_kl_Omega_to_learn
            self.sample_from_Omega = self.sample_from_Omega_to_learn

        ## When Omega is optimized, fix some standard normals throughout the execution that will be used to construct Omega
        if self.learn_Omega == "var_fixed":
            self.get_kl = self.get_kl_Omega_to_learn
            self.sample_from_Omega = self.sample_from_Omega_optim
            self.z_for_Omega_fixed = []
            for i in range(self.n_Omega):
                tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
                self.z_for_Omega_fixed.append(tf.Variable(tmp[0,:,:], trainable = False))

        ## When Omega is fixed, fix some standard normals throughout the execution that will be used to construct Omega
        if self.learn_Omega == "prior_fixed":
            self.get_kl = self.get_kl_Omega_fixed
            self.sample_from_Omega = self.sample_from_Omega_fixed
            self.z_for_Omega_fixed = []
            for i in range(self.n_Omega):
                tmp = utils.get_normal_samples(1, self.d_in[i], self.d_out[i])
                self.z_for_Omega_fixed.append(tf.Variable(tmp[0,:,:], trainable = False))

        ## Parameters defining prior over Omega
        ## NOTE: the variable name prefixes "log_theta_*" and "q_*" are
        ## significant - get_vars_fixing_some() filters trainable variables
        ## by these prefixes.
        self.log_theta_sigma2 = tf.Variable(tf.zeros([n_layers]), name="log_theta_sigma2")

        ## Log-lengthscales initialized to 0.5 * log(d_in) per layer
        if self.is_ard:
            self.llscale0 = []
            for i in range(self.nl):
                self.llscale0.append(tf.constant(0.5 * np.log(self.d_in[i]), 'float32'))
        else:
            self.llscale0 = tf.constant(0.5 * np.log(self.d_in))

        if self.is_ard:
            self.log_theta_lengthscale = []
            for i in range(self.nl):
                self.log_theta_lengthscale.append(tf.Variable(tf.multiply(tf.ones([self.d_in[i]]), self.llscale0[i]), name="log_theta_lengthscale"))
        else:
            self.log_theta_lengthscale = tf.Variable(self.llscale0, name="log_theta_lengthscale")
        self.prior_mean_Omega, self.log_prior_var_Omega = self.get_prior_Omega(self.log_theta_lengthscale)

        ## Set the prior over weights
        self.prior_mean_W, self.log_prior_var_W = self.get_prior_W()

        ## Initialize posterior parameters
        if self.learn_Omega == "var_resampled":
            self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()
        if self.learn_Omega == "var_fixed":
            self.mean_Omega, self.log_var_Omega = self.init_posterior_Omega()
        self.mean_W, self.log_var_W = self.init_posterior_W()

        ## Set the number of Monte Carlo samples as a placeholder so that it can be different for training and test
        self.mc = tf.placeholder(tf.int32)

        ## Batch data placeholders
        Din = d_in
        Dout = d_out
        self.X = tf.placeholder(tf.float32, [None, Din])
        self.Y = tf.placeholder(tf.float32, [None, Dout])

        ## Builds whole computational graph with relevant quantities as part of the class
        self.loss, self.kl, self.ell, self.layer_out = self.get_nelbo()

        ## Initialize the session
        self.session = tf.Session()

    ## Definition of a prior for Omega - which depends on the lengthscale of the covariance function
    def get_prior_Omega(self, log_lengthscale):
        """Return (mean, log-variance) of the prior p(Omega).

        Spectral frequencies of the kernel have variance 1/lengthscale^2,
        hence log-variance = -2 * log-lengthscale.  In the ARD case both
        quantities are per-layer lists; otherwise they are stacked tensors.
        """
        if self.is_ard:
            prior_mean_Omega = []
            log_prior_var_Omega = []
            for i in range(self.nl):
                prior_mean_Omega.append(tf.zeros([self.d_in[i],1]))
            for i in range(self.nl):
                log_prior_var_Omega.append(-2 * log_lengthscale[i])
        else:
            prior_mean_Omega = tf.zeros(self.nl)
            log_prior_var_Omega = -2 * log_lengthscale
        return prior_mean_Omega, log_prior_var_Omega

    ## Definition of a prior over W - these are standard normals
    def get_prior_W(self):
        """Return (mean, log-variance) of the standard-normal prior p(W)."""
        prior_mean_W = tf.zeros(self.n_W)
        log_prior_var_W = tf.zeros(self.n_W)
        return prior_mean_W, log_prior_var_W

    ## Function to initialize the posterior over omega
    def init_posterior_Omega(self):
        """Create q(Omega) variables, initialized at the prior (using llscale0)."""
        mu, sigma2 = self.get_prior_Omega(self.llscale0)
        ## Named "q_Omega" so get_vars_fixing_some() can filter them by prefix
        mean_Omega = [tf.Variable(mu[i] * tf.ones([self.d_in[i], self.d_out[i]]), name="q_Omega") for i in range(self.n_Omega)]
        log_var_Omega = [tf.Variable(sigma2[i] * tf.ones([self.d_in[i], self.d_out[i]]), name="q_Omega") for i in range(self.n_Omega)]
        return mean_Omega, log_var_Omega

    ## Function to initialize the posterior over W
    def init_posterior_W(self):
        """Create q(W) variables, initialized at zero mean and unit variance."""
        mean_W = [tf.Variable(tf.zeros([self.dhat_in[i], self.dhat_out[i]]), name="q_W") for i in range(self.n_W)]
        log_var_W = [tf.Variable(tf.zeros([self.dhat_in[i], self.dhat_out[i]]), name="q_W") for i in range(self.n_W)]
        return mean_W, log_var_W

    ## Function to compute the KL divergence between priors and approximate posteriors over model parameters (Omega and W) when q(Omega) is to be learned
    def get_kl_Omega_to_learn(self):
        """Return KL[q||p] summed over all Omega and W matrices."""
        kl = 0
        for i in range(self.n_Omega):
            kl = kl + utils.DKL_gaussian(self.mean_Omega[i], self.log_var_Omega[i], self.prior_mean_Omega[i], self.log_prior_var_Omega[i])
        for i in range(self.n_W):
            kl = kl + utils.DKL_gaussian(self.mean_W[i], self.log_var_W[i], self.prior_mean_W[i], self.log_prior_var_W[i])
        return kl

    ## Function to compute the KL divergence between priors and approximate posteriors over model parameters (W only) when q(Omega) is not to be learned
    def get_kl_Omega_fixed(self):
        """Return KL[q||p] summed over the W matrices only (Omega fixed at prior)."""
        kl = 0
        for i in range(self.n_W):
            kl = kl + utils.DKL_gaussian(self.mean_W[i], self.log_var_W[i], self.prior_mean_W[i], self.log_prior_var_W[i])
        return kl

    ## Returns samples from approximate posterior over Omega
    def sample_from_Omega_to_learn(self):
        """Draw self.mc reparameterized samples Omega = mu + sigma * z, fresh z."""
        Omega_from_q = []
        for i in range(self.n_Omega):
            z = utils.get_normal_samples(self.mc, self.d_in[i], self.d_out[i])
            Omega_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_Omega[i] / 2)), self.mean_Omega[i]))
        return Omega_from_q

    ## Returns Omega values calculated from fixed random variables and mean and variance of q() - the latter are optimized and enter the calculation of the KL so also lengthscale parameters get optimized
    def sample_from_Omega_optim(self):
        """Build Omega from the fixed normals z_for_Omega_fixed and q's mean/variance."""
        Omega_from_q = []
        for i in range(self.n_Omega):
            ## Broadcast the single fixed z slice across the self.mc samples
            z = tf.multiply(self.z_for_Omega_fixed[i], tf.ones([self.mc, self.d_in[i], self.d_out[i]]))
            Omega_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_Omega[i] / 2)), self.mean_Omega[i]))
        return Omega_from_q

    ## Returns samples from prior over Omega - in this case, randomness is fixed throughout learning (and Monte Carlo samples)
    def sample_from_Omega_fixed(self):
        """Build Omega from the fixed normals scaled by the *prior* variance."""
        Omega_from_q = []
        for i in range(self.n_Omega):
            z = tf.multiply(self.z_for_Omega_fixed[i], tf.ones([self.mc, self.d_in[i], self.d_out[i]]))
            if self.is_ard == True:
                ## Per-input-dimension prior variance tiled across the d_out columns;
                ## no mean shift is applied here (the ARD prior means are zeros)
                reshaped_log_prior_var_Omega = tf.tile(tf.reshape(self.log_prior_var_Omega[i] / 2, [self.d_in[i],1]), [1,self.d_out[i]])
                Omega_from_q.append(tf.multiply(z, tf.exp(reshaped_log_prior_var_Omega)))
            if self.is_ard == False:
                Omega_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_prior_var_Omega[i] / 2)), self.prior_mean_Omega[i]))
        return Omega_from_q

    ## Returns samples from approximate posterior over W
    def sample_from_W(self):
        """Draw self.mc reparameterized samples W = mu + sigma * z, fresh z."""
        W_from_q = []
        for i in range(self.n_W):
            z = utils.get_normal_samples(self.mc, self.dhat_in[i], self.dhat_out[i])
            self.z = z
            W_from_q.append(tf.add(tf.multiply(z, tf.exp(self.log_var_W[i] / 2)), self.mean_W[i]))
        return W_from_q

    ## Returns the expected log-likelihood term in the variational lower bound
    def get_ell(self):
        """Build the Monte Carlo estimate of E_q[log p(Y | F)].

        Propagates self.mc samples of the inputs through all layers and
        rescales the mini-batch log-likelihood to the full dataset size.

        :return: (ell, layer_out) - the scalar expected log-likelihood and
            the [mc, batch, d_out] tensor of final-layer function values
        """
        Din = self.d_in[0]
        MC = self.mc
        N_L = self.nl
        X = self.X
        Y = self.Y
        batch_size = tf.shape(X)[0] # This is the actual batch size when X is passed to the graph of computations

        ## The representation of the information is based on 3-dimensional tensors (one for each layer)
        ## Each slice [i,:,:] of these tensors is one Monte Carlo realization of the value of the hidden units
        ## At layer zero we simply replicate the input matrix X self.mc times
        self.layer = []
        self.layer.append(tf.multiply(tf.ones([self.mc, batch_size, Din]), X))

        ## Forward propagate information from the input to the output through hidden layers
        Omega_from_q = self.sample_from_Omega()
        for i in range(N_L):
            layer_times_Omega = tf.matmul(self.layer[i], Omega_from_q[i]) # X * Omega

            ## Apply the activation function corresponding to the chosen kernel - PHI
            ## Each Phi is scaled by sigma / sqrt(n_rff) so that Phi Phi^T
            ## approximates the kernel matrix
            if self.kernel_type == "RBF":
                Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(values=[tf.cos(layer_times_Omega), tf.sin(layer_times_Omega)], axis=2)
            if self.kernel_type == "arccosine":
                if self.arccosine_degree == 0:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(values=[tf.sign(tf.maximum(layer_times_Omega, 0.0))], axis=2)
                if self.arccosine_degree == 1:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(values=[tf.maximum(layer_times_Omega, 0.0)], axis=2)
                if self.arccosine_degree == 2:
                    Phi = tf.exp(0.5 * self.log_theta_sigma2[i]) / tf.cast(tf.sqrt(1. * self.n_rff[i]), 'float32') * tf.concat(values=[tf.square(tf.maximum(layer_times_Omega, 0.0))], axis=2)

            if self.local_reparam:
                ## Local reparameterization trick: sample the layer outputs F
                ## directly from their implied Gaussian (mean_F, var_F) instead
                ## of sampling W, reducing gradient variance
                z_for_F_sample = utils.get_normal_samples(self.mc, tf.shape(Phi)[1], self.dhat_out[i])
                mean_F = tf.tensordot(Phi, self.mean_W[i], [[2], [0]])
                var_F = tf.tensordot(tf.pow(Phi,2), tf.exp(self.log_var_W[i]), [[2],[0]])
                F = tf.add(tf.multiply(z_for_F_sample, tf.sqrt(var_F)), mean_F)
            else:
                W_from_q = self.sample_from_W()
                F = tf.matmul(Phi, W_from_q[i])

            if self.feed_forward and not (i == (N_L-1)): ## In the feed-forward case, no concatenation in the last layer so that F has the same dimensions of Y
                F = tf.concat(values=[F, self.layer[0]], axis=2)
            self.layer.append(F)

        ## Output layer
        layer_out = self.layer[N_L]

        ## Given the output layer, we compute the conditional likelihood across all samples
        ll = self.likelihood.log_cond_prob(Y, layer_out)

        ## Mini-batch estimation of the expected log-likelihood term
        ell = tf.reduce_sum(tf.reduce_mean(ll, 0)) * self.num_examples / tf.cast(batch_size, "float32")
        return ell, layer_out

    ## Maximize variational lower bound --> minimize Nelbo
    def get_nelbo(self):
        """Assemble the negative ELBO = KL - expected log-likelihood.

        NOTE(review): every call builds fresh graph ops (via get_kl/get_ell),
        so repeated calls grow the TF graph - confirm this is acceptable.

        :return: (nelbo, kl, ell, layer_out) tensors
        """
        kl = self.get_kl()
        ell, layer_out = self.get_ell()
        nelbo = kl - ell
        return nelbo, kl, ell, layer_out

    ## Return predictions on some data
    def predict(self, data, mc_test):
        """Return Monte Carlo predictions and the test negative log-likelihood.

        :param data: DataSet with attributes X and Y
        :param mc_test: number of Monte Carlo samples to average over
        :return: (mean_pred, neg_ll) - predictions averaged over the MC axis
            and the total negative log-likelihood of data.Y
        """
        out = self.likelihood.predict(self.layer_out)
        ## log(1/S * sum_s p(y|f_s)) = -log(S) + logsumexp_s log p(y|f_s)
        nll = - tf.reduce_sum(-np.log(mc_test) + utils.logsumexp(self.likelihood.log_cond_prob(self.Y, self.layer_out), 0))
        #nll = - tf.reduce_sum(tf.reduce_mean(self.likelihood.log_cond_prob(self.Y, self.layer_out), 0))
        pred, neg_ll = self.session.run([out, nll], feed_dict={self.X:data.X, self.Y: data.Y, self.mc:mc_test})
        mean_pred = np.mean(pred, 0)
        return mean_pred, neg_ll

    ## Return the list of TF variables that should be "free" to be optimized
    def get_vars_fixing_some(self, all_variables):
        """Filter trainable variables according to the q_Omega/theta fixed flags.

        Relies on the "q_Omega" and "log_theta" name prefixes given to the
        corresponding tf.Variables at construction time.
        """
        if (self.q_Omega_fixed_flag == True) and (self.theta_fixed_flag == True):
            variational_parameters = [v for v in all_variables if (not v.name.startswith("q_Omega") and not v.name.startswith("log_theta"))]
        if (self.q_Omega_fixed_flag == True) and (self.theta_fixed_flag == False):
            variational_parameters = [v for v in all_variables if (not v.name.startswith("q_Omega"))]
        if (self.q_Omega_fixed_flag == False) and (self.theta_fixed_flag == True):
            variational_parameters = [v for v in all_variables if (not v.name.startswith("log_theta"))]
        if (self.q_Omega_fixed_flag == False) and (self.theta_fixed_flag == False):
            variational_parameters = all_variables
        return variational_parameters

    ## Function that learns the deep GP model with random Fourier feature approximation
    def learn(self, data, learning_rate, mc_train, batch_size, n_iterations, optimizer = None, display_step=100, test = None, mc_test=None, loss_function=None, duration = 1000000, less_prints=False):
        """Train the model by stochastic minimization of the negative ELBO.

        :param data: training DataSet (provides next_batch() and full X, Y)
        :param learning_rate: learning rate for the default Adadelta optimizer
        :param mc_train: number of Monte Carlo samples during training
        :param batch_size: mini-batch size
        :param n_iterations: maximum number of optimization steps
        :param optimizer: TF optimizer; defaults to AdadeltaOptimizer(learning_rate)
        :param display_step: log progress every this many iterations
        :param test: test DataSet used for periodic evaluation (may be None)
        :param mc_test: number of Monte Carlo samples at test time
        :param loss_function: optional metric object with get_name()/eval()
        :param duration: training time budget, in minutes
        :param less_prints: if True, only print the iteration number
        """
        total_train_time = 0
        if optimizer is None:
            optimizer = tf.train.AdadeltaOptimizer(learning_rate)

        ## Set all_variables to contain the complete set of TF variables to optimize
        all_variables = tf.trainable_variables()

        ## Define the optimizer
        ## NOTE(review): this first minimize() is immediately superseded by the
        ## one below; it only adds redundant optimizer ops to the graph.
        train_step = optimizer.minimize(self.loss, var_list=all_variables)

        ## Initialize all variables
        init = tf.global_variables_initializer()
        ##init = tf.initialize_all_variables()

        ## Fix any variables that are supposed to be fixed
        train_step = optimizer.minimize(self.loss, var_list=self.get_vars_fixing_some(all_variables))

        ## Initialize TF session
        self.session.run(init)

        ## Set the folder where the logs are going to be written
        # summary_writer = tf.train.SummaryWriter('logs/', self.session.graph)
        summary_writer = tf.summary.FileWriter('logs/', self.session.graph)

        if not(less_prints):
            ## Evaluate the initial objective on the full training set
            nelbo, kl, ell, _ = self.session.run(self.get_nelbo(), feed_dict={self.X: data.X, self.Y: data.Y, self.mc: mc_train})
            print("Initial kl=" + repr(kl) + " nell=" + repr(-ell) + " nelbo=" + repr(nelbo), end=" ")
            print(" log-sigma2 =", self.session.run(self.log_theta_sigma2))

        ## Present data to DGP n_iterations times
        for iteration in range(n_iterations):

            ## Stop after a given budget of minutes is reached
            if (total_train_time > 1000 * 60 * duration):
                break

            ## Present one batch of data to the DGP
            start_train_time = current_milli_time()
            batch = data.next_batch(batch_size)

            monte_carlo_sample_train = mc_train
            ## NOTE(review): this compares the elapsed time of the *current*
            ## batch (essentially 0 ms) against half the total budget, so it
            ## is almost always true and training runs with 1 MC sample;
            ## presumably total_train_time was intended here - confirm.
            if (current_milli_time() - start_train_time) < (1000 * 60 * duration / 2.0):
                monte_carlo_sample_train = 1
            self.session.run(train_step, feed_dict={self.X: batch[0], self.Y: batch[1], self.mc: monte_carlo_sample_train})
            total_train_time += current_milli_time() - start_train_time

            ## After reaching enough iterations with Omega fixed, unfix it
            if self.q_Omega_fixed_flag == True:
                if iteration >= self.q_Omega_fixed:
                    self.q_Omega_fixed_flag = False
                    train_step = optimizer.minimize(self.loss, var_list=self.get_vars_fixing_some(all_variables))

            if self.theta_fixed_flag == True:
                if iteration >= self.theta_fixed:
                    self.theta_fixed_flag = False
                    train_step = optimizer.minimize(self.loss, var_list=self.get_vars_fixing_some(all_variables))

            ## Display logs every "FLAGS.display_step" iterations
            if iteration % display_step == 0:
                start_predict_time = current_milli_time()

                if less_prints:
                    print("i=" + repr(iteration), end = " ")
                else:
                    nelbo, kl, ell, _ = self.session.run(self.get_nelbo(),
                                                         feed_dict={self.X: data.X, self.Y: data.Y, self.mc: mc_train})
                    print("i=" + repr(iteration) + " kl=" + repr(kl) + " nell=" + repr(-ell) + " nelbo=" + repr(nelbo), end=" ")
                    print(" log-sigma2=", self.session.run(self.log_theta_sigma2), end=" ")
                    # print(" log-lengthscale=", self.session.run(self.log_theta_lengthscale), end=" ")
                    # print(" Omega=", self.session.run(self.mean_Omega[0][0,:]), end=" ")
                    # print(" W=", self.session.run(self.mean_W[0][0,:]), end=" ")

                if loss_function is not None:
                    pred, nll_test = self.predict(test, mc_test)
                    ## Prediction time is excluded from the training budget
                    elapsed_time = total_train_time + (current_milli_time() - start_predict_time)
                    print(loss_function.get_name() + "=" + "%.4f" % loss_function.eval(test.Y, pred), end = " ")
                    print(" nll_test=" + "%.5f" % (nll_test / len(test.Y)), end = " ")
                    ## elapsed_time is only defined in this branch, so the
                    ## time report stays inside it to avoid a NameError
                    print(" time=" + repr(elapsed_time), end = " ")
                print("")