In [2]:
import sys
import itertools

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import sonnet as snt
import tqdm.notebook as tqdm
import gym

from util.expectiles import expectile

print(f"Python version:      {sys.version.split()[0]}")
print(f"NumPy version:       {np.__version__}")
print(f"TensorFlow version:  {tf.__version__}")
print(f" Eager execution:     {tf.executing_eagerly()}")
print(f" GPU availability:    {bool(tf.config.experimental.list_physical_devices('GPU'))}")
print(f"Sonnet version:      {snt.__version__}")

Python version:      3.8.2
NumPy version:       1.18.1
TensorFlow version:  2.2.0-rc4
 Eager execution:     True
 GPU availability:    False
Sonnet version:      2.0.0


In [3]:
env = gym.make('Pong-v0')

In [4]:
state = env.reset()
env.render()
done = False
for t in itertools.count(1):
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    env.render()
    if done: break
print(f"episode finished after {t} timesteps")
env.close()

episode finished after 1120 timesteps


In [61]:
class DistributionalActorCritic(snt.Module):
    def __init__(self, taus, n_actions):
        super(DistributionalActorCritic, self).__init__()
        self.norm = tf.constant(255, dtype=tf.float32)
        self.cortex = snt.Sequential([
            snt.Conv2D(16, 3, 1), tf.nn.relu, # conv layer 1
            snt.Conv2D(16, 3, 1), tf.nn.relu, # conv layer 2
            snt.Flatten(),
            snt.Linear(32), tf.nn.relu,        # fully-connected layer 1
            snt.Linear(16), tf.nn.relu,        # output (to actor/critic)
        ])
        self.critic = snt.Linear(taus.size)
        self.actor  = snt.Linear(n_actions)
        self.tau_factors = tf.convert_to_tensor(np.sqrt(taus/(1-taus)), dtype=tf.float32)
    def __call__(self, states):
        represn = self.cortex(states/self.norm)
        values = self.critic(represn)
        action_logits = self.actor(represn)
        print(len(action_logits[0]))
        print(states.shape[0])
        action = tf.random.categorical(action_logits, states.shape[0])[:, 0]
#         print(action.numpy())
        return (values, action)
    def loss(self, value_predictions, targets):
        rpes = tf.subtract(targets[:, tf.newaxis], value_predictions)
        loss = tf.reduce_mean(tf.pow(self.tau_factors, tf.sign(rpes)) * tf.square(rpes))
        return loss

In [54]:
model = DistributionalActorCritic(np.array([0.5]), env.action_space.n)
print(model)
model(state[tf.newaxis])

DistributionalActorCritic(taus=array([0.5]), n_actions=6)


(<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.13548037]], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>)

In [70]:
print("Begin training!")
done = False
x = env.reset()
env.render()
optimizer = snt.optimizers.SGD(learning_rate=0.00001)
for t in itertools.count(1):
    print('step', t, 'last action', a, end="\r")
    with tf.GradientTape() as tx:
        vx, a = model(x[tf.newaxis])
        # WHERE TF ARE THESE NANS COMING FROM?
    y, r, done, _ = env.step(a[0])
    env.render()
    r = tf.convert_to_tensor(r, dtype=tf.float32)
    vy, _ = model(y[tf.newaxis])
    with tx:
        loss = model.loss(vx, r+vy)
    # do gradient update!
    variables = model.trainable_variables
    gradients = tx.gradient(loss, variables)
    optimizer.apply(gradients, variables)
    if done: break
print(f"episode finished after {t} timesteps")
env.close()

Begin training!
step 1 last action tf.Tensor([6], shape=(1,), dtype=int64)

IndexError: index 6 is out of bounds for axis 0 with size 6

In [89]:
# def train(model, labels_rewards, loss_fn, data=TRAIN_DATA,
#           num_items=NUM_REDUCED_IMAGES, num_epochs=NUM_EPOCHS,
#           learning_rate=LEARNING_RATE, **loss_kwargs):
#     progress = tqdm.tqdm(total=num_items*num_epochs, unit="images")
#     loss_log = []
#     loss_log_2 = []
#     optimizer = snt.optimizers.SGD(learning_rate=learning_rate)
#     for minibatch in data.repeat(num_epochs):
#         images, labels = minibatch
#         # generate rewards for batch
#         rewards = np.zeros(labels.shape[0])
#         for label, rs in labels_rewards.items():
#             ids = np.where(labels == label)
#             num = ids[0].size
#             rewards[ids] = np.random.choice(rs, num)
#         rewards = tf.convert_to_tensor(rewards, tf.float32)
#         # predict rewards, compute loss
#         with tf.GradientTape() as tape:
#             value_predictions = model(images)
#             loss = loss_fn(value_predictions, rewards, **loss_kwargs)
#         # apply update
#         variables = model.trainable_variables
#         gradients = tape.gradient(loss, variables)
#         optimizer.apply(gradients, variables)
#         # track progress
#         print(f"loss {loss.numpy():15.3f}", end="\r")
#         loss_log.append(loss.numpy())
#         progress.update(n=labels.shape[0])
#     print(f"loss {loss.numpy():15.3f} (done)")
#     progress.close()
#     return model, loss_log