In [88]:
import random

import gym
import numpy as np
import tensorflow as tf
from tensorflow_core import initializers

env = gym.make('Copy-v0')
actions = [(0, 0, 0),
           (0, 0, 1),
           (0, 0, 2),
           (0, 0, 3),
           (0, 0, 4),
           (0, 1, 0),
           (0, 1, 1),
           (0, 1, 2),
           (0, 1, 3),
           (0, 1, 4),
           (1, 0, 0),
           (1, 0, 1),
           (1, 0, 2),
           (1, 0, 3),
           (1, 0, 4),
           (1, 1, 0),
           (1, 1, 1),
           (1, 1, 2),
           (1, 1, 3),
           (1, 1, 4)]

In [86]:
# constants
EPSILON = 0.2
ALPHA = 0.2
GAMMA = 0.9

In [81]:
print('State:', env.observation_space)
print('Action:', env.action_space)
print('Num actions:', len(actions))

State: Discrete(6)
Action: Tuple(Discrete(2), Discrete(2), Discrete(5))
Num actions: 20


In [82]:
# initial state
# for each action:
#   compute features from (state, action)
#   forward pass features
#   output is single q value
# taken: max(q_vals) or random action

def make_input(state, action):
    return np.matrix([state, action[0], action[1], action[2]])

In [114]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(4,)),
    tf.keras.layers.Dense(25, activation='relu', kernel_initializer=initializers.he_normal()),
    tf.keras.layers.Dense(25, activation='relu', kernel_initializer=initializers.he_normal()),
    # tf.keras.layers.Dense(25, activation='relu', kernel_initializer=initializers.he_normal()),
    tf.keras.layers.Dense(1, activation='linear', kernel_initializer=initializers.he_normal())
])

In [115]:
# inp = make_input(1, (1, 0, 2))
# model(inp).numpy()

In [117]:
observation = env.reset()
for i in range(5):
    with tf.GradientTape() as tape:
        rand = random.random()
        if rand < EPSILON:
            action = env.action_space.sample()
        else:
            q_vals = []
            # get max q value of actions from the current state
            for poss_action in actions:
                q_vals.append(model(make_input(observation, poss_action)).numpy()[0][0])

            best_idx = np.argmax(q_vals)
            action = actions[best_idx]
            print('Q vals:', q_vals)
            print(f'Action taken: {action} with index {best_idx}')

        next_state, reward, done, info = env.step(action)

        # obtain next best q value
        if done:
            max_next_q = 0
            print(f'Episode finished after {i} timesteps')
        else:
            next_q_vals = []
            for next_action in actions:
                next_q_vals.append(tf.stop_gradient(model(make_input(next_state, next_action)).numpy()[0][0]))
            max_next_q = max(next_q_vals)
            print(f'Max next q: {max_next_q}')

        # perform updates
        curr_q = tf.stop_gradient(model(make_input(observation, action)).numpy()[0][0])
        td_err = (reward + (GAMMA * max_next_q)) - curr_q
        print(f'TD-Error: {td_err}')

        # apply gradients
        opt = tf.keras.optimizers.Adam(learning_rate=ALPHA)
        grads = tape.gradient(td_err, model.trainable_variables)
        print(grads)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        observation = next_state

Q vals: [1.1734979, 1.4622911, 2.0552993, 2.5353537, 2.7863402, 1.3610632, 1.6465725, 2.20942, 2.8411965, 3.3930848, 2.3801765, 2.117188, 2.3431318, 2.6185634, 3.0114305, 1.7441291, 1.94804, 2.405537, 2.7704277, 3.139946]
Action taken: (0, 1, 4) with index 9
Episode finished after 0 timesteps
TD-Error: -3.893084764480591
[None, None, None, None, None, None]


ValueError: No gradients provided for any variable: ['dense_84/kernel:0', 'dense_84/bias:0', 'dense_85/kernel:0', 'dense_85/bias:0', 'dense_86/kernel:0', 'dense_86/bias:0'].