## Initialize

In [1]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras


from numpy.random import default_rng
# Shared NumPy random Generator used for action sampling throughout the notebook.
# It is created without a seed, so sampled actions differ from run to run.
rng = default_rng()
# Install gym's "box2d" extra (the install log below shows swig, pygame and box2d-py);
# presumably required by the LunarLander-v2 environment created later -- confirm.
!pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swig==4.*
  Downloading swig-4.0.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.7 MB)
[K     |████████████████████████████████| 3.7 MB 5.2 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 1.5 MB/s 
[?25hCollecting box2d-py==2.3.5
  Downloading box2d_py-2.3.5-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 33.0 MB/s 
Installing collected packages: swig, pygame, box2d-py
Successfully installed box2d-py-2.3.5 pygame-2.1.0 swig-4.0.2


### Initialize Display

In [2]:
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.animation as animation
# Display animations as interactive JavaScript/HTML players when a cell returns one.
mpl.rc('animation', html='jshtml')

In [3]:
#Render animation

def update_scene(num, frames, patch):
    """Animation callback: show frame number ``num`` on the image artist.

    Args:
        num: Index of the frame to display.
        frames: Indexable collection of frames.
        patch: Object exposing ``set_data`` (the image artist being animated).

    Returns:
        A one-element tuple containing ``patch``.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays ``frames`` as an image sequence.

    Args:
        frames: Non-empty, indexable sequence of images; the first one seeds
            the plot and ``update_scene`` swaps in the others.
        repeat: Forwarded to ``FuncAnimation`` (whether playback loops).
        interval: Forwarded to ``FuncAnimation`` (delay between frames).

    Returns:
        The ``FuncAnimation`` object.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    n_frames = len(frames)
    movie = animation.FuncAnimation(
        figure,
        update_scene,
        frames=n_frames,
        fargs=(frames, image_artist),
        interval=interval,
        repeat=repeat,
    )
    # Close the current figure so that only the returned animation is displayed.
    plt.close()
    return movie

In [4]:
import matplotlib.pyplot as plt

def display_state():
    """Render the current frame of the global ``env`` and show it as an image.

    Reads the module-level ``env``; returns ``None``.
    """
    frame = env.render(mode="rgb_array")

    plt.figure(figsize=(6, 8))
    plt.imshow(frame)
    plt.axis("off")
    plt.show()

In [5]:

def display_run(policy=lambda x: 0):
    """Play one episode in the global ``env`` with ``policy`` and record it.

    Args:
        policy: Callable mapping an observation to an action. The default
            always returns action 0.

    Returns:
        A pair ``(animation, (rewards, observations))``: the animation built
        from the rendered frames, plus the per-step rewards and the
        observations returned by each ``env.step`` call (the observation from
        ``env.reset()`` is not included).
    """
    rewards, observations = [], []
    obs = env.reset()
    # The first frame shows the freshly reset environment.
    frames = [env.render(mode="rgb_array")]
    done = False
    while not done:
        obs, reward, done, _ = env.step(policy(obs))
        frames.append(env.render(mode="rgb_array"))
        rewards.append(reward)
        observations.append(obs)
    return plot_animation(frames), (rewards, observations)

## Set up Environment

In [6]:
# Create the LunarLander-v2 environment and reset it to obtain a first observation.
# `env` is used as a global by the display helpers defined above.
env = gym.make("LunarLander-v2")
obs = env.reset()

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [None]:
# Inspect the environment: available actions, observation bounds and metadata.
print( "Action space:      ", env.action_space)
print( "Observation Space: ", env.observation_space)
print( "Metadata:          ", env.metadata )

Action space:       Discrete(4)
Observation Space:  Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
Metadata:           {'render_modes': ['human', 'rgb_array', 'single_rgb_array'], 'render_fps': 50}


## Some Testing

In [None]:
# Start a new episode and display its initial observation vector.
obs = env.reset()
obs

array([ 0.00770826,  1.4023967 ,  0.78074086, -0.37885413, -0.00892509,
       -0.17684948,  0.        ,  0.        ], dtype=float32)

In [None]:
def random_policy(obs):
    """Return an integer drawn uniformly from 0..3 using the global ``rng``.

    ``obs`` is accepted so the function matches the policy signature used by
    ``display_run``, but it is ignored.
    """
    action = rng.integers(low=0, high=4)
    return action

In [None]:
# Baseline: play one episode with a constant policy that always returns action 1,
# then report the total reward and the episode length before showing the animation.
(anim, data) = display_run(lambda x: 1)
print("Reward:", sum(data[0]), ",  Steps:", len(data[0]))
anim

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  "The argument mode in render method is deprecated; "


Reward: -575.1786046952159 ,  Steps: 69


## Simple NN policy

In [11]:
# Length of the observation vector; used as the input size of the policy network.
n_inputs = env.observation_space.shape[0] # 8 for Lunar Lander

In [21]:
# Policy network: observation -> one hidden layer of 32 ELU units -> softmax over
# 4 outputs, one probability per action (the action space above is Discrete(4)).
model = keras.models.Sequential([
    keras.layers.Dense(32, activation="elu", input_shape=[n_inputs]),
    # Optional second hidden layer, currently disabled:
    #keras.layers.Dense(64, activation="elu"),
    keras.layers.Dense(4, activation="softmax"),
])


Take a step based on the Neural Network policy

In [13]:
def NN_action(obs):
    """Sample an action index from the global policy ``model`` for ``obs``.

    The model output is converted to float64 and re-normalised so that it sums
    to one before a single multinomial draw is made with the global ``rng``.

    Returns:
        The index of the sampled action.
    """
    batch = obs[np.newaxis]
    action_probs = model(batch).numpy().astype('float64')
    action_probs = action_probs / action_probs.sum()
    # One multinomial trial yields a one-hot vector; argmax recovers the index.
    one_hot_draw = rng.multinomial(1, action_probs[0])
    return np.argmax(one_hot_draw)

Test

In [14]:
# Play one episode with the neural-network policy (not yet trained at this point)
# and report the total reward and the episode length.
(anim, data) = display_run(NN_action)
print("Reward:", sum(data[0]), ",  Steps:", len(data[0]))
anim

Reward: -103.59365800436048 ,  Steps: 86


Now implement policy gradient

In [15]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, compute its gradients, and step ``env``.

    Args:
        env: Environment whose ``step`` returns ``(obs, reward, done, info)``.
        obs: Current observation (a 1-D array; a batch axis is added here).
        model: Policy network returning one row of action probabilities.
        loss_fn: Loss taking ``(labels, probabilities)``, e.g. sparse
            categorical cross-entropy.

    Returns:
        ``(obs, reward, done, grads)``: the next observation, the reward, the
        episode-finished flag, and the gradients of the loss with respect to
        ``model.trainable_variables``. The gradients are NOT applied here; the
        caller weights them by the (discounted) rewards first.
    """
    with tf.GradientTape() as tape:
        # The tape records the forward pass so that the loss can afterwards be
        # differentiated with respect to the model's trainable variables.
        probabs = model(obs[np.newaxis])
        # Sampling happens in NumPy: cast to float64 and re-normalise so that
        # the probabilities sum to one before calling rng.multinomial.
        probabs_64 = probabs.numpy().astype('float64')
        probabs_64 = probabs_64 / np.sum(probabs_64)
        action = np.argmax(rng.multinomial(1, probabs_64[0]))
        # Treat the sampled action as the "correct" label. Labels of shape (1,)
        # match the (1, n_actions) predictions directly, so no implicit
        # reshaping inside the loss function is relied upon.
        loss = tf.reduce_mean(loss_fn(np.array([action]), probabs))
    # Gradients that would make the sampled action more likely.
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(int(action))
    return obs, reward, done, grads

**Note:** `rng.multinomial` can fail with a value error when the last probability is very small, because rounding errors introduced when casting from float32 to float64 can make the probabilities sum to slightly more than 1. Re-normalizing the probabilities in float64 before sampling (as done above) is meant to work around this. A cleaner solution would probably be to replace `rng.multinomial` with some TensorFlow functionality, but I couldn't find an elegant one. There is a discussion here: https://stackoverflow.com/questions/23257587/how-can-i-avoid-value-errors-when-using-numpy-random-multinomial

Play multiple episodes

In [16]:
def _play_single_episode(env, n_max_steps, model, loss_fn):
    """Run one episode of at most ``n_max_steps`` steps; return (rewards, grads)."""
    episode_rewards, episode_grads = [], []
    obs = env.reset()
    for _ in range(n_max_steps):
        obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
        episode_rewards.append(reward)
        episode_grads.append(grads)
        if done:
            break
    return episode_rewards, episode_grads


def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play ``n_episodes`` episodes and collect per-step rewards and gradients.

    Args:
        env: Environment passed to ``play_one_step``.
        n_episodes: Number of episodes to play.
        n_max_steps: Upper bound on the number of steps per episode.
        model: Policy network.
        loss_fn: Loss function forwarded to ``play_one_step``.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode,
        each entry being a list with one element per step played.
    """
    episodes = [_play_single_episode(env, n_max_steps, model, loss_fn)
                for _ in range(n_episodes)]
    all_rewards = [rewards for rewards, _ in episodes]
    all_grads = [grads for _, grads in episodes]
    return all_rewards, all_grads

Discounting Function

In [17]:
def discount_rewards(rewards, discount_factor):
    """Return the discounted sum of future rewards for every step.

    Element ``t`` of the result equals
    ``rewards[t] + discount_factor * rewards[t+1] + discount_factor**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards (ints or floats).
        discount_factor: Multiplicative discount applied per step.

    Returns:
        A float NumPy array with the same length as ``rewards``.
    """
    # Force a float array: with integer rewards np.array() would pick an
    # integer dtype and the in-place update below would silently truncate
    # the fractional part of every discounted value.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards so each step can reuse the already-discounted next step.
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount every episode's rewards and standardise them across all episodes.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_factor: Discount passed to ``discount_rewards``.

    Returns:
        A list with one array per episode holding the discounted rewards after
        subtracting the global mean and dividing by the global standard
        deviation (both computed over all steps of all episodes).
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All discounted rewards are identical, so the centred values are all
        # zero; dividing by 1 keeps them at zero instead of producing NaN
        # (0 / 0), which would otherwise propagate into the gradients.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

def just_average(all_rewards):
    """Return the mean total (undiscounted) reward per episode.

    Args:
        all_rewards: List of per-episode reward sequences.
    """
    episode_totals = [sum(episode) for episode in all_rewards]
    return np.mean(np.array(episode_totals))

Testing

In [None]:
# Quick check on two short hand-made episodes with a discount factor of 0.8.
discount_and_normalize_rewards([[10, 0, -50], [10, 20]],
     discount_factor=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

### Training

In [18]:
# Training hyperparameters.
n_iterations = 300           # number of gradient updates in the training loop
n_episodes_per_update = 10   # episodes played to estimate each update
n_max_steps = 10000          # upper bound on steps per episode
discount_factor = 0.99       # per-step discount applied to future rewards

In [19]:
# Adam applies the reward-weighted gradients in the training loop. The loss takes
# the sampled action index as the label for the predicted action probabilities.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.sparse_categorical_crossentropy

Training Loop

In [None]:
# Policy-gradient training loop: play a batch of episodes, weight each step's
# gradients by its normalized discounted reward, average, and apply one update.
for iteration in range(n_iterations):
    # Collect per-step rewards and gradients for n_episodes_per_update episodes.
    all_rewards, all_grads = play_multiple_episodes(
            env, n_episodes_per_update, n_max_steps, model, loss_fn)
    # One weight per step: discounted reward, standardised over the whole batch.
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
            discount_factor)
    all_mean_grads = []
    print( "Iteration Number:", iteration )
    print(" Average reward =", just_average(all_rewards) )
    # For every trainable variable, average the reward-weighted gradients over
    # all steps of all episodes in the batch.
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
                [final_reward * all_grads[episode_index][step][var_index]
                    for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    # Single optimizer step using the averaged gradients.
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

Iteration Number: 0
 Average reward = -190.3590206988153
Iteration Number: 1
 Average reward = -233.01159143863237
Iteration Number: 2
 Average reward = -145.0436595285376
Iteration Number: 3
 Average reward = -153.29911325776308
Iteration Number: 4
 Average reward = -143.01404773250312
Iteration Number: 5
 Average reward = -106.9229984321494
Iteration Number: 6
 Average reward = -120.2148978376382
Iteration Number: 7
 Average reward = -96.87697299774351
Iteration Number: 8
 Average reward = -120.68202281706917
Iteration Number: 9
 Average reward = -108.14304604819604
Iteration Number: 10
 Average reward = -129.24666259123262
Iteration Number: 11
 Average reward = -129.73851964321048
Iteration Number: 12
 Average reward = -121.32118544346062
Iteration Number: 13
 Average reward = -105.44099324291433
Iteration Number: 14
 Average reward = -104.58816526979089
Iteration Number: 15
 Average reward = -102.43586252960017
Iteration Number: 16
 Average reward = -104.67498917463013
Iteration Nu

In [None]:
# Evaluation run: play one episode with the current policy network and report
# the total reward and the number of steps.
(anim, data) = display_run(NN_action)
print("Reward:", sum(data[0]), ",  Steps:", len(data[0]))
anim

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Another evaluation episode; actions are sampled, so results vary between runs.
(anim, data) = display_run(NN_action)
print("Reward:", sum(data[0]), ",  Steps:", len(data[0]))
anim

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  "The argument mode in render method is deprecated; "


Reward: 50.13783161539607 ,  Steps: 178


In [None]:
# One more sampled episode with the neural-network policy, for comparison.
(anim, data) = display_run(NN_action)
print("Reward:", sum(data[0]), ",  Steps:", len(data[0]))
anim

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Debugging aid: list every name currently defined in the notebook namespace.
dir()

['In',
 'NN_action',
 'Out',
 '_',
 '_10',
 '_14',
 '_18',
 '_22',
 '_23',
 '_24',
 '_8',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_exit_code',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i19',
 '_i2',
 '_i20',
 '_i21',
 '_i22',
 '_i23',
 '_i24',
 '_i25',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'all_final_rewards',
 'all_grads',
 'all_mean_grads',
 'all_rewards',
 'anim',
 'animation',
 'data',
 'default_rng',
 'discount_and_normalize_rewards',
 'discount_factor',
 'discount_rewards',
 'display_run',
 'display_state',
 'env',
 'exit',
 'get_ipython',
 'gym',
 'iteration',
 'just_average',
 'keras',
 'loss_fn',
 'mean_grads',
 'model',
 'mpl',
 'n_episodes_per_update',
 'n_inputs',
 'n_iterations',
 'n_max_steps',
 'np',
 'obs',
 'optimizer',
 'play_multiple_episodes',
 'play_one_step',
 'plot_anima