In [1]:
import os
import random
import warnings
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

import tensorflow as tf
from tensorflow import keras
from keras import layers

# Silence DeprecationWarning noise for the rest of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# --- Training-loop sizing ---------------------------------------------------
MAX_EPISODE = 100   # number of episodes the training loop iterates over
T = 64              # number of inner-loop iterations per episode

# --- Algorithm hyperparameters ----------------------------------------------
GAMMA = 0.99        # presumably the reward discount factor -- TODO confirm
TAU = 0.005         # presumably a soft-update rate -- TODO confirm
EPS = 1e-8          # small constant, presumably for numerical stability -- TODO confirm

# --- Replay / batching ------------------------------------------------------
REPLAY_BUFFER_MAX_LEN = 50_000  # presumably the replay-buffer capacity -- TODO confirm
BATCH_SIZE = 64
UPDATE_TARGET_NETWORK = 500     # presumably a target-network sync interval -- TODO confirm

In [3]:
# Create the LunarLander environment and read the problem dimensions from its
# observation and action spaces.
env = gym.make("LunarLander-v2")
n_actions = env.action_space.n                 # size of the discrete action space
states_shape = env.observation_space.shape[0]  # size of the first observation axis

In [6]:
def get_actor_critic_model(input_shape, actor_out_shape, hidden_size):
    """Build a two-headed actor-critic network on a shared MLP trunk.

    The trunk is two fully connected ReLU layers of ``hidden_size`` units.
    Two heads branch off the second hidden layer:

    * ``actor``  -- ``actor_out_shape`` units with softmax activation.
    * ``critic`` -- a single unit with linear activation.

    Args:
        input_shape: Per-sample input shape without the batch axis. Either a
            tuple of ints, or a bare int which is treated as a 1-D shape
            ``(input_shape,)``.
        actor_out_shape: Number of units in the actor head.
        hidden_size: Number of units in each hidden layer.

    Returns:
        A ``keras.Model`` whose outputs are ``[actor, critic]`` in that order.
    """
    # Keras documents `shape` as a tuple; a bare int happens to be accepted by
    # some versions only, so normalise it to a 1-tuple up front.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    inputs = layers.Input(shape=input_shape)

    # Shared trunk.
    x = layers.Dense(hidden_size, activation='relu')(inputs)
    x = layers.Dense(hidden_size, activation='relu')(x)

    # Output heads, both fed by the same hidden representation.
    actor = layers.Dense(actor_out_shape, activation='softmax', name='actor')(x)
    critic = layers.Dense(1, activation='linear', name='critic')(x)

    return keras.Model(inputs=inputs, outputs=[actor, critic])

In [8]:
# Instantiate the shared actor-critic network for this environment and print
# its layer summary.
actor_critic = get_actor_critic_model(
    input_shape=states_shape,
    actor_out_shape=n_actions,
    hidden_size=256,
)
actor_critic.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                                  
 dense_4 (Dense)             (None, 256)                  2304      ['input_4[0][0]']             
                                                                                                  
 dense_5 (Dense)             (None, 256)                  65792     ['dense_4[0][0]']             
                                                                                                  
 actor (Dense)               (None, 4)                    1028      ['dense_5[0][0]']             
                                                                                            

In [None]:
class MemorySteps:
    """Experience store for the agent.

    Holds two kinds of data:

    * ``action_probs``, ``critic_values`` and ``rewards`` -- plain lists
      created empty; this class never touches them after construction, so
      callers are free to append to them directly.
    * ``buffer`` -- a bounded FIFO of
      ``(state, action, reward, next_state, done)`` tuples, filled by
      :meth:`store` and drawn from by :meth:`sample`.

    Args:
        max_len: Maximum number of transitions kept in ``buffer``. When the
            buffer is full, storing a new transition drops the oldest one.
            ``None`` makes the buffer unbounded.
    """

    def __init__(self, max_len=50000):
        self.action_probs = []
        self.critic_values = []
        self.rewards = []
        # store(), sample() and __len__() all rely on this attribute, so it
        # must exist from construction onwards.
        self.buffer = deque(maxlen=max_len)

    def store(self, state, action, reward, next_state, done):
        """Append one transition to the buffer.

        ``reward`` is wrapped in a one-element NumPy array; every other field
        is stored as given.
        """
        experience = (state, action, np.array([reward]), next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five lists of length ``batch_size``:
            ``(states, actions, rewards, next_states, dones)``, where index
            ``i`` of each list belongs to the same transition.

        Raises:
            ValueError: If ``batch_size`` is larger than ``len(self)``
                (raised by ``random.sample``).
        """
        batch_samples = random.sample(self.buffer, batch_size)

        states = [sample[0] for sample in batch_samples]
        actions = [sample[1] for sample in batch_samples]
        rewards = [sample[2] for sample in batch_samples]
        next_states = [sample[3] for sample in batch_samples]
        dones = [sample[4] for sample in batch_samples]

        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently held in the buffer."""
        return len(self.buffer)

In [None]:
# Training cell: roll the current policy out in the environment, track a
# smoothed episode reward, and save the network weights at the end.
optimizer = keras.optimizers.legacy.Adam(learning_rate=0.001)
critic_loss_fn = keras.losses.Huber()

REWARD_SMOOTHING = 0.05  # weight of the newest episode in the running reward

running_reward = 0
memory = MemorySteps()

for episode in range(1, MAX_EPISODE+1):
    state, _ = env.reset()
    episode_reward = 0

    # Collect a trajectory of at most T steps.
    for t in range(1, T+1):
        # Batch the observation under a separate name. Rebinding `state`
        # itself would stack one more leading axis on every iteration and
        # feed the model an input of the wrong rank from the second step on.
        state_batch = tf.expand_dims(tf.convert_to_tensor(state), 0)

        actions_prob, critic_val = actor_critic(state_batch)

        # Sample an action from the actor's softmax output. EPS keeps the
        # log finite if a probability underflows to zero.
        action = int(tf.random.categorical(tf.math.log(actions_prob + EPS), 1)[0, 0])

        # Five-value step API, matching the two-value `env.reset()` above.
        state, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward

        if terminated or truncated:
            break

    # TODO: the gradient update is not implemented yet. `optimizer`,
    # `critic_loss_fn` and `memory` are created above but nothing in this
    # cell uses them, so the weights saved below are untrained.

    # Exponential moving average of the per-episode reward.
    running_reward = REWARD_SMOOTHING * episode_reward + (1 - REWARD_SMOOTHING) * running_reward

    if episode % 100 == 0:
        print(f"Reward at {episode} episode: {running_reward:.2f}")
    if running_reward > 200: #termination condition
        print(f"Solved at {episode} episode: score={running_reward}")
        break
    if episode % 1000 == 0:
        clear_output()

# save_weights does not create missing parent directories.
os.makedirs("checkpoints", exist_ok=True)
actor_critic.save_weights("checkpoints/ppo_lunarlander.h5")