# Lunar Lander (continuous actions): DDPG-style actor-critic policy gradient with Keras

In [1]:
import gym
import numpy as np
import keras
import gc
import random
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical
import keras.backend as K

from keras.layers import Concatenate, Input, Dense
from keras.layers import merge
from keras.layers import Reshape
from keras.models import Model

import matplotlib.pyplot as plt

# Episode and replay-buffer sizing.
MAX_EPISODES = 400       # episodes iterated by main()
MAX_STEPS = 10000        # per-episode step cap used by env_run()
MAX_BUFFER = 40000       # replay capacity and number of warm-up transitions
MAX_TOTAL_REWARD = 500   # not referenced by the code visible in this file

# Number of transitions drawn from memory on every optimize() call.
BATCH_SIZE = 100

# Network and optimisation hyper-parameters.
TAU = 0.1        # blend factor for the soft target-network updates
LAYER_1 = 128    # width of the actor's first hidden layer
LAYER_2 = 64     # width of the remaining hidden layers (actor and critic)
ALPHA = 1e-7     # Adam learning rate used by both actor and critic
GAMMA = 0.8      # discount factor in the critic's bootstrap target

Using TensorFlow backend.


In [2]:
from collections import deque
class Memory:
    """Fixed-capacity replay buffer of experience tuples.

    Experiences live in a deque created with ``maxlen=size``, so once the
    buffer is full every new append discards the oldest entry.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most `size` experiences."""
        self.buffer = deque(maxlen=size)

    def sample(self, size):
        """Draw `size` distinct experiences and return them column-wise.

        Each stored experience is unpacked as a 5-tuple
        (state, action, reward, next_state, done).

        Returns:
            Five tuples (states, actions, rewards, next_states, done), each of
            length `size`.

        Raises:
            ValueError: if `size` is larger than the number of stored
                experiences (np.random.choice with replace=False).
        """
        # One draw without replacement is enough; the previous extra
        # random.sample() result was never used.
        picked = np.random.choice(np.arange(len(self.buffer)), size=size, replace=False)
        chosen = [self.buffer[index] for index in picked]
        states, actions, rewards, next_states, done = zip(*chosen)
        return states, actions, rewards, next_states, done

    def remember(self, experience):
        """Append one experience, evicting the oldest when the buffer is full."""
        self.buffer.append(experience)

In [3]:
class Actor():
    """Policy network: maps a state vector to a tanh-bounded action vector."""

    def __init__(self, state_dim, action_dim):
        """Build and compile the network.

        Two relu hidden layers (LAYER_1 then LAYER_2 units) feed a tanh output
        layer with `action_dim` units. The model is compiled with Adam at
        learning rate ALPHA and the custom loss `newLoss`.
        """
        self.model = Sequential()
        self.model.add(Dense(LAYER_1, activation = 'relu', input_shape = (state_dim, ), kernel_initializer='he_uniform'))
        self.model.add(Dense(LAYER_2, activation = 'relu', kernel_initializer='he_uniform'))
        self.model.add(Dense(action_dim, activation = 'tanh', kernel_initializer='he_uniform'))
        self.adam = Adam(lr=ALPHA)
        self.model.compile(optimizer=self.adam, loss=self.newLoss)

    def forward(self, state):
        """Return the model's prediction for `state` (converted to an ndarray)."""
        return self.model.predict(np.array(state))

    def update(self, source, degree):
        """Blend `source`'s weights into this network.

        Every weight tensor becomes ``degree * source + (1 - degree) * own``;
        ``degree=1`` is therefore a plain copy.

        Args:
            source: object whose ``model.get_weights()`` matches this model's
                weight list tensor for tensor.
            degree: blend factor.
        """
        # Blend tensor by tensor. Wrapping the whole weight list in np.array()
        # builds a ragged array, which current NumPy rejects with ValueError.
        blended = [
            source_w * degree + own_w * (1 - degree)
            for source_w, own_w in zip(source.model.get_weights(), self.model.get_weights())
        ]
        self.model.set_weights(blended)

    def train(self, state, errors, batch_size):
        """Run one Keras fit() pass with `state` as input and `errors` as target."""
        self.model.fit( state, errors, batch_size=batch_size, verbose = 0)

    def newLoss(self, xPred,yPred):
        """Loss handed to Keras: the negated sum of the second argument.

        NOTE(review): Keras calls a loss as (y_true, y_pred), so `xPred`
        receives the targets given to fit() and is ignored here. The gradient
        therefore only pushes the network's own outputs upward and does not
        depend on the values passed as `errors` to train(). Confirm whether
        this is intended.
        """
        return K.sum(-1*yPred)

In [4]:
class Critic():
    """Q-network: maps a (state, action) pair to a single scalar value."""

    def __init__(self, state_dim, action_dim):
        """Build and compile the two-input network.

        The state and the action each pass through their own relu Dense layer
        of LAYER_2 units. The two results are concatenated along axis 1, go
        through one more relu Dense layer of LAYER_2 units and end in a
        single-unit Dense layer with no activation specified. The model is
        compiled with Adam at learning rate ALPHA and mean-squared-error loss.
        """
        self.first_input = Input((state_dim, ))
        self.first_dense = Dense(LAYER_2, activation = 'relu', kernel_initializer='he_uniform')(self.first_input)

        self.second_input = Input((action_dim, ))
        self.second_dense = Dense(LAYER_2, activation = 'relu', kernel_initializer='he_uniform')(self.second_input)

        self.merged = Concatenate(axis = 1)([self.first_dense, self.second_dense])
        self.output_layer = Dense(LAYER_2, activation = 'relu', kernel_initializer='he_uniform')(self.merged)
        self.output_layer_2 = Dense(1, kernel_initializer='he_uniform')(self.output_layer)

        self.model = Model(inputs=[self.first_input, self.second_input], outputs=self.output_layer_2)
        self.adam = Adam(lr=ALPHA)
        self.model.compile(optimizer=self.adam, loss='mse')

    def forward(self, state, action):
        """Return the model's prediction for the given state and action batches."""
        return self.model.predict([state, action])

    def update(self, source, degree):
        """Blend `source`'s weights into this network.

        Every weight tensor becomes ``degree * source + (1 - degree) * own``;
        ``degree=1`` is therefore a plain copy.

        Args:
            source: object whose ``model.get_weights()`` matches this model's
                weight list tensor for tensor.
            degree: blend factor.
        """
        # Blend tensor by tensor. Wrapping the whole weight list in np.array()
        # builds a ragged array, which current NumPy rejects with ValueError.
        blended = [
            source_w * degree + own_w * (1 - degree)
            for source_w, own_w in zip(source.model.get_weights(), self.model.get_weights())
        ]
        self.model.set_weights(blended)

In [5]:
class ActorCritic:
    """Agent combining online and target actor/critic networks.

    Holds an Actor/Critic pair that is trained, a second pair used as
    targets, an OrnsteinUhlenbeckActionNoise source for exploration and the
    replay memory that optimize() samples from.
    """

    def __init__(self, state_dim, action_dim, memory, load):
        """Create all four networks and synchronise the targets.

        Args:
            state_dim: length of the state vector given to the networks.
            action_dim: length of the action vector.
            memory: object providing ``sample(size)``; used by optimize().
            load: 0 to keep the freshly initialised weights; any other value
                is passed to load_models() as the episode tag.
        """
        self.memory = memory
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        self.target_actor = Actor(state_dim, action_dim)
        self.target_critic = Critic(state_dim, action_dim)

        if load != 0:
            self.load_models(load) #load the model

        # Start the targets as exact copies of the online networks.
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)

    def get_action(self, state, train):
        """Return the actor's output for a single state.

        The state is wrapped in a one-element list before prediction, so the
        result keeps a leading batch axis. When `train` is truthy, a sample of
        the exploration noise (cast to float32) is added.
        """
        action = self.actor.forward([state])
        if train:
            action = action + np.float32(self.noise.sample())
        return action

    def optimize(self):
        """Run one training step on a BATCH_SIZE sample from memory.

        The critic is fitted towards
        ``reward + GAMMA * target_critic(next_state, target_actor(next_state))``
        with the bootstrap term zeroed for terminal transitions. The actor is
        then trained and both target networks are soft-updated with TAU.

        NOTE(review): the actor step passes the critic's values as fit()
        targets, but Actor.newLoss as written in this file ignores its first
        argument, so those values do not influence the actor's gradient.
        """
        state,action,reward,next_state, done = self.memory.sample(BATCH_SIZE)

        state = np.array(state)
        action = np.array(action)
        reward = np.array(reward)
        next_state = np.array(next_state)
        # 1 for transitions that continue, 0 for terminal ones.
        done = 1-np.array(done)

        next_action = self.target_actor.forward(next_state)
        target = reward[:,np.newaxis] + GAMMA* self.target_critic.forward(next_state, next_action)*done[:, np.newaxis]
        self.critic.model.fit([state,action], target, batch_size=BATCH_SIZE, verbose = 0)

        action = self.actor.forward(state)
        self.actor.train(state, self.critic.forward(state, action), BATCH_SIZE)

        self.net_update(self.target_actor, self.actor, False)
        self.net_update(self.target_critic, self.critic, False)

    def net_update(self,target, source, hard):
        """Copy (`hard` truthy) or TAU-blend (`hard` falsy) source into target."""
        degree = 1
        if not hard: degree = TAU
        target.update(source, degree)

    def save_models(self, episode):
        """Save actor and critic weights under 'Models/', tagged by `episode`."""
        import os

        # Make sure the output directory exists so the first save cannot fail
        # merely because 'Models/' has not been created yet.
        os.makedirs('Models', exist_ok=True)
        self.actor.model.save_weights('Models/' + str(episode) + 'actor_weights.h5')
        self.critic.model.save_weights('Models/' + str(episode) + 'critic_weights.h5')

    def load_models(self, episode):
        """Load the weights saved for `episode` and hard-sync the targets."""
        self.actor.model.load_weights('Models/' + str(episode) + 'actor_weights.h5')
        self.critic.model.load_weights('Models/' + str(episode) + 'critic_weights.h5')
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)
        print('Models loaded succesfully')

class OrnsteinUhlenbeckActionNoise:
    """Stateful, mean-reverting noise source with one value per action component."""

    def __init__(self, action_dim):
        """Start the noise state at zero for each of the `action_dim` components."""
        self.action_dim = action_dim
        self.theta = 0.15   # strength of the pull back towards zero
        self.sigma = 0.02   # scale of the Gaussian perturbation
        self.dx = np.zeros(self.action_dim)

    def sample(self):
        """Advance the noise state by one step and return the new state."""
        pull_to_zero = self.theta * (-self.dx)
        gaussian_kick = self.sigma * np.random.randn(len(self.dx))
        self.dx = self.dx + pull_to_zero + gaussian_kick
        return self.dx

In [6]:
def env_run(env, episode, trainer, memory, train):
    """Run a single episode and return the sum of its rewards.

    Args:
        env: environment providing reset(), step(action), render(), close().
        episode: episode number; printed, and used to decide when to save.
        trainer: agent providing get_action(), optimize(), save_models().
        memory: replay buffer providing remember(); only used when `train`
            is truthy.
        train: truthy to store transitions and optimise after every step;
            falsy to render the episode and report its reward.

    Returns:
        The accumulated reward of the episode.
    """
    state = env.reset()
    epoch_reward = 0
    print(episode)

    for _ in range(MAX_STEPS):
        if not train:
            env.render()
        # get_action() returns a batch of one; take the single action.
        action = trainer.get_action(state, train)[0]
        next_state, reward, done, _info = env.step(action)
        epoch_reward += reward
        if train:
            # Store the transition before looking at `done`, so terminal
            # transitions (and their rewards) also reach the replay buffer.
            memory.remember((state, action, reward, next_state, done))
            trainer.optimize()
        if done:
            if not train:
                env.close()
                print("\n Testing agent got a reward of :",epoch_reward)
            break
        state = next_state
    gc.collect()
    # Checkpoint on episodes 1, 101, 201, ...
    if episode%100 == 1:
        trainer.save_models(episode)
    return epoch_reward

def prepopulate_memory(memory, env):
    """Store MAX_BUFFER transitions in `memory` before training starts.

    Actions come from ``env.action_space.sample()``. Each stored experience
    is (state, action, reward, next_state, done), and the environment is
    reset whenever a step reports `done`.
    """
    observation = env.reset()
    for _ in range(MAX_BUFFER):
        sampled_action = env.action_space.sample()
        following, reward, finished, _info = env.step(sampled_action)
        memory.remember((observation, sampled_action, reward, following, finished))
        # A finished episode restarts from a fresh reset; otherwise carry on.
        observation = env.reset() if finished else following

def main():
    """Train and periodically evaluate the agent, then plot the results.

    With `training` set to 1 the replay memory is created and pre-filled and
    every episode starts with a training run. Every `test_interval`-th episode
    additionally runs a rendered evaluation episode, and the running mean of
    all evaluation rewards so far is appended to the plotted series.
    """
    # Run configuration.
    training = 1
    test_interval = 5
    load = 0

    env = gym.make('LunarLanderContinuous-v2')

    memory = None
    if training == 1:
        memory = Memory(MAX_BUFFER)
        prepopulate_memory(memory, env)

    mean_test_rewards = []
    cumulative_test_reward = 0

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    trainer = ActorCritic(state_dim, action_dim, memory, load)

    for episode in np.arange(MAX_EPISODES):
        if training == 1:
            env_run(env, episode, trainer, memory, True)
        if episode % test_interval != 0:
            continue
        cumulative_test_reward += env_run(env, episode, trainer, None, False)
        # Evaluations happen at episodes 0, interval, 2*interval, ...
        evaluations_so_far = (episode / test_interval) + 1
        mean_test_rewards.append(cumulative_test_reward / evaluations_so_far)

    plt.plot(mean_test_rewards)
    plt.show()


# Run the experiment only when executed as a script.
if __name__ == '__main__':
    main()

0
0

 Testing agent got a reward of : -495.40696960932246
1
2
3
4
5
5

 Testing agent got a reward of : -744.121453729284
6
7
8
9
10
10

 Testing agent got a reward of : -781.8956276934927
11
12
13
14
15
15

 Testing agent got a reward of : -920.1599047950518
16
17
18
19


KeyboardInterrupt: 