# Lunar Lander policy gradient

In [26]:
import os

import gym
import numpy as np
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical

import matplotlib.pyplot as plt

# --- Run settings -----------------------------------------------------------
max_epochs = 20    # number of training episodes run by test_run
mem_size = 400     # replay-memory capacity; also the number of random warm-up steps
batch_size = 100   # number of experiences drawn per Memory.sample() call
blank = 20         # number of "|" ticks in the textual progress bars

# --- Network / optimiser hyper-parameters -----------------------------------
LAYER_1, LAYER_2 = 128, 64   # widths of the two hidden Dense layers
ALPHA = 0.002                # Adam learning rate
GAMMA = 0.99                 # NOTE(review): not referenced in the visible cells; presumably a discount factor

# Epoch suffix used in the weight-file name. LOAD is not referenced in the
# visible cells.
SAVE = LOAD = max_epochs

env = gym.make("LunarLanderContinuous-v2")

In [27]:
from collections import deque

class Memory:
    """Bounded experience buffer with uniform sampling without replacement.

    Experiences are kept in a ``deque(maxlen=size)``, so once ``size`` items
    are stored each new one evicts the oldest.
    """

    def __init__(self, size, samplesize):
        """Create an empty buffer.

        Args:
            size: maximum number of experiences retained.
            samplesize: number of experiences requested by each ``sample()``.
        """
        self.samplesize = samplesize
        self.memory = deque(maxlen=size)
        self.size = size

    def remember(self, experience):
        """Append one experience, evicting the oldest if the buffer is full."""
        self.memory.append(experience)

    def sample(self):
        """Return a list of distinct stored experiences chosen uniformly.

        Indices are drawn from the number of experiences actually stored, not
        from the buffer capacity, so sampling a partially filled buffer can
        no longer index past the end of the deque. If fewer than
        ``samplesize`` experiences are available, all of them are returned
        (in random order); an empty buffer yields an empty list.
        """
        stored = len(self.memory)
        if stored == 0:
            return []
        count = min(self.samplesize, stored)
        indices = np.random.choice(stored, size=count, replace=False)
        return [self.memory[index] for index in indices]

In [34]:
class PGA():
    """Agent wrapping a small fully connected Keras network.

    The network maps a state vector to ``action_dim`` outputs through two
    ReLU hidden layers (``LAYER_1`` and ``LAYER_2`` units) and a ``tanh``
    output layer. It is compiled with Adam (learning rate ``ALPHA``) and a
    mean-squared-error loss.
    """

    def __init__(self, state_dim, action_dim):
        """Build and compile the network.

        Args:
            state_dim: length of the state vector fed to the first layer.
            action_dim: number of output units.
        """
        self.action_dim = action_dim
        self.model = Sequential()
        self.model.add(Dense(LAYER_1, activation='relu', input_shape=(state_dim,)))
        self.model.add(Dense(LAYER_2, activation='relu'))
        self.model.add(Dense(action_dim, activation='tanh'))
        adam = Adam(lr=ALPHA)
        self.model.compile(optimizer=adam, loss='mse', metrics=['accuracy'])

    def predict(self, state):
        """Run the network on a single state.

        The state is wrapped in a batch of one before being passed to the
        model, so callers index the result with ``[0]`` to get the action.
        """
        return self.model.predict(np.array([state]))

    def train(self, experiences):
        """Unpack a batch of experiences into per-field arrays.

        NOTE: this method is currently a stub. It converts the batch into
        arrays but never calls into ``self.model``, so no learning step is
        performed and the network weights are left untouched.

        Args:
            experiences: non-empty iterable of 5-tuples
                ``(state, action, reward, next_state, flag)``. The callers in
                this notebook store ``1 - done`` as the last element, i.e. it
                is 0 on the terminal transition and 1 otherwise.
        """
        state, action, reward, next_state, not_done = zip(*experiences)
        state = np.array(state)
        action = np.array(action)
        reward = np.array(reward)
        next_state = np.array(next_state)
        not_done = np.array(not_done)
        # TODO: use the arrays above to update self.model.

    def save_model(self, epoch):
        """Save the network weights to ``models/gradient_weights<epoch>.h5``.

        The target directory is created first if it does not exist, so that a
        completed training run is not lost to a missing ``models/`` folder.
        """
        path = 'models/gradient_weights' + str(epoch) + '.h5'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.model.save_weights(path)

    def load_model(self, epoch):
        """Load network weights from ``models/gradient_weights<epoch>.h5``."""
        self.model.load_weights('models/gradient_weights' + str(epoch) + '.h5')

In [46]:
def playtest(env, agent):
    """Play one rendered episode with the agent's own actions.

    Each step asks the agent for a prediction, renders the environment, and
    applies the first row of the prediction as the action. When the
    environment reports the episode is over, the environment is closed and
    the summed reward is returned.
    """
    total = 0
    observation = env.reset()
    while True:
        prediction = agent.predict(observation)
        env.render()
        observation, step_reward, finished, _ = env.step(prediction[0])
        total += step_reward
        if finished:
            env.close()
            return total

In [49]:
def test_run(env, agent):
    """Warm up a replay memory with random steps, then run the training loop.

    Phase 1 takes ``mem_size`` random actions and stores every transition.
    Phase 2 runs ``max_epochs`` episodes with epsilon-greedy action selection;
    after every step the transition is stored and ``agent.train`` is called on
    a sampled batch. At the end of an episode epsilon is decayed (while it is
    above 0.1) and, on progress-bar ticks, a rendered ``playtest`` episode is
    played and the running mean of the playtest rewards is recorded.

    Returns:
        list of running-average playtest rewards, one entry per playtest.
    """
    replay = Memory(mem_size, batch_size)
    explore_prob = 1
    explore_decay = 0.9
    playtest_total = 0

    # ---- Phase 1: fill the memory with random transitions ----
    print("Prepopulating memory:")
    print("____________________")
    obs = env.reset()
    for tick in range(mem_size):
        if tick % (mem_size / blank) == 0:
            print("|", end='')
        random_action = env.action_space.sample()
        new_obs, gain, finished, _ = env.step(random_action)
        # The last field is stored as 1 - done (0 on the terminal transition).
        replay.remember((obs, random_action, gain, new_obs, 1 - finished))
        obs = env.reset() if finished else new_obs

    # ---- Phase 2: epsilon-greedy episodes ----
    running_average = []
    env.reset()
    print("\nTraining:")
    print("____________________")
    for epoch in range(1, max_epochs + 1):
        finished = False
        obs = env.reset()
        while not finished:
            if np.random.uniform() < explore_prob:
                move = env.action_space.sample()
            else:
                move = agent.predict(obs)[0]
            new_obs, gain, finished, _ = env.step(move)
            replay.remember((obs, move, gain, new_obs, 1 - finished))
            agent.train(replay.sample())

            if not finished:
                obs = new_obs
                continue

            # Episode is over: decay exploration and maybe evaluate.
            if explore_prob > 0.1:
                explore_prob *= explore_decay
            if epoch % (max_epochs / blank) == 0:
                print("|", end='')
                playtest_total += playtest(env, agent)
                running_average.append(playtest_total / (epoch / (max_epochs / blank)))
    return running_average

In [50]:
# Build the agent from the environment's observation/action sizes, run the
# warm-up + training loop, and save the resulting weights.
agent = PGA(env.observation_space.shape[0], env.action_space.shape[0])
rewards = test_run(env, agent)
agent.save_model(SAVE);

# Plot the running-average playtest rewards returned by test_run. The plot
# previously referenced `rewards_single`, a name not defined anywhere in the
# visible notebook, which raises NameError on a fresh kernel.
plt.plot(rewards, label = "policy gradient")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.xlabel('Episode')
plt.ylabel('Average Reward')

Prepopulating memory:
____________________
||||||||||||||||||||
Training:
____________________
|||||||||||||||||||

KeyboardInterrupt: 