In [1]:
import random
import math
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import *

# Environment the agent is trained on.
env = gym.make('LunarLander-v2')

# Exploration schedule: epsilon starts at max_eps and eps_reduce() moves it
# toward min_eps using LAMBDA as the per-step exponential decay rate.
min_eps, max_eps = 0.1, 1
epsilon = max_eps
LAMBDA = 0.001

# Discount factor applied to the best next-state value in the training target.
Gamma = 0.99

# Replay memory (capped at mem_allowed entries) and the minibatch size
# sampled from it on every environment step.
mem_all = []
mem_allowed = 100000
batch_size = 100

# Initial observation/action sizes; the model-building cell overwrites them
# with the values reported by the environment.
state_num, action_num = 8, 4

# Global count of environment steps taken so far (drives the epsilon decay).
step = 0

Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [2]:
def explore_Greedy(epsilon,v):
    """Choose an action index with an epsilon-greedy rule.

    Args:
        epsilon: Exploration threshold compared against ``np.random.rand()``.
        v: Array-like of per-action values.

    Returns:
        A random index in ``[0, env.action_space.n)`` when the random draw is
        below ``epsilon``; otherwise ``np.argmax(v)``.
    """
    if np.random.rand() < epsilon:
        # Explore: the action count comes from the notebook-level ``env``.
        return np.random.randint(0, env.action_space.n)
    # Exploit. The former trailing ``return k`` was unreachable and referred
    # to a name that is never defined, so it has been removed.
    return np.argmax(v)

def eps_reduce(epsilon,min_eps,max_eps,step,decay=None):
    """Return the exploration rate for a given step count.

    Computes ``min_eps + (max_eps - min_eps) * exp(-decay * step)``, i.e. the
    value equals ``max_eps`` at ``step == 0`` and, for a positive ``decay``,
    approaches ``min_eps`` as ``step`` grows.

    Args:
        epsilon: Current exploration rate. Accepted for call compatibility
            only; the result does not depend on it.
        min_eps: Value the schedule decays toward.
        max_eps: Value of the schedule at step 0.
        step: Number of steps taken so far.
        decay: Exponential decay rate. When ``None`` (the default) the
            notebook-level ``LAMBDA`` is used, matching the original behaviour.

    Returns:
        The scheduled exploration rate as a float.
    """
    if decay is None:
        decay = LAMBDA
    return min_eps + (max_eps - min_eps) * math.exp(-decay * step)

def predict_2(model,data):
    """Run ``model.predict`` on a single observation and flatten the result.

    The observation is reshaped to one row whose width is inferred from the
    data itself, so the helper no longer depends on the notebook-level
    ``state_num`` being in sync with the environment.

    Args:
        model: Object exposing ``predict(batch)`` that returns an array.
        data: One observation as a 1-D array-like.

    Returns:
        The prediction for that observation as a flat 1-D array.
    """
    single_row = np.asarray(data).reshape(1, -1)
    return model.predict(single_row).flatten()


def experience(exp_all,sample):
    """Append ``sample`` to ``exp_all`` in place.

    Returns whatever ``exp_all.append`` returns, which is ``None`` for a
    plain list; the container itself is not returned.
    """
    append_result = exp_all.append(sample)
    return append_result

In [None]:
# Take the observation width and the number of discrete actions from the
# environment instead of relying on the constants set in the first cell.
state_num  = env.env.observation_space.shape[0]
action_num = env.env.action_space.n

# Q-network: two ReLU hidden layers (120 and 60 units) and a linear output
# with one value per action. The layer width is passed positionally because
# the ``output_dim=`` keyword is the Keras 1 spelling (Keras 2 names it
# ``units``); the positional form is accepted by both.
model = Sequential()
model.add(Dense(120, activation='relu', input_dim=state_num))
model.add(Dense(60, activation='relu'))
model.add(Dense(action_num, activation='linear'))
opt = RMSprop(lr=0.00025)
model.compile(loss='mse', optimizer=opt)

# Per-episode total reward, filled in by the training loop.
Reward_cum=np.zeros(1500)

# Train the Q-network with experience replay: after every environment step a
# minibatch is sampled from ``mem_all`` and the network is fitted once on
# targets built from its own current predictions.

# Stand-in fed to the network for transitions that have no successor state;
# its prediction is never used in a target (see the ``is None`` branch below).
no_next_state = np.zeros(state_num)

for episode in range(1500):
    state = env.reset()
    R = 0  # total reward collected in this episode

    while True:
        action = explore_Greedy(epsilon, predict_2(model, state))
        state2, reward, done, info = env.step(action)

        if done:
            # Terminal transitions are stored without a next state.
            state2 = None

        # Store the transition and drop the oldest one once the cap is exceeded.
        mem_all.append((state, action, reward, state2))
        if len(mem_all) > mem_allowed:
            mem_all.pop(0)

        state = state2
        epsilon = eps_reduce(epsilon, min_eps, max_eps, step)
        R += reward
        step += 1

        # Sample without replacement; early on the memory may hold fewer than
        # ``batch_size`` transitions, so the sample size is capped.
        batch = random.sample(mem_all, min(batch_size, len(mem_all)))
        states_now = np.array([sample[0] for sample in batch])
        states_next = np.array(
            [no_next_state if sample[3] is None else sample[3] for sample in batch]
        )

        # Start from the network's own predictions so that only the entry of
        # the action actually taken differs from the current output.
        targets = model.predict(states_now)
        predict_next = model.predict(states_next)

        for i, sample in enumerate(batch):
            action_taken, reward_received, next_state = sample[1], sample[2], sample[3]
            if next_state is None:
                targets[i, action_taken] = reward_received
            else:
                targets[i, action_taken] = reward_received + Gamma * np.amax(predict_next[i])

        # ``epochs`` is the Keras 2 name of the former ``nb_epoch`` argument.
        model.fit(states_now, targets, batch_size=batch_size, epochs=1, verbose=0)

        if done:
            break

    Reward_cum[episode] = R
    print("Reward for episode %s is %s" % (episode, R))
    if episode > 98:
        # Mean over the latest 100 episodes (indices episode-99 .. episode).
        rm = np.mean(Reward_cum[episode-99:episode+1])
        print("Rolling mean is %s" %(rm))
model.save("lunarlander_weights_pzou3.h5")


  """
  
  import sys


Reward for episode 0 is -232.43461347077667
Reward for episode 1 is -274.0682893238844
Reward for episode 2 is -159.8533428077896
Reward for episode 3 is -131.94620598972884
Reward for episode 4 is -96.5090426175518
Reward for episode 5 is -457.52863069343346
Reward for episode 6 is -239.5101957434258
Reward for episode 7 is -125.03536084271764
Reward for episode 8 is -129.06583763086576
Reward for episode 9 is -72.5252281584031
Reward for episode 10 is -102.24796280066296
Reward for episode 11 is -139.87231554166542
Reward for episode 12 is 83.62454598275161
Reward for episode 13 is -147.56719920678654
Reward for episode 14 is -237.9337072125185
Reward for episode 15 is -264.37613398392864
Reward for episode 16 is -403.60473431845145
Reward for episode 17 is -480.5612094737238
Reward for episode 18 is -292.66353812811496
Reward for episode 19 is -331.1211833750699
Reward for episode 20 is -635.5593732963673
Reward for episode 21 is -668.276019853509
Reward for episode 22 is -620.01573

In [None]:

# Save the trained network, rebuild an identical architecture, load the saved
# weights into it and run 100 greedy (no exploration) evaluation episodes.
model.save("lunarlander_weights_pzou3.h5")

R=np.zeros(100)  # total reward of each evaluation episode
env_ = gym.make("LunarLander-v2")

# Same layout as the training network. The layer width is passed positionally
# because ``output_dim=`` is the Keras 1 keyword (Keras 2 names it ``units``).
model1 = Sequential()
model1.add(Dense(120, activation='relu', input_dim=state_num))
model1.add(Dense(60, activation='relu'))
model1.add(Dense(action_num, activation='linear'))
opt = RMSprop(lr=0.00025)
model1.compile(loss='mse', optimizer=opt)
model1.load_weights("lunarlander_weights_pzou3.h5")

for j in range(100):
    s = env_.reset()
    done = False
    while not done:
        # Always take the action with the highest predicted value.
        a = np.argmax(predict_2(model1, s))
        s, r, done, info = env_.step(a)
        R[j] += r
    print("Test Reward:", R[j])

In [None]:
## training plot: total reward of each of the 1500 training episodes
x1 = range(1, 1501)
ax_train = plt.gca()
ax_train.plot(x1, Reward_cum)
ax_train.set_xlabel('Training episode #')
ax_train.set_ylabel('Reward')
ax_train.set_title('Training process with 120-60 structure')
plt.show()

## testing plot: reward of each of the 100 evaluation runs, annotated with
## their mean
x = range(1, 101)
avg_r = np.mean(R)
ax_test = plt.gca()
ax_test.plot(x, R)
ax_test.set_xlabel('Run #')
ax_test.set_ylabel('Reward')
ax_test.set_title('Testing process')
ax_test.text(20, 80, 'Average reward:')
ax_test.text(20, 60, avg_r)
plt.show()

In [None]:
# Mean of the rewards stored in R, printed as a single number.
avg_r = np.asarray(R).mean()
print(avg_r)