In [1]:
import gym
import numpy as np
import collections
import random
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution

# Turn off TensorFlow 2 eager execution before any Keras model is created, so
# the networks built later in this notebook run in graph mode.
# NOTE(review): presumably done to speed up the many single-sample `predict`
# calls made while acting in the environment — confirm.
disable_eager_execution()
# Disabled smoke test: creates the LunarLander environment and renders 500
# steps driven by randomly sampled actions.
# env = gym.make('LunarLander-v2')
# env.reset()
# for _ in range(500):
#     env.render()
#     env.step(env.action_space.sample())

In [2]:
class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    Two identically structured Keras models are kept: ``q_local`` is the one
    that is trained and used to choose actions, ``q_target`` provides the
    bootstrap values inside :meth:`replay` and only changes when
    :meth:`update` copies the local weights into it.
    """

    # (inclusive upper bound on ``step``, epsilon) pairs, checked in order.
    # Steps above the last bound, and negative steps, get ``EPSILON_FLOOR``.
    EPSILON_SCHEDULE = ((50, 0.5), (100, 0.3), (300, 0.2), (500, 0.1), (1000, 0.05))
    EPSILON_FLOOR = 0.01

    def __init__(self, n_states, n_actions, lr, gamma, ep, min_epsilon, ep_decay):
        """Store the hyper-parameters and build both Q-networks.

        Args:
            n_states: Size of one observation vector (network input width).
            n_actions: Number of discrete actions (network output width).
            lr: Learning rate handed to the Adam optimizer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            ep: Initial exploration rate (epsilon).
            min_epsilon: Stored as ``epsilon_min``; no method of this class
                reads it (the schedule in :meth:`replay` is hard-coded).
            ep_decay: Stored as ``epsilon_decay``; likewise not read here.
        """
        # environment
        self.n_states = n_states
        self.n_actions = n_actions

        # parameters
        self.gamma = gamma  # discount factor
        self.epsilon = ep  # exploration rate
        self.epsilon_min = min_epsilon
        self.epsilon_decay = ep_decay
        self.learning_rate = lr
        # Bounded FIFO buffer: once full, the oldest transitions are dropped.
        self.memory = collections.deque(maxlen=50000)
        self.replay_count = 0  # number of completed training calls

        # local network and target network
        self.q_local = self.build_dqn()
        self.q_target = self.build_dqn()

    def memorize(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def update(self):
        """Copy the weights of ``q_local`` into ``q_target``."""
        self.q_target.set_weights(self.q_local.get_weights())

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the greedy action of ``q_local`` for ``state``.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.n_actions)
        return self.select_action_test(state)

    def select_action_test(self, state):
        """Return the greedy action (no exploration) of ``q_local`` for ``state``.

        ``state`` is passed straight to ``predict`` and the first row of the
        result is used, so it must carry a leading batch axis of size one.
        """
        return np.argmax(self.q_local.predict(state)[0])

    def replay(self, batch_size, step):
        """Train ``q_local`` on a random mini-batch and refresh epsilon.

        Args:
            batch_size: Number of transitions to sample from the buffer.
            step: Progress counter that drives the epsilon schedule
                (see ``EPSILON_SCHEDULE``).

        If the buffer holds fewer than ``batch_size`` transitions nothing is
        sampled or trained and neither ``replay_count`` nor ``epsilon``
        changes.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError; wait for more experience.
            return

        batch = random.sample(self.memory, batch_size)

        # Reshape explicitly instead of np.squeeze: squeeze also removes the
        # batch axis when batch_size == 1 (and the feature axis when
        # n_states == 1), which hands the networks a 1-D array.
        states = np.array([t[0] for t in batch]).reshape(batch_size, self.n_states)
        actions = np.array([t[1] for t in batch])
        rewards = np.array([t[2] for t in batch])
        next_states = np.array([t[3] for t in batch]).reshape(batch_size, self.n_states)
        finishes = np.array([t[4] for t in batch])

        q_vals_next_state = self.q_target.predict_on_batch(next_states)
        # Copy so the in-place edit below never writes into an array that the
        # model may still own.
        q_vals_target = np.array(self.q_local.predict_on_batch(states))

        # Only the entry of the action actually taken is replaced; terminal
        # transitions (done == True) get the plain reward with no bootstrap.
        max_q_values_next_state = np.amax(q_vals_next_state, axis=1)
        q_vals_target[np.arange(batch_size), actions] = (
            rewards + self.gamma * max_q_values_next_state * (1 - finishes)
        )

        # fit() is called without batch_size/epochs, so Keras' defaults apply.
        self.q_local.fit(states, q_vals_target, verbose=0)
        self.replay_count += 1

        self.epsilon = self._scheduled_epsilon(step)

    @classmethod
    def _scheduled_epsilon(cls, step):
        """Look up the exploration rate that the fixed schedule assigns to ``step``."""
        if step >= 0:
            for upper_bound, epsilon in cls.EPSILON_SCHEDULE:
                if step <= upper_bound:
                    return epsilon
        return cls.EPSILON_FLOOR

    def build_dqn(self):
        """Build and compile the Q-function approximator.

        The model is a fully connected network: ``n_states`` inputs, hidden
        ReLU layers of 32 and 64 units, and ``n_actions`` linear outputs,
        compiled with mean-squared-error loss and Adam.
        """
        model = Sequential()
        model.add(Dense(32, input_dim=self.n_states, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.n_actions, activation='linear'))
        # `learning_rate` is the current argument name; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def save_model(self, name):
        """Save ``q_local`` under ``name`` using the Keras model ``save`` method."""
        self.q_local.save(name)

In [3]:
# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
# The first five constants are not referenced by any cell visible in this
# notebook; the notes on them are inferred from their names — TODO confirm
# against the training loop.
MAX_MOVES = 1000          # presumably the per-episode step limit
AVG_REWARD_LEN = 100      # presumably the window of the running reward average
TARGET_AVG_REWARD = 200   # presumably the average reward that counts as solved
BATCH_SIZE = 64           # presumably the replay mini-batch size
TARGET_UPDATE = 20        # presumably how often the target network is synced

# Passed to the DQNAgent constructor.
LEARNING_RATE = 0.001
GAMMA = 0.99
EPSILON = 1.0
MIN_EPSILON = 0.01
EPSILON_DECAY = 0.05

# ---------------------------------------------------------------------------
# Randomization seeds
# ---------------------------------------------------------------------------
random_seed = 0

random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = gym.make("LunarLander-v2")
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]
print(f"There are {n_states} states and {n_actions} actions in the Environment")


There are 8 states and 4 actions in the Environment


In [4]:
# Evaluate a saved model greedily (no exploration) for a few rendered
# episodes, printing and recording the summed reward of each episode.
N_TEST_EPISODES = 10

test_reward = []  # one summed reward per finished evaluation episode
agent_test = DQNAgent(
    n_states,
    n_actions,
    lr=LEARNING_RATE,
    gamma=GAMMA,
    ep=EPSILON,
    min_epsilon=MIN_EPSILON,
    ep_decay=EPSILON_DECAY,
)
# Swap the freshly initialised local network for the one loaded from disk.
agent_test.q_local = keras.models.load_model('dqn_defined_epsilon_update20')
env = gym.make('LunarLander-v2')

try:
    for _episode in range(N_TEST_EPISODES):
        state = env.reset()
        # The agent expects a leading batch axis of size one.
        state = np.reshape(state, [1, n_states])
        t_rewards = 0
        while True:
            env.render()
            action = agent_test.select_action_test(state)
            next_state, reward, done, _info = env.step(action)
            t_rewards += reward
            next_state = np.reshape(next_state, [1, n_states])
            state = next_state
            if done:
                print(f"total reward:{t_rewards}")
                break
        test_reward.append(t_rewards)
finally:
    # Always release the render window, even if an episode raises.
    env.close()




total reward:223.66576802082602
total reward:270.7219101002065
total reward:276.9658156161074
total reward:276.8789445203622
total reward:263.2889928622239
total reward:271.80706707277284
total reward:239.7376037160408
total reward:278.1044890928906
total reward:280.94236002495734
total reward:260.70014736150983
