In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


Using Theano backend.


In [2]:
class Agent:
    """Deep Q-learning agent with replay memory and an epsilon-greedy policy.

    The Q-function is a fully connected Keras network: two hidden layers of
    24 ReLU units followed by one linear output per action, compiled with the
    Adam optimizer and a mean-squared-error loss.
    """

    def __init__(self, state_size, action_size, memory=20000,
                 gamma=0.99, max_eps=1, min_eps=0.1, decay=0.001,
                 lr=0.00025, batch_size=32):
        """Build the Q-network and initialise the exploration schedule.

        Args:
            state_size: Number of inputs of the network (length of one state).
            action_size: Number of discrete actions (network outputs).
            memory: Maximum number of transitions kept in the replay memory;
                the oldest ones are dropped first.
            gamma: Discount factor applied to the bootstrapped next-state value.
            max_eps: Initial exploration rate.
            min_eps: Value the exploration rate decays towards.
            decay: Exponential decay rate of epsilon per training step.
            lr: Learning rate handed to the Adam optimizer.
            batch_size: Maximum number of transitions sampled per training step.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory)

        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps

        self.decay = decay
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.steps = 0  # number of completed training steps; drives epsilon decay

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(self.state_size,),
                             activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(self.action_size, activation='linear'))

        self.model.compile(optimizer=Adam(lr=lr), loss='mse')

    def act(self, state, test=False):
        """Choose an action for ``state``.

        Args:
            state: Batch holding a single state, in the form accepted by
                ``self.model.predict`` (the training loop passes shape
                ``(1, state_size)``).
            test: When true, always return the greedy action and never
                explore (no random number is drawn).

        Returns:
            The index of the chosen action.
        """
        # explore (only outside test mode)
        if not test and np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        # exploit: the outputs are Q-value estimates, pick the largest
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def add_memory(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def batch_run(self):
        """Run one training step on a random minibatch and decay epsilon.

        Up to ``self.batch_size`` transitions are sampled from memory. For each
        one, the target of the action that was taken is ``reward`` if the
        transition ended the episode and ``reward + gamma * max_a Q(next_state,
        a)`` otherwise; the targets of the other actions are the network's
        current predictions, so they contribute no error.

        Does nothing (no fit, no epsilon decay) when the memory is empty.
        """
        if not self.memory:
            return

        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        # Stack the sampled transitions column-wise so the network is queried
        # twice per training step instead of twice per transition.
        states = np.vstack([transition[0] for transition in batch])
        actions = np.array([transition[1] for transition in batch], dtype=int)
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        next_states = np.vstack([transition[3] for transition in batch])
        dones = np.array([transition[4] for transition in batch], dtype=bool)

        next_q = self.model.predict(next_states)
        targets = np.array(self.model.predict(states))  # private, writable copy

        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future value: the target is the reward.
        targets[np.arange(len(batch)), actions] = np.where(
            dones, rewards, bootstrapped)

        self.model.fit(states, targets, epochs=1, verbose=0)

        self.steps += 1
        self.epsilon = self.min_eps + (self.max_eps - self.min_eps) \
            * math.exp(-self.decay * self.steps)
            

In [3]:
# Create the LunarLander environment and size the agent from it: one network
# input per observation component and one output per discrete action.
env = gym.make('LunarLander-v2')
state_size, action_size = env.observation_space.shape[0], env.action_space.n
agent = Agent(state_size=state_size, action_size=action_size)

In [None]:
EPISODES = 1000       # number of training episodes
TEST_EPISODES = 200   # number of greedy evaluation episodes
MAX_STEPS = 150000    # hard cap on the number of steps in one episode


def run_episode(env, agent, state_size, train=True, max_steps=MAX_STEPS):
    """Play one episode and return the sum of the rewards collected.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done, info)``.
        agent: Agent exposing ``act``, ``add_memory`` and ``batch_run``.
        state_size: Length of one observation; every observation is reshaped
            to ``(1, state_size)`` before it is handed to the agent.
        train: When true, act epsilon-greedily, store every transition and
            run one training step after each non-terminal step. When false,
            act greedily and leave the agent's memory, weights and epsilon
            untouched.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        The undiscounted total reward of the episode.
    """
    state = np.reshape(env.reset(), [1, state_size])
    score = 0
    for _step in range(max_steps):
        # env.render()
        action = agent.act(state, test=not train)
        next_state, reward, done, _info = env.step(action)
        score += reward
        next_state = np.reshape(next_state, [1, state_size])
        if train:
            agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
        if train:
            agent.batch_run()
    return score


# Training: explore with the decaying epsilon and learn from replayed batches.
for e in range(EPISODES):
    R = run_episode(env, agent, state_size, train=True)
    print("episode: {}/{}, score: {}, e: {}".format(e, EPISODES, R, agent.epsilon))

# Evaluation: greedy policy only, no learning and no new memories.
for e in range(TEST_EPISODES):
    R = run_episode(env, agent, state_size, train=False)
    print("episode: {}/{}, score: {}, e: {}".format(e, TEST_EPISODES, R, agent.epsilon))

episode: 0/1000, score: -135.19624625509954, e: 0.9308047117479722
episode: 1/1000, score: -265.39557394052474, e: 0.874637178782552
episode: 2/1000, score: -266.656369274285, e: 0.7974248481649728
episode: 3/1000, score: -440.26747127970816, e: 0.6591371340227155
episode: 4/1000, score: -273.74526370339424, e: 0.6244734271365907
episode: 5/1000, score: -314.45317671673934, e: 0.5856046246719949
episode: 6/1000, score: -387.9531058354748, e: 0.5289733360746749
episode: 7/1000, score: -195.3737681243113, e: 0.4900967414348538
episode: 8/1000, score: -45.838132707307864, e: 0.46154798208782744
episode: 9/1000, score: -75.87697694523476, e: 0.42583618768913867
episode: 10/1000, score: -58.85719218492597, e: 0.3948287747713206
episode: 11/1000, score: -64.73013909516713, e: 0.3702627849664549
episode: 12/1000, score: -10.161406385935337, e: 0.3514878713992665
episode: 13/1000, score: -157.59068031287478, e: 0.323943718440868
episode: 14/1000, score: -162.2570132472859, e: 0.271639228171152