In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


Using Theano backend.


In [49]:
class Agent:
    """Epsilon-greedy deep Q-learning agent with an experience-replay buffer.

    The Q-function is approximated by a small fully connected Keras network
    (two hidden ReLU layers of 24 units, linear output with one unit per
    action) trained with Adam on a mean-squared-error loss.

    Exploration follows an exponentially decaying epsilon:
    ``epsilon = min_eps + (max_eps - min_eps) * exp(-decay * steps)``,
    where ``steps`` counts completed calls to :meth:`batch_run`.
    """

    def __init__(self, state_size, action_size, memory=20000,
                 gamma=0.99, max_eps=1, min_eps=0.1, decay=0.0001,
                 lr=0.00025, batch_size=32):
        """Build the replay buffer and the Q-network.

        Args:
            state_size: Length of the observation vector fed to the network.
            action_size: Number of discrete actions (network output width).
            memory: Maximum number of transitions kept in the replay buffer;
                the oldest transitions are dropped once it is full.
            gamma: Discount factor applied to the bootstrapped next-state value.
            max_eps: Initial (and maximum) exploration rate.
            min_eps: Asymptotic minimum exploration rate.
            decay: Rate of the exponential epsilon decay per training step.
            lr: Learning rate passed to the Adam optimizer.
            batch_size: Maximum number of transitions sampled per training step.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory)

        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps

        self.decay = decay
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.steps = 0

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(self.state_size,),
                             activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        # Linear output: one unbounded Q-value estimate per action.
        self.model.add(Dense(self.action_size, activation='linear'))

        self.model.compile(optimizer=Adam(lr=lr), loss='mse')

    def act(self, state):
        """Choose an action for ``state`` using the epsilon-greedy policy.

        Args:
            state: Observation in the batch-of-one layout the model's
                ``predict`` accepts (the training loop passes shape
                ``(1, state_size)``).

        Returns:
            A uniformly random action index with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        # explore
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        # exploit
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def add_memory(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition
        to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def batch_run(self):
        """Run one training step on a random minibatch and decay epsilon.

        Samples up to ``batch_size`` transitions, builds the Q-learning
        targets ``reward + gamma * max_a Q(next_state, a)`` (just ``reward``
        for terminal transitions) and fits the network for one epoch.

        Does nothing when the replay buffer is empty, so it is safe to call
        before any transition has been stored.
        """
        if not self.memory:
            # Nothing to learn from yet; fitting on empty arrays would fail.
            return

        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        # Stored states carry a leading batch-of-one axis; row 0 is the
        # observation itself, so stacking rows yields (batch, state_size).
        states = np.array([transition[0][0] for transition in batch])
        actions = np.array([transition[1] for transition in batch], dtype=int)
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        next_states = np.array([transition[3][0] for transition in batch])
        dones = np.array([transition[4] for transition in batch], dtype=bool)

        # Two batched forward passes instead of two per sampled transition.
        next_q = self.model.predict(next_states)
        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future value to bootstrap from.
        new_values = np.where(dones, rewards, bootstrapped)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = self.model.predict(states)
        targets[np.arange(batch_size), actions] = new_values

        self.model.fit(states, targets, epochs=1, verbose=0)

        self.steps += 1
        self.epsilon = self.min_eps + (self.max_eps - self.min_eps) \
            * math.exp(-self.decay * self.steps)
            
            

In [50]:
# Create the environment and size the agent's network from its spaces:
# observation vector length -> network input, discrete action count -> output.
env = gym.make('LunarLander-v2')
state_size, action_size = env.observation_space.shape[0], env.action_space.n
agent = Agent(state_size=state_size, action_size=action_size)

In [None]:
# NOTE: `batch_size` is not read by this loop; the agent samples using its own
# `batch_size` attribute. Both names are kept in case other cells use them.
done = False
batch_size = 32
EPISODES = 1000
MAX_STEPS_PER_EPISODE = 150000  # hard cap on environment steps per episode

for e in range(EPISODES):
    # Assumes the classic gym API: reset() returns only the observation and
    # step() returns a 4-tuple.
    state = env.reset()
    # The agent expects a leading batch-of-one axis on every state.
    state = np.reshape(state, [1, state_size])
    R = 0  # undiscounted return accumulated over this episode
    for step in range(MAX_STEPS_PER_EPISODE):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        R += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {}".format(e, EPISODES, R, agent.epsilon))
            break
        # One replay-training step (and epsilon decay) per non-terminal env step.
        agent.batch_run()

episode: 0/1000, score: -126.44150338966415, e: 0.992739444945014
episode: 1/1000, score: -110.61753218465195, e: 0.9819141920283637
episode: 2/1000, score: -117.34672464773749, e: 0.9747120647315229
episode: 3/1000, score: -169.19005087758796, e: 0.9662683758003969
episode: 4/1000, score: -507.2320467177176, e: 0.9567059663831154
episode: 5/1000, score: -105.53884188723777, e: 0.9493698854349543
episode: 6/1000, score: 27.45003860488046, e: 0.9429391420836651
episode: 7/1000, score: -168.20225442528064, e: 0.9328010376890603
episode: 8/1000, score: -277.6675572179824, e: 0.926247828748743
episode: 9/1000, score: -82.74165704138443, e: 0.9199101518368628
episode: 10/1000, score: -90.79308249391228, e: 0.9094012390972809
episode: 11/1000, score: -296.4291911228055, e: 0.9017483361708779
episode: 12/1000, score: -84.7580894164775, e: 0.8957577165541621
episode: 13/1000, score: -121.11261071638313, e: 0.8881549939326455
episode: 14/1000, score: -113.48195233256831, e: 0.8803127207115923
e