In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


Using Theano backend.


In [2]:
class Agent:
    
    def __init__(self, state_size, action_size, memory=20000, 
                 gamma=0.99, max_eps=1, min_eps=0.1, decay=0.001,
                 lr=0.00025, batch_size=32):
        
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory)
        
        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps
        
        self.decay = decay
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.steps = 0
                
        self.model = Sequential()
        self.model.add(Dense(400, input_shape=(self.state_size,),
                            activation = 'relu'))
        self.model.add(Dense(200, activation='relu'))
        self.model.add(Dense(self.action_size, activation='linear'))
        

        self.model.compile(optimizer=Adam(lr=lr), loss='mse')
        
        
        
    def act(self, state, test=False):
        
        if test:
            probs = self.model.predict(state)
            return np.argmax(probs[0])         
        
        # explore
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        
        # exploit
        else:
            probs = self.model.predict(state)
            return np.argmax(probs[0])
        
    def add_memory(self, state, action, reward, next_state, done):
        
        self.memory.append((state, action, reward, next_state, done))
        
    
    def batch_run(self):
        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)
        
        states = []
        targets = []
        
        for i in range(len(batch)):
            state = batch[i][0]
            action = batch[i][1]
            reward = batch[i][2]
            next_state = batch[i][3]
            done = batch[i][4]
            
            probs = self.model.predict(next_state)[0]
            target_n = reward + self.gamma * np.amax(probs)
            
            if done:
                target_n = reward
                
            target = self.model.predict(state)
            target[0][action] = target_n
            
            states.append(state[0])
            targets.append(target[0])
            
        self.model.fit(np.array(states), np.array(targets), epochs=1, verbose=0)
        
        self.steps += 1
        self.epsilon = self.min_eps + (self.max_eps - self.min_eps \
                                      ) * math.exp(-self.decay * self.steps)
            
            

In [3]:
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(state_size, action_size)

In [None]:

done = False
batch_size = 32
TRAIN = 1000
r = np.zeros(TRAIN)
for e in range(TRAIN):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    R = 0
    for time in range(150000):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        R += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(e, R, reward, agent.epsilon)
            r[e] = R

            break
        agent.batch_run()

TEST = 100
r_test = np.zeros(TEST)
for e in range(TEST):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    R = 0
    for time in range(150000):
        tet = True
        # env.render()
        action = agent.act(state, test=True)
        next_state, reward, done, _ = env.step(action)
        R += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(e, R, reward, agent.epsilon)
            r_test[e] = R

            break
        agent.batch_run()

0 -311.1311495062664 -100 0.9184356410214083
1 -106.72216322838148 -100 0.8547561850033666
2 -112.0670113330076 -100 0.7842988677347938
3 -192.13321330422554 -100 0.7310560988877152
4 -239.37052364494622 -100 0.6784746719099781
5 -145.3440035544929 -100 0.5890157821670499
6 -195.40436680559012 -100 0.5268338226352179
7 -420.7253261585752 -100 0.4800849435220095
8 -331.69698710302373 -100 0.43643156765091984
9 -94.62703420488683 -100 0.37517156154884057
10 -93.16218453485034 -100 0.3130218544130568
11 -73.4316016981215 -1.7982364810483518 0.17844476631578318
12 -76.33710921263473 -1.749647618298613 0.12888708944578908
13 -57.438678065121465 -2.8582463038041355 0.11063759860396351
14 -99.593279264369 1.4823482817948854 0.10391726914099135
15 -267.01816656139897 -100 0.1014939068917623
16 -62.64482104476486 -1.72630796077421 0.10055012748501663
17 -46.19211064334441 2.8913139189662345 0.10020258307357677
18 -24.364471473314165 -1.0582940615690888 0.10007460071132161
19 -18.465694608428198

In [37]:
np.mean(r_test)

269.61857044378775