In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


Using Theano backend.


In [2]:
class Agent:
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    The network maps a state vector to one value per action: two hidden
    ``Dense`` layers of 24 ReLU units followed by a linear output layer of
    ``action_size`` units, trained with Adam on a mean-squared-error loss.
    Transitions are kept in a bounded replay memory and learned from in
    random mini-batches.
    """

    def __init__(self, state_size, action_size, memory=20000,
                 gamma=0.99, max_eps=1, min_eps=0.1, decay=0.001,
                 lr=0.00025, batch_size=32):
        """Build the replay memory and the Q-network.

        Args:
            state_size: Length of the state vector fed to the network.
            action_size: Number of discrete actions (network outputs).
            memory: Maximum number of stored transitions; older ones are
                dropped once the deque is full.
            gamma: Discount factor applied to the best next-state value.
            max_eps: Initial exploration rate.
            min_eps: Value the exploration rate decays towards.
            decay: Rate of the exponential epsilon decay per learning step.
            lr: Learning rate handed to the Adam optimizer.
            batch_size: Upper bound on the mini-batch size used per update.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory)

        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps

        self.decay = decay
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.steps = 0

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(self.state_size,),
                             activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(self.action_size, activation='linear'))

        self.model.compile(optimizer=Adam(lr=lr), loss='mse')

    def _greedy_action(self, state):
        """Return the index of the highest-valued action for ``state``."""
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def act(self, state, test=False):
        """Choose an action for ``state``.

        Args:
            state: Batch containing a single state; only row 0 of the
                prediction is used (callers in this file pass shape
                ``(1, state_size)``).
            test: When true, always act greedily and draw no random number.

        Returns:
            A random action index with probability ``self.epsilon`` (only
            when ``test`` is false), otherwise the greedy action index.
        """
        # Short-circuit keeps the RNG untouched in test mode.
        if not test and np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        return self._greedy_action(state)

    def add_memory(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def batch_run(self):
        """Fit the network on one random mini-batch and decay epsilon.

        For every sampled transition the target of the taken action is
        ``reward`` if the transition was terminal, otherwise
        ``reward + gamma * max(Q(next_state))``; the targets of the other
        actions are the network's own current predictions. Does nothing
        when the replay memory is empty.
        """
        if not self.memory:
            # Nothing to sample; fitting on empty arrays would fail.
            return

        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        # Stored states carry a leading batch axis of 1; take row 0 so the
        # stacked arrays have one row per transition.
        states = np.array([transition[0][0] for transition in batch])
        actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        next_states = np.array([transition[3][0] for transition in batch])
        dones = np.array([transition[4] for transition in batch], dtype=bool)

        # Two batched forward passes instead of two per transition.
        next_q = self.model.predict(next_states)
        targets = self.model.predict(states)

        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # np.where (rather than multiplying by a mask) keeps terminal
        # targets equal to the reward even if next_q is not finite.
        targets[np.arange(len(batch)), actions] = np.where(
            dones, rewards, bootstrapped)

        self.model.fit(states, targets, epochs=1, verbose=0)

        self.steps += 1
        self.epsilon = self.min_eps + (self.max_eps - self.min_eps) \
            * math.exp(-self.decay * self.steps)
            
            

In [3]:
# Create the LunarLander environment and size an agent from its
# observation vector length and its number of discrete actions.
env = gym.make('LunarLander-v2')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]
agent = Agent(state_size=state_size, action_size=action_size)

In [7]:
def _run_episode(env, agent, state_size, max_steps, test=False):
    """Play one episode and return the reward accumulated over it.

    Every transition is stored in the agent's replay memory, and the agent
    performs one learning step after each non-terminal environment step.
    The episode ends when the environment reports ``done`` or after
    ``max_steps`` steps, whichever comes first.
    """
    state = np.reshape(env.reset(), [1, state_size])
    total_reward = 0
    for _ in range(max_steps):
        # env.render()
        action = agent.act(state, test)
        next_state, reward, done, _info = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
        agent.batch_run()
    return total_reward


def running(batch_size, gamma, lr, episodes=1000, test_episodes=200,
            max_steps=150000):
    """Train a fresh agent on LunarLander-v2, then evaluate it greedily.

    Args:
        batch_size: Mini-batch size passed to ``Agent``.
        gamma: Discount factor passed to ``Agent``.
        lr: Learning rate passed to ``Agent``.
        episodes: Number of training (epsilon-greedy) episodes.
        test_episodes: Number of evaluation (greedy) episodes.
        max_steps: Step cap per episode.

    Returns:
        A NumPy array of length ``test_episodes`` holding the total reward
        of each evaluation episode. An episode cut off by ``max_steps`` is
        scored with the reward accumulated up to that point.
    """
    env = gym.make('LunarLander-v2')
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        agent = Agent(state_size, action_size, batch_size=batch_size,
                      gamma=gamma, lr=lr)

        for e in range(episodes):
            score = _run_episode(env, agent, state_size, max_steps)
            print("episode: {}/{}, score: {}, e: {}".format(
                e, episodes, score, agent.epsilon))

        test_scores = np.zeros(test_episodes)
        for e in range(test_episodes):
            # NOTE(review): as in the original loop, the agent keeps storing
            # transitions and learning during evaluation; only action
            # selection is switched to greedy.
            score = _run_episode(env, agent, state_size, max_steps, test=True)
            print("episode: {}/{}, score: {}, e: {}".format(
                e, test_episodes, score, agent.epsilon))
            test_scores[e] = score

        return test_scores
    finally:
        # Release the environment even if training is interrupted.
        env.close()

In [8]:
# Hyper-parameter values to sweep: learning rates, discount factors and
# mini-batch sizes.
lrs = [0.00025, 0.0005]
gammas = [0.99, 0.95]
batches = [25, 50]

# Collects the result of each run.
a = list()

In [None]:
# Run every (batch size, gamma, learning rate) combination, with batch size
# varying slowest and learning rate fastest, appending each result to `a`
# as soon as that run finishes.
a.extend(
    running(batch_size=batch, gamma=discount, lr=rate)
    for batch in batches
    for discount in gammas
    for rate in lrs
)

episode: 0/1000, score: -173.48172027352234, e: 0.9159840133785289
episode: 1/1000, score: -302.36243869181544, e: 0.8487422234440046
episode: 2/1000, score: -111.27589489543733, e: 0.7884170029925776
episode: 3/1000, score: -326.84168368056106, e: 0.7297952479611499
episode: 4/1000, score: -270.9478873104229, e: 0.6732917579601843
episode: 5/1000, score: -215.12366563571197, e: 0.6187358340539895
episode: 6/1000, score: -325.7981968921416, e: 0.5755132577614519
episode: 7/1000, score: -438.2661834921166, e: 0.5363282221906089
episode: 8/1000, score: -444.6182507229579, e: 0.49638848905539934
episode: 9/1000, score: -235.84081247038856, e: 0.46154798208782744
episode: 10/1000, score: -326.3457763647187, e: 0.431754342673077
episode: 11/1000, score: -459.5348068611722, e: 0.3980897940177829
episode: 12/1000, score: -264.8182128446083, e: 0.36972279956192167
episode: 13/1000, score: -362.05448361937147, e: 0.35224346784143046
episode: 14/1000, score: -354.1704940544349, e: 0.337553951841

episode: 136/1000, score: 127.37701095296589, e: 0.1
episode: 137/1000, score: -27.257986684137563, e: 0.1
episode: 138/1000, score: -6.69720590314968, e: 0.1
episode: 139/1000, score: -54.0421438612441, e: 0.1
episode: 140/1000, score: -32.09040987841634, e: 0.1
episode: 141/1000, score: -89.43874355915655, e: 0.1
episode: 142/1000, score: 6.953151584742307, e: 0.1
episode: 143/1000, score: 17.08191603169702, e: 0.1
episode: 144/1000, score: -146.59617863126067, e: 0.1
episode: 145/1000, score: -172.45355065980002, e: 0.1
episode: 146/1000, score: -58.80704741503926, e: 0.1
episode: 147/1000, score: -121.20322206901022, e: 0.1
episode: 148/1000, score: 77.76957143672433, e: 0.1
episode: 149/1000, score: -113.69434258448966, e: 0.1
episode: 150/1000, score: -71.54978625366923, e: 0.1
episode: 151/1000, score: -54.669436671613006, e: 0.1
episode: 152/1000, score: 81.05557122521752, e: 0.1
episode: 153/1000, score: -220.5208589952004, e: 0.1
episode: 154/1000, score: -134.8536903910868, 