In [56]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


In [57]:
class Agent:
    """Epsilon-greedy Q-learning agent with a replay memory and a Keras MLP.

    The network maps a state vector of length ``state_size`` to one Q-value
    per action. Transitions are stored with ``add_memory`` and replayed in
    random mini-batches by ``batch_run``, which also decays ``epsilon``
    exponentially from ``max_eps`` towards ``min_eps``.

    States handed to ``act`` / ``add_memory`` are expected to be arrays of
    shape ``(1, state_size)`` (the code indexes row 0 of both the state and
    the prediction).
    """

    def __init__(self, state_size, action_size, memory=20000,
                 gamma=0.99, max_eps=1, min_eps=0.1, decay=0.001,
                 lr=0.00025, batch_size=32):
        """Build the Q-network and initialise exploration / replay state.

        Args:
            state_size: Length of the observation vector (network input).
            action_size: Number of discrete actions (network output).
            memory: Maximum number of transitions kept in the replay deque.
            gamma: Discount factor applied to the bootstrapped next-state value.
            max_eps: Initial exploration rate.
            min_eps: Asymptotic lower bound of the exploration rate.
            decay: Exponential decay rate of epsilon per ``batch_run`` call.
            lr: Learning rate passed to the Adam optimizer.
            batch_size: Upper bound on transitions replayed per ``batch_run``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory)

        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps

        self.decay = decay
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.steps = 0  # number of completed batch_run() updates

        # Two hidden ReLU layers of 24 units, linear Q-value output.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(self.state_size,),
                             activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(self.action_size, activation='linear'))

        self.model.compile(optimizer=Adam(lr=lr), loss='mse')

    def act(self, state, test=False):
        """Choose an action for ``state``.

        Args:
            state: Array of shape ``(1, state_size)``.
            test: When true, always act greedily (no exploration).

        Returns:
            Index of the chosen action: a uniformly random action with
            probability ``epsilon`` (unless ``test``), otherwise the argmax of
            the predicted Q-values.
        """
        # explore (never while testing; the RNG is not consumed in that case)
        if not test and np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        # exploit
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def add_memory(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off)."""
        self.memory.append((state, action, reward, next_state, done))

    def batch_run(self):
        """Fit the network on one random mini-batch and decay epsilon.

        For each sampled transition the target for the taken action is
        ``reward`` if the transition is terminal, otherwise
        ``reward + gamma * max_a Q(next_state, a)``; the other actions keep the
        network's current predictions. Does nothing when the memory is empty.
        """
        if not self.memory:
            # Nothing to replay yet; fitting on empty arrays would fail.
            return

        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        # Stack the batch so the network is queried twice in total instead of
        # twice per transition.
        states = np.array([transition[0][0] for transition in batch])
        actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        next_states = np.array([transition[3][0] for transition in batch])
        dones = np.array([bool(transition[4]) for transition in batch])

        next_q = self.model.predict(next_states)
        targets = self.model.predict(states)

        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future value to bootstrap from.
        action_targets = np.where(dones, rewards, bootstrapped)
        targets[np.arange(len(batch)), actions] = action_targets

        self.model.fit(states, targets, epochs=1, verbose=0)

        self.steps += 1
        self.epsilon = self.min_eps + (self.max_eps - self.min_eps) \
            * math.exp(-self.decay * self.steps)
            
            

In [58]:
# Create the LunarLander environment and size the agent from its spaces:
# the first dimension of the observation shape and the number of actions.
env = gym.make('LunarLander-v2')
state_size, action_size = env.observation_space.shape[0], env.action_space.n
agent = Agent(state_size=state_size, action_size=action_size)

In [None]:
EPISODES = 1000        # training episodes (epsilon-greedy)
TEST_EPISODES = 200    # evaluation episodes (greedy actions)
MAX_STEPS = 150000     # upper bound on steps within a single episode


def run_episode(episode, total, test=False):
    """Play one episode with the global ``env`` / ``agent`` and return its score.

    Args:
        episode: Index of the episode, used only in the progress line.
        total: Number of episodes in the current phase, used only for printing.
        test: Forwarded to ``agent.act``; when true the agent acts greedily.

    Returns:
        The sum of rewards collected during the episode.
    """
    state = np.reshape(env.reset(), [1, state_size])
    score = 0
    for _ in range(MAX_STEPS):
        # env.render()
        action = agent.act(state, test)
        next_state, reward, done, _info = env.step(action)
        score += reward
        next_state = np.reshape(next_state, [1, state_size])
        # NOTE(review): transitions are stored and the network is updated in
        # the test phase too, as in the original loop - confirm whether
        # evaluation is meant to keep learning.
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {}".format(episode, total, score, agent.epsilon))
            break
        agent.batch_run()
    return score


# Training phase.
for e in range(EPISODES):
    run_episode(e, EPISODES)

# Evaluation phase: pass test=True explicitly (the bare name `test` was undefined).
for e in range(TEST_EPISODES):
    run_episode(e, TEST_EPISODES, test=True)

episode: 0/1000, score: -62.33958035658112, e: 0.9391544379153535
episode: 1/1000, score: -386.66609258636475, e: 0.8684648037716335
episode: 2/1000, score: -129.89432015880783, e: 0.8108026065395224
episode: 3/1000, score: -396.9876452124879, e: 0.7254020752686196
episode: 4/1000, score: -396.74961775916756, e: 0.6653215946706766
episode: 5/1000, score: -71.0394358849822, e: 0.6120359120612096
episode: 6/1000, score: -442.54568480602416, e: 0.5514184621594499
episode: 7/1000, score: -202.1437109850732, e: 0.4466774342509796
episode: 8/1000, score: -225.21012062170087, e: 0.40809086734940914
episode: 9/1000, score: -142.74011919284084, e: 0.33707931872899877
episode: 10/1000, score: -306.882126933604, e: 0.30693293666805144
episode: 11/1000, score: -283.1836767591879, e: 0.2829832764825576
episode: 12/1000, score: -453.9006648828327, e: 0.2367828679858308
episode: 13/1000, score: -171.8930294567121, e: 0.20224733830368735
episode: 14/1000, score: -249.03161087056074, e: 0.1752935718018

episode: 128/1000, score: -103.72951961541685, e: 0.1
episode: 129/1000, score: -101.25277702823482, e: 0.1
episode: 130/1000, score: -147.19285033464803, e: 0.1
episode: 131/1000, score: -116.69676099797023, e: 0.1
episode: 132/1000, score: -68.27911426350248, e: 0.1
episode: 133/1000, score: -110.56532809137148, e: 0.1
episode: 134/1000, score: -96.2122878050814, e: 0.1
episode: 135/1000, score: -256.07751474383934, e: 0.1
episode: 136/1000, score: -506.05621324466676, e: 0.1
episode: 137/1000, score: -277.2269612098265, e: 0.1
episode: 138/1000, score: -460.5817815861872, e: 0.1
episode: 139/1000, score: -461.14919927956686, e: 0.1
episode: 140/1000, score: -99.47695145310544, e: 0.1
episode: 141/1000, score: -366.3259601891566, e: 0.1
episode: 142/1000, score: -378.6306550823891, e: 0.1
episode: 143/1000, score: -249.56264028461138, e: 0.1
episode: 144/1000, score: -123.23732487137934, e: 0.1
episode: 145/1000, score: -427.6810255912093, e: 0.1
episode: 146/1000, score: -167.860378

In [None]:
def running():
    """Train the global ``agent`` on ``env`` and then evaluate it.

    Runs 1000 epsilon-greedy training episodes followed by 200 episodes in
    which ``agent.act`` is called with ``test=True`` (greedy actions). One
    progress line is printed for every episode that ends with ``done``.

    Returns:
        A NumPy array of length 200 holding the score (sum of rewards) of each
        evaluation episode.
    """
    EPISODES = 1000
    TEST_EPISODES = 200
    MAX_STEPS = 150000

    def _play(episode, total, test):
        """Play a single episode and return its accumulated reward."""
        state = np.reshape(env.reset(), [1, state_size])
        score = 0
        for _ in range(MAX_STEPS):
            # env.render()
            action = agent.act(state, test)
            next_state, reward, done, _info = env.step(action)
            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            # NOTE(review): the agent keeps storing transitions and fitting
            # during evaluation, as in the original loop - confirm intent.
            agent.add_memory(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {}".format(episode, total, score, agent.epsilon))
                break
            agent.batch_run()
        return score

    for e in range(EPISODES):
        _play(e, EPISODES, False)

    RR = np.zeros(TEST_EPISODES)
    for e in range(TEST_EPISODES):
        # test=True replaces the undefined bare name `test`.
        RR[e] = _play(e, TEST_EPISODES, True)

    # Hand the evaluation scores back instead of discarding them.
    return RR