In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class DQNAgent:
    """Deep Q-learning agent backed by a small convolutional Q-network.

    The agent keeps a bounded replay memory of transitions, picks actions
    with an epsilon-greedy policy, and fits its Keras model towards one-step
    Q-learning targets sampled from that memory.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Shape handed to the first layer as ``input_shape``
                (the caller in this notebook passes
                ``env.observation_space.shape``).
            action_size: Number of discrete actions; also the width of the
                network's output layer.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Bounded buffer: once 2000 transitions are stored, appending a new
        # one silently discards the oldest.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Architecture: 3x3 convolution (8 filters, 'same' padding) + ReLU,
        2x2 max pooling, flatten, a 16-unit ReLU layer, and a linear output
        layer with one unit per action. Compiled with MSE loss and Adam.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()

        # 3x3 Conv
        model.add(Conv2D(8, (3, 3), padding='same', input_shape=self.state_size))
        model.add(Activation('relu'))

        # 2x2 Pooling
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())

        # FC Layers
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted value for ``state``.

        Args:
            state: Input accepted by ``self.model.predict``; only the first
                row of the prediction is consulted.

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        # Random exploration
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(state)

        # Cast so both branches return the same type (argmax yields a numpy
        # integer, randrange a Python int).
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train on a random minibatch of remembered transitions.

        For every sampled transition the target for the taken action is the
        reward, plus the discounted maximum predicted value of the next state
        when the episode did not end there. The model is fitted one sample at
        a time, so later samples in the batch are evaluated with weights
        already updated by earlier ones. After the batch, ``epsilon`` is
        decayed once while it is still above ``epsilon_min``.

        If the memory holds fewer than ``batch_size`` transitions the call is
        a no-op: nothing is trained and ``epsilon`` is left untouched.

        Args:
            batch_size: Number of transitions to sample without replacement.
        """
        # random.sample raises ValueError when asked for more items than the
        # population holds; wait until enough experience has been collected.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = reward

            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)))

            # Start from the current predictions so that only the taken
            # action's value contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load model weights from the file path ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file path ``name``."""
        self.model.save_weights(name)

In [3]:
# Run-level settings consumed by the training cell.
episodes, batch_size = 50, 32
done = False

# Build the environment and size the agent from its observation/action spaces.
env = gym.make('MsPacman-v0')
state_size, action_size = env.observation_space.shape, env.action_space.n
agent = DQNAgent(state_size, action_size)

In [None]:
# Play `episodes` games, storing every transition and training from replay
# memory after each step. The try/finally ensures the render window opened by
# env.render() is released even if the cell is interrupted part-way through.
try:
    for e in range(episodes):
        state = env.reset()
        # Add a leading batch axis before the frame is handed to the model.
        state = np.expand_dims(state, axis=0)

        # Upper bound on steps per episode; the loop normally exits on `done`.
        for time in range(20000):
            env.render()

            # Transition Dynamics
            action = agent.act(state)
            print("action{}: {}"
                  .format(time, action))
            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(next_state, axis=0)

            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("episode: {}/{}, time: {}, e: {:.5}"
                      .format(e, episodes, time, agent.epsilon))
                break

            # Start learning only once the memory holds more than one batch.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
finally:
    env.close()

action0: 1
action1: 5
action2: 1
action3: 8
action4: 7
action5: 7
action6: 8
action7: 8
action8: 1
action9: 7
action10: 4
action11: 8
action12: 8
action13: 8
action14: 2
action15: 6
action16: 0
action17: 7
action18: 0
action19: 7
action20: 0
action21: 8
action22: 8
action23: 8
action24: 6
action25: 8
action26: 0
action27: 8
action28: 8
action29: 7
action30: 1
action31: 5
action32: 8
action33: 1
action34: 0
action35: 8
action36: 1
action37: 3
action38: 8
action39: 8
action40: 2
action41: 8
action42: 2
action43: 6
action44: 7
action45: 8
action46: 6
action47: 1
action48: 6
action49: 8
action50: 1
action51: 8
action52: 7
action53: 8
action54: 1
action55: 8
action56: 8
action57: 8
action58: 3
action59: 3
action60: 2
action61: 8
action62: 8
action63: 3
action64: 4
action65: 4
action66: 8
action67: 8
action68: 8
action69: 8
action70: 8
action71: 8
action72: 8
action73: 8
action74: 1
action75: 0
action76: 1
action77: 5
action78: 8
action79: 8
action80: 3
action81: 1
action82: 8
action83: 4
ac