In [1]:
%matplotlib inline
import gym
import matplotlib.pyplot as plt
import numpy as np
import random
from collections import deque
from IPython.display import display
from JSAnimation.IPython_display import display_animation
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from matplotlib import animation

Using TensorFlow backend.


In [2]:
def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    display(display_animation(anim, default_mode='loop'))

In [7]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.9                 # discount rate
        self.epsilon = 1.0               # exploration rate
        self.e_decay = .999              # exploration rate decay
        self.e_min = 0.05                # min exploration rate
        self.learning_rate = 0.1         # learning rate for algorithm
        self.learning_rate_model = 0.01  # learning rate for model (Adam optimizer)
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_size, activation='relu'))
        model.add(Dense(20, activation='relu', kernel_initializer='uniform'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate_model))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return self.act_greedy(state)
    
    def act_greedy(self, state):
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        batch_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, batch_size)
        X = np.zeros((batch_size, self.state_size))
        Y = np.zeros((batch_size, self.action_size))
        for i in range(batch_size):
            state, action, reward, next_state, done = minibatch[i]
            target = self.model.predict(state)[0]
            if done:
                target[action] = -10
            else:
#                 target[action] = reward + self.gamma * \
#                             np.amax(self.model.predict(next_state)[0])
                target[action] = target[action] + self.learning_rate * \
                    (reward + self.gamma * np.amax(self.model.predict(next_state)[0] - target[action]))
            X[i], Y[i] = state, target
        self.model.fit(X, Y, batch_size=batch_size, epochs=1, verbose=0)
        if self.epsilon > self.e_min:
            self.epsilon *= self.e_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [18]:
def train(env):
    EPISODES = 1000
    FRAMES_PER_EPISODE = 1000

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    for e in range(1, EPISODES + 1):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        scores = []
        for time in range(FRAMES_PER_EPISODE):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done or time == FRAMES_PER_EPISODE - 1:
                scores.append(time)
                if e % (EPISODES / 20) == 0:
                    print("episode: {}/{}, avg score: {}, exploration rate: {:.2}"
                          .format(e, EPISODES, np.average(scores), agent.epsilon))
                break
        agent.replay(32)
        
    return agent

In [19]:
def evaluate(env, agent, render):
    FRAMES = 500

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    
    state = env.reset()
    frames = []
    for e in range(FRAMES):
        if render:
            frames.append(env.render(mode = 'rgb_array'))
        state = np.reshape(state, [1, state_size])
        action = agent.act_greedy(state)
        state, reward, done, _ = env.step(action)
        if done or e == FRAMES - 1:
            print("score: {}"
                  .format(e))
            break

    if render:
        env.render(close=True)
        display_frames_as_gif(frames)

In [30]:
env.render(close=True)

In [20]:
env = gym.make('CartPole-v1')

[2017-05-24 09:11:06,647] Making new env: CartPole-v1


In [21]:
agent = train(env)

episode: 50/1000, avg score: 8.0, exploration rate: 0.95
episode: 100/1000, avg score: 15.0, exploration rate: 0.91
episode: 150/1000, avg score: 49.0, exploration rate: 0.86
episode: 200/1000, avg score: 39.0, exploration rate: 0.82
episode: 250/1000, avg score: 8.0, exploration rate: 0.78
episode: 300/1000, avg score: 17.0, exploration rate: 0.74
episode: 350/1000, avg score: 16.0, exploration rate: 0.71
episode: 400/1000, avg score: 29.0, exploration rate: 0.67
episode: 450/1000, avg score: 23.0, exploration rate: 0.64
episode: 500/1000, avg score: 38.0, exploration rate: 0.61
episode: 550/1000, avg score: 27.0, exploration rate: 0.58
episode: 600/1000, avg score: 10.0, exploration rate: 0.55
episode: 650/1000, avg score: 9.0, exploration rate: 0.52
episode: 700/1000, avg score: 70.0, exploration rate: 0.5
episode: 750/1000, avg score: 44.0, exploration rate: 0.47
episode: 800/1000, avg score: 108.0, exploration rate: 0.45
episode: 850/1000, avg score: 26.0, exploration rate: 0.43
e

In [24]:
evaluate(env, agent, False)

score: 120
