In [1]:
import gym
import numpy as np
import tensorflow as tf
import random
from collections import deque

In [2]:
# Create the CartPole-v0 environment once; later cells read this module-level
# `env` to size the network, collect training episodes, and play the final game.
env = gym.make('CartPole-v0')

[2017-02-24 19:18:06,751] Making new env: CartPole-v0


In [3]:
# --- Training hyperparameters -------------------------------------------
# Discount factor applied to the best next-state Q value when building targets.
dis = 0.9
# Upper bound on the number of transitions kept in the replay buffer.
REPLAY_MEMORY = 50000

# --- Network dimensions, read from the environment ------------------------
# Length of one observation vector (network input width).
input_size = env.observation_space.shape[0]
# Number of discrete actions (network output width).
output_size = env.action_space.n

In [4]:
class DQN:
    """Small Q-network: one tanh hidden layer followed by a linear output layer.

    The network has no bias terms; it maps a batch of observations of width
    ``input_size`` to ``output_size`` Q values per observation. All graph ops
    are evaluated through the TensorFlow session handed to the constructor.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        """Store the session and dimensions, then build the graph.

        Args:
            session: TensorFlow session used by ``predict`` and ``update``.
            input_size: Width of one observation vector.
            output_size: Number of Q values produced per observation.
            name: Variable-scope name under which the weights are created.
        """
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        self._build_network()

    def _build_network(self, h_size=10, l_rate=1e-1):
        """Create placeholders, weights, the squared-error loss and the Adam train op.

        Args:
            h_size: Number of hidden units.
            l_rate: Learning rate passed to the Adam optimizer.
        """
        with tf.variable_scope(self.net_name):
            # Batch of observations; the batch dimension is left open.
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")

            # Input -> hidden projection, Xavier-initialised.
            hidden_weights = tf.get_variable(
                "W1",
                shape=[self.input_size, h_size],
                initializer=tf.contrib.layers.xavier_initializer(),
            )
            hidden = tf.nn.tanh(tf.matmul(self._X, hidden_weights))

            # Hidden -> Q-value projection, Xavier-initialised.
            output_weights = tf.get_variable(
                "W2",
                shape=[h_size, self.output_size],
                initializer=tf.contrib.layers.xavier_initializer(),
            )

            # Predicted Q values, one column per action.
            self._Qpred = tf.matmul(hidden, output_weights)

        # Target Q values supplied by the caller during training.
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)

        # Mean squared error between targets and predictions.
        squared_error = tf.square(self._Y - self._Qpred)
        self._loss = tf.reduce_mean(squared_error)

        # One optimisation step on the loss.
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        self._train = optimizer.minimize(self._loss)

    def predict(self, state):
        """Return the network's Q values for a single state.

        The state is reshaped to a batch of one row before being fed in, so
        the result is whatever the session returns for a (1, input_size) input.
        """
        batch_of_one = np.reshape(state, (1, self.input_size))
        feed = {self._X: batch_of_one}
        return self.session.run(self._Qpred, feed_dict=feed)

    def update(self, x_stack, y_stack):
        """Run one training step and return the session result for [loss, train op].

        Args:
            x_stack: Batch of input states fed to the observation placeholder.
            y_stack: Matching batch of target Q values.
        """
        fetches = [self._loss, self._train]
        feed = {self._X: x_stack, self._Y: y_stack}
        return self.session.run(fetches, feed_dict=feed)

In [5]:
def simple_replay_train(DQN, train_batch):
    """Build Q-learning targets for a batch of transitions and run one update.

    Args:
        DQN: Network object exposing ``input_size``, ``output_size``,
            ``predict(state)`` and ``update(x_stack, y_stack)``.
        train_batch: Iterable of ``(state, action, reward, next_state, done)``
            tuples.

    Returns:
        Whatever ``DQN.update`` returns for the stacked states and targets.
    """
    # Zero-row seeds fix the column count, so an empty batch still yields
    # correctly shaped (0, width) arrays.
    input_rows = [np.empty((0, DQN.input_size))]
    target_rows = [np.empty((0, DQN.output_size))]

    for transition in train_batch:
        state, action, reward, next_state, done = transition

        # Start from the current predictions so only the taken action's
        # entry differs from what the network already outputs.
        q_row = DQN.predict(state)

        if not done:
            # Bootstrap from the best Q value of the next state, discounted
            # by the module-level factor `dis`.
            target_value = reward + dis * np.max(DQN.predict(next_state))
        else:
            # Terminal transition: there is no next state to bootstrap from.
            target_value = reward
        q_row[0, action] = target_value

        target_rows.append(q_row)
        input_rows.append(state)

    # Train the network on the stacked states and their target Q values.
    return DQN.update(np.vstack(input_rows), np.vstack(target_rows))

In [6]:
def bot_play(mainDQN):
    """Play one episode on the module-level `env`, always taking the greedy action.

    Args:
        mainDQN: Network object whose ``predict(state)`` returns Q values.

    Prints the accumulated reward when the episode ends; returns nothing.
    """
    observation = env.reset()
    total_reward = 0
    finished = False
    while not finished:
        # env.render()
        # Pick the action with the highest predicted Q value.
        best_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(best_action)
        total_reward += step_reward
    print("Total score:{}".format(total_reward))

In [7]:
def main():
    """Train a DQN on the module-level `env`, then play one greedy episode.

    Each episode is played with an epsilon-greedy policy and every transition
    is stored in a bounded replay buffer. Whenever ``episode % 10 == 1`` the
    network is trained on 50 random minibatches drawn from that buffer.
    Training stops after ``max_episodes`` episodes, or earlier if an episode
    exceeds ``max_steps`` steps. The TensorFlow session is always closed on
    exit, including when an exception is raised.
    """
    max_episodes = 5000
    max_steps = 10000        # an episode longer than this ends training early
    batch_size = 10          # transitions per minibatch
    train_iterations = 50    # minibatch updates per training round

    # Bounded deque: once REPLAY_MEMORY transitions are stored, appending a
    # new one silently drops the oldest, so no manual eviction is needed.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    sess = tf.InteractiveSession()
    try:
        mainDQN = DQN(sess, input_size, output_size)
        sess.run(tf.global_variables_initializer())

        for episode in range(max_episodes):
            # Exploration rate shrinks as episodes go by.
            # NOTE(review): `episode / 10` is integer division under Python 2
            # but true division under Python 3, so the decay schedule differs
            # between interpreters -- confirm which one is intended.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    # Explore: random action.
                    action = env.action_space.sample()
                else:
                    # Exploit: greedy action from the Q-network.
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from the environment.
                next_state, reward, done, _ = env.step(action)

                if done:
                    # Large penalty on the step that ends the episode.
                    reward = -100

                # Save the experience to the replay buffer.
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps:
                    break

            print("Episode:{} steps:{}".format(episode, step_count))
            if step_count > max_steps:
                break

            # Train once every 10 episodes (on episodes 1, 11, 21, ...).
            if episode % 10 == 1:
                # random.sample raises ValueError when asked for more items
                # than the population holds, so clamp to the buffer size.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(train_iterations):
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss:", loss)

        bot_play(mainDQN)
    finally:
        # Release the session's resources even if training was interrupted.
        sess.close()

if __name__ == "__main__":
    main()
                

Episode:0 steps:13
Episode:1 steps:22
('Loss:', 492.24115)
Episode:2 steps:18
Episode:3 steps:39
Episode:4 steps:14
Episode:5 steps:38
Episode:6 steps:13
Episode:7 steps:36
Episode:8 steps:18
Episode:9 steps:15
Episode:10 steps:9
Episode:11 steps:10
('Loss:', 521.38879)
Episode:12 steps:79
Episode:13 steps:40
Episode:14 steps:55
Episode:15 steps:20
Episode:16 steps:35
Episode:17 steps:71
Episode:18 steps:60
Episode:19 steps:77
Episode:20 steps:32
Episode:21 steps:37
('Loss:', 3.6882424)
Episode:22 steps:34
Episode:23 steps:47
Episode:24 steps:36
Episode:25 steps:56
Episode:26 steps:37
Episode:27 steps:45
Episode:28 steps:43
Episode:29 steps:50
Episode:30 steps:18
Episode:31 steps:37
('Loss:', 1.0413755)
Episode:32 steps:21
Episode:33 steps:20
Episode:34 steps:16
Episode:35 steps:15
Episode:36 steps:24
Episode:37 steps:18
Episode:38 steps:15
Episode:39 steps:19
Episode:40 steps:18
Episode:41 steps:21
('Loss:', 3.1494496)
Episode:42 steps:18
Episode:43 steps:17
Episode:44 steps:19
Episod