In [1]:
'''
This code is based on:
https://github.com/hunkim/DeepRL-Agents
CF https://github.com/golbin/TensorFlow-Tutorials
https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/dqn.py
'''
import numpy as np
import tensorflow as tf
import random
from collections import deque
import dqn

import gym
from gym import wrappers

# Environment shared by training in main() and, by default, by bot_play().
env = gym.make('CartPole-v0')

# Constants defining our neural network
# Length of one observation vector and number of discrete actions,
# both read from the environment created above.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the bootstrapped next-state value when the
# replay-training functions build their Q targets.
dis = 0.9
# Maximum number of transitions main() keeps in its replay buffer.
REPLAY_MEMORY = 50000


def replay_train(mainDQN, targetDQN, train_batch, discount=None):
    """Run one DQN training step on a minibatch of stored transitions.

    For every transition the current prediction of ``mainDQN`` is taken as the
    target vector and only the entry of the action actually taken is
    overwritten: with ``reward`` for terminal transitions, otherwise with
    ``reward + discount * max(targetDQN.predict(next_state))``.

    :param mainDQN: network being trained; must provide ``predict(state)``
        (returning an array indexable as ``Q[0, action]``), ``update(x, y)``
        and the ``input_size`` / ``output_size`` attributes
    :param targetDQN: network used to evaluate the next state
    :param train_batch: iterable of
        ``(state, action, reward, next_state, done)`` tuples
    :param discount: discount factor; ``None`` (the default) falls back to
        the module-level ``dis``
    :return: whatever ``mainDQN.update`` returns for the stacked batch
    """
    if discount is None:
        discount = dis

    states = []
    targets = []

    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)

        if done:
            # Terminal transition: there is no next-state value to bootstrap.
            Q[0, action] = reward
        else:
            # get target from target DQN (Q')
            Q[0, action] = reward + discount * np.max(
                targetDQN.predict(next_state))

        states.append(state)
        targets.append(Q)

    # Stack once at the end instead of re-copying the growing arrays on every
    # iteration. The leading empty (0, n) array keeps the result shape (and
    # float dtype promotion) well defined even for an empty batch.
    x_stack = np.vstack([np.empty((0, mainDQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, mainDQN.output_size))] + targets)

    # Train our network using target and predicted Q values
    return mainDQN.update(x_stack, y_stack)


def ddqn_replay_train(mainDQN, targetDQN, train_batch, discount=None):
    """Run one Double-DQN training step on a minibatch of transitions.

    For every transition the current prediction of ``mainDQN`` is taken as the
    target vector and only the entry of the action actually taken is
    overwritten. Terminal transitions use ``reward``; otherwise the target is
    ``reward + discount * targetDQN(s')[a*]`` with
    ``a* = argmax(mainDQN(s'))``, i.e. the main network selects the action
    and the target network evaluates it.

    :param mainDQN: main DQN; must provide ``predict(state)`` (returning an
        array indexable as ``Q[0, action]``), ``update(x, y)`` and the
        ``input_size`` / ``output_size`` attributes
    :param targetDQN: target DQN used to evaluate the selected action
    :param train_batch: minibatch for train, an iterable of
        ``(state, action, reward, next_state, done)`` tuples
    :param discount: discount factor; ``None`` (the default) falls back to
        the module-level ``dis``
    :return: whatever ``mainDQN.update`` returns for the stacked batch
    """
    if discount is None:
        discount = dis

    states = []
    targets = []

    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)

        if done:
            # Terminal transition: there is no next-state value to bootstrap.
            Q[0, action] = reward
        else:
            # Double DQN: y = r + gamma * targetDQN(s')[a] where
            # a = argmax(mainDQN(s'))
            next_q = targetDQN.predict(next_state)
            best_action = np.argmax(mainDQN.predict(next_state))
            Q[0, action] = reward + discount * next_q[0, best_action]

        states.append(state)
        targets.append(Q)

    # Stack once at the end instead of re-copying the growing arrays on every
    # iteration. The leading empty (0, n) array keeps the result shape (and
    # float dtype promotion) well defined even for an empty batch.
    x_stack = np.vstack([np.empty((0, mainDQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, mainDQN.output_size))] + targets)

    # Train our network using target and predicted Q values
    return mainDQN.update(x_stack, y_stack)


def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
    """Build the ops that copy trainable variables from one scope to another.

    The trainable variables collected for ``src_scope_name`` and
    ``dest_scope_name`` are paired by position, and one assign op is created
    per pair. Nothing is executed here; the caller runs the returned ops in a
    session.

    :param dest_scope_name: variable scope whose variables are overwritten
    :param src_scope_name: variable scope whose values are copied
    :return: list of assign ops, one per variable pair
    :raises ValueError: if the two scopes do not hold the same number of
        trainable variables (``zip`` would otherwise silently drop the
        surplus and leave the destination only partially updated)
    """
    src_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

    if len(src_vars) != len(dest_vars):
        raise ValueError(
            "Cannot copy variables: scope {!r} has {} trainable variables "
            "but scope {!r} has {}".format(
                src_scope_name, len(src_vars),
                dest_scope_name, len(dest_vars)))

    # Copy variables src_scope to dest_scope
    return [dest_var.assign(src_var.value())
            for src_var, dest_var in zip(src_vars, dest_vars)]


def bot_play(mainDQN, env=env):
    """Play one rendered episode greedily with ``mainDQN`` and print its score.

    On every step the action with the highest predicted Q value is taken; the
    rewards are summed until the environment reports the episode as done.

    :param mainDQN: network providing ``predict(state)``
    :param env: environment to play in; defaults to the module-level ``env``
    """
    observation = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward

    print("Total score: {}".format(total_reward))


def main():
    """Train a Double-DQN agent on the module-level ``env``, then replay it.

    Training collects transitions with an epsilon-greedy policy whose epsilon
    decays with the episode index, stores them in a bounded replay buffer and,
    on every tenth episode, runs a series of minibatch updates followed by a
    copy of the main network into the target network. Afterwards the greedy
    policy is played back through a ``wrappers.Monitor`` that writes its
    results to ``gym-results``.
    """
    max_episodes = 5000
    max_steps = 10000       # per-episode step cap
    train_interval = 10     # train on every 10th episode
    train_iterations = 50   # minibatch updates per training round
    batch_size = 10
    eval_episodes = 200

    # store the previous observations in replay memory; with maxlen set the
    # deque drops the oldest transition itself once it is full
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate: 1.0 for the first episode, then decaying.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # Penalty
                    # NOTE(review): this penalises every terminal transition;
                    # if the environment can also end an episode for reasons
                    # other than failure (e.g. a step limit), those are
                    # penalised too - confirm this is intended.
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps:  # Good enough. Let's move on
                    break

            print("Episode: {}  steps: {}".format(episode, step_count))

            # random.sample raises ValueError when asked for more items than
            # the buffer holds, so skip training until a full batch exists.
            if (episode % train_interval == 1
                    and len(replay_buffer) >= batch_size):
                # Get a random batch of experiences.
                for _ in range(train_iterations):
                    minibatch = random.sample(replay_buffer, batch_size)
                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                # copy q_net -> target_net
                sess.run(copy_ops)

        # See our trained bot in action
        env2 = wrappers.Monitor(env, 'gym-results', force=True)
        try:
            for _ in range(eval_episodes):
                bot_play(mainDQN, env=env2)
        finally:
            # Close the monitor even if playback fails part-way through.
            env2.close()

# Start training and playback only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

[2017-03-30 15:55:24,918] Making new env: CartPole-v0


Episode: 0  steps: 12
Episode: 1  steps: 12
Loss:  1029.95
Episode: 2  steps: 29
Episode: 3  steps: 53
Episode: 4  steps: 67
Episode: 5  steps: 73
Episode: 6  steps: 16
Episode: 7  steps: 95
Episode: 8  steps: 66
Episode: 9  steps: 51
Episode: 10  steps: 32
Episode: 11  steps: 72
Loss:  4.86098
Episode: 12  steps: 86
Episode: 13  steps: 61
Episode: 14  steps: 60
Episode: 15  steps: 94
Episode: 16  steps: 83
Episode: 17  steps: 47
Episode: 18  steps: 43
Episode: 19  steps: 166
Episode: 20  steps: 74
Episode: 21  steps: 55
Loss:  0.803805
Episode: 22  steps: 76
Episode: 23  steps: 46
Episode: 24  steps: 93
Episode: 25  steps: 14
Episode: 26  steps: 66
Episode: 27  steps: 52
Episode: 28  steps: 73
Episode: 29  steps: 72
Episode: 30  steps: 64
Episode: 31  steps: 49
Loss:  1.82903
Episode: 32  steps: 9
Episode: 33  steps: 9
Episode: 34  steps: 14
Episode: 35  steps: 9
Episode: 36  steps: 9
Episode: 37  steps: 13
Episode: 38  steps: 10
Episode: 39  steps: 9
Episode: 40  steps: 9
Episode: 41

[2017-03-30 16:00:43,128] Clearing 14 monitor files from previous run (because force=True was provided)
[2017-03-30 16:00:43,132] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000000.mp4


Episode: 4999  steps: 75


[2017-03-30 16:00:46,786] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000001.mp4


Total score: 66.0
Total score: 108.0
Total score: 77.0
Total score: 84.0
Total score: 65.0
Total score: 78.0
Total score: 63.0


[2017-03-30 16:00:57,222] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000008.mp4


Total score: 66.0
Total score: 113.0
Total score: 93.0
Total score: 103.0
Total score: 97.0
Total score: 99.0
Total score: 92.0
Total score: 82.0
Total score: 135.0
Total score: 76.0
Total score: 118.0
Total score: 135.0
Total score: 119.0
Total score: 65.0
Total score: 84.0
Total score: 100.0
Total score: 90.0
Total score: 105.0


[2017-03-30 16:01:02,544] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000027.mp4


Total score: 103.0
Total score: 81.0
Total score: 66.0
Total score: 113.0
Total score: 89.0
Total score: 65.0
Total score: 130.0
Total score: 79.0
Total score: 59.0
Total score: 106.0
Total score: 102.0
Total score: 132.0
Total score: 97.0
Total score: 83.0
Total score: 72.0
Total score: 94.0
Total score: 85.0
Total score: 70.0
Total score: 72.0
Total score: 106.0
Total score: 66.0
Total score: 100.0
Total score: 99.0
Total score: 83.0
Total score: 139.0
Total score: 72.0
Total score: 54.0
Total score: 86.0
Total score: 72.0
Total score: 129.0
Total score: 110.0
Total score: 100.0
Total score: 91.0
Total score: 110.0
Total score: 163.0
Total score: 87.0
Total score: 108.0
Total score: 89.0


[2017-03-30 16:01:09,602] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000064.mp4


Total score: 90.0
Total score: 79.0
Total score: 62.0
Total score: 79.0
Total score: 96.0
Total score: 140.0
Total score: 73.0
Total score: 80.0
Total score: 115.0
Total score: 109.0
Total score: 95.0
Total score: 162.0
Total score: 66.0
Total score: 119.0
Total score: 105.0
Total score: 77.0
Total score: 87.0
Total score: 92.0
Total score: 77.0
Total score: 91.0
Total score: 86.0
Total score: 102.0
Total score: 136.0
Total score: 98.0
Total score: 82.0
Total score: 112.0
Total score: 108.0
Total score: 122.0
Total score: 64.0
Total score: 148.0
Total score: 140.0
Total score: 82.0
Total score: 66.0
Total score: 84.0
Total score: 103.0
Total score: 64.0
Total score: 78.0
Total score: 79.0
Total score: 94.0
Total score: 87.0
Total score: 104.0
Total score: 83.0
Total score: 120.0
Total score: 83.0
Total score: 103.0
Total score: 84.0
Total score: 80.0
Total score: 87.0
Total score: 96.0
Total score: 118.0
Total score: 114.0
Total score: 107.0
Total score: 95.0
Total score: 82.0
Total sc

[2017-03-30 16:01:20,634] Starting new video recorder writing to /Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results/openaigym.video.0.5251.video000125.mp4


Total score: 108.0
Total score: 73.0
Total score: 97.0
Total score: 70.0
Total score: 99.0
Total score: 81.0
Total score: 89.0
Total score: 94.0
Total score: 136.0
Total score: 122.0
Total score: 65.0
Total score: 94.0
Total score: 66.0
Total score: 134.0
Total score: 71.0
Total score: 102.0
Total score: 95.0
Total score: 85.0
Total score: 111.0
Total score: 109.0
Total score: 74.0
Total score: 90.0
Total score: 105.0
Total score: 119.0
Total score: 94.0
Total score: 67.0
Total score: 90.0
Total score: 82.0
Total score: 89.0
Total score: 75.0
Total score: 99.0
Total score: 91.0
Total score: 117.0
Total score: 85.0
Total score: 75.0
Total score: 120.0
Total score: 62.0
Total score: 66.0
Total score: 83.0
Total score: 68.0
Total score: 83.0
Total score: 73.0
Total score: 124.0
Total score: 106.0
Total score: 107.0
Total score: 82.0
Total score: 78.0
Total score: 82.0
Total score: 122.0
Total score: 92.0
Total score: 79.0
Total score: 154.0
Total score: 85.0
Total score: 76.0
Total score:

[2017-03-30 16:01:34,027] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/joikyeon/Dropbox/Study/Modulabs/RL_LAB/Lecture7/gym-results')


Total score: 84.0
Total score: 52.0
Total score: 76.0
