# 第11回講義 宿題

## 課題. Deep Q-Network（DQN）でMountainCarを攻略せよ

## Homework

Deep Q-Network（DQN）により、MountainCarを攻略してみましょう。  
今回の評価は提出点のみとなりますが、より上手にゲームを攻略できるようチャレンジしてみてください。

## ルール

環境としてMountainCar-v0を利用します。  
MountainCarは二つの山の間にある車を右の山の頂上まで運ぶゲームです。  
エピソード終了時の累積Reward（1エピソード分の報酬の合計）が-200よりも大きくなれば、成功となります。  
- state: サイズ(2,)のnp.ndarray
  - (車の位置, 車の速度)
- action:
    - 0: 車を左に移動させる
    - 1: 車を移動させない
    - 2: 車を右に移動させる
- reward:
    - -1: エピソードが終了するまで毎ステップ与えられる
- terminal:
    - False: エピソード継続
    - True: エピソード終了 (ゴールするか、200step経過)

## 評価について

- MountainCarをDQNによって攻略するコードを提出してください。(提出ページにそのままペーストして頂いて構いません)
- なお今回の評価は提出点のみとなります。

## サンプルコード

In [7]:
import os

import gym
import numpy as np
import tensorflow as tf

#追加
# import matplotlib
# import matplotlib.animation as animation
# import matplotlib.pyplot as plt
# !apt-get install -y xvfb python-opengl > /dev/null 2>&1
# !pip install gym pyvirtualdisplay > /dev/null 2>&1
# !pip install JSAnimation
# from pyvirtualdisplay import Display
# pydisplay = Display(visible=0, size=(400, 300))
# pydisplay.start()
# from IPython import display
# # 結果の確認
# from JSAnimation.IPython_display import display_animation
# from IPython.display import HTML
# def animate(i):
#     patch.set_data(frames[i])

from collections import deque

# Environment used for both experience collection and training.
env = gym.make('MountainCar-v0')

# Start from an empty default graph so re-running this cell does not
# accumulate duplicate ops/variables.
tf.reset_default_graph()

# Observation is (position, velocity); actions are push-left / no-op / push-right.
n_states = 2
n_actions = 3

# Kernel initializer shared by every dense layer of both networks.
initializer = tf.variance_scaling_initializer()

# Batch of observations, shape (batch, n_states); fed to both networks.
x_state = tf.placeholder(tf.float32, [None, n_states])

def original_network(x):
    """Build the online Q-network under the 'Original' variable scope.

    The network is three 16-unit ELU dense layers followed by a linear
    dense layer with ``n_actions`` outputs.

    Args:
        x: Input tensor passed to the first dense layer.

    Returns:
        Output tensor of the final dense layer (one value per action).
    """
    hidden_widths = (16, 16, 16)
    with tf.variable_scope('Original', reuse=tf.AUTO_REUSE):
        net = x
        # Layers are created in order so variable naming/ordering is stable.
        for width in hidden_widths:
            net = tf.layers.Dense(units=width, activation=tf.nn.elu, kernel_initializer=initializer)(net)
        q_out = tf.layers.Dense(units=n_actions, kernel_initializer=initializer)(net)
    return q_out

def target_network(x):
    """Build the target Q-network under the 'Target' variable scope.

    Same layout as the online network: three 16-unit ELU dense layers
    followed by a linear dense layer with ``n_actions`` outputs.

    Args:
        x: Input tensor passed to the first dense layer.

    Returns:
        Output tensor of the final dense layer (one value per action).
    """
    hidden_widths = (16, 16, 16)
    with tf.variable_scope('Target', reuse=tf.AUTO_REUSE):
        net = x
        # Layers are created in order so variable naming/ordering is stable.
        for width in hidden_widths:
            net = tf.layers.Dense(units=width, activation=tf.nn.elu, kernel_initializer=initializer)(net)
        q_out = tf.layers.Dense(units=n_actions, kernel_initializer=initializer)(net)
    return q_out

# Online network output and the trainable variables created under its scope.
q_original = original_network(x_state)
vars_original = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Original')

# Target network output and the trainable variables created under its scope.
q_target = target_network(x_state)
vars_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target')

# One assign op per variable pair, matched by position in the two collections,
# grouped into a single op that overwrites the target weights with the online ones.
# NOTE(review): relies on both collections listing variables in the same layer order.
copy_ops = [var_target.assign(var_original) for var_target, var_original in zip(vars_target, vars_original)]
copy_weights = tf.group(*copy_ops)

# t: regression targets, one per sample; x_action: index of the action taken.
t = tf.placeholder(tf.float32, [None])
x_action = tf.placeholder(tf.int32, [None])
# Select the online Q-value of the taken action via a one-hot mask.
q_value = tf.reduce_sum(q_original * tf.one_hot(x_action, n_actions), axis=1)

# Mean squared error between target and predicted Q-value, minimized with
# Adam at its default settings.
cost = tf.reduce_mean(tf.square(tf.subtract(t,q_value)))
optimizer = tf.train.AdamOptimizer()
train_ops = optimizer.minimize(cost)

class ReplayMemory:
    """Bounded FIFO store of transition dicts with uniform random sampling.

    Each stored transition is a dict with the keys 'state', 'next_state',
    'reward', 'action' and 'terminal'. Once ``memory_size`` entries are
    held, appending drops the oldest one.
    """

    def __init__(self, memory_size):
        """Create an empty buffer that keeps at most ``memory_size`` items."""
        self.memory_size = memory_size
        self.memory = deque([], maxlen=memory_size)

    def append(self, transition):
        """Store one transition dict at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly, with replacement.

        Returns:
            Dict mapping each transition key to an ``np.ndarray`` whose
            first axis indexes the sampled transitions.
        """
        picks = np.random.randint(0, len(self.memory), size=batch_size).tolist()
        chosen = [self.memory[pick] for pick in picks]

        field_names = ('state', 'next_state', 'reward', 'action', 'terminal')
        return {
            name: np.array([item[name] for item in chosen])
            for name in field_names
        }

memory_size = 50000 # capacity of the replay memory
initial_memory_size = 500 # number of experiences to collect before training

replay_memory = ReplayMemory(memory_size)

step = 0

# Pre-fill the replay memory with random-policy experience. The size check
# runs only between episodes, so the last episode is always completed.
while True:
    state = env.reset()
    terminal = False
    
    while not terminal:
        action = env.action_space.sample() # choose an action at random
        
        next_state, reward, terminal, _ = env.step(action) # get next state, reward and done flag
        
        transition = {
            'state': state,
            'next_state': next_state,
            'reward': reward,
            'action': action,
            'terminal': int(terminal)
        }
        replay_memory.append(transition) # store the experience

        state = next_state
        
        step += 1
    
    if step >= initial_memory_size:
        break
        
eps_start = 1.0  # exploration rate at step 0
eps_end = 0.01  # final exploration rate (the template default was 0.1)
n_steps = 10000  # number of steps over which epsilon is annealed


def get_eps(step):
    """Return the epsilon-greedy exploration rate for a training step.

    Epsilon is annealed linearly from ``eps_start`` at step 0 to
    ``eps_end`` at step ``n_steps`` and held at ``eps_end`` afterwards.

    Args:
        step: Number of training steps taken so far.

    Returns:
        The exploration probability to use at ``step``.
    """
    # Floor at eps_end (not a literal) so changing eps_end also moves the floor.
    return max(eps_end, (eps_end - eps_start) / n_steps * step + eps_start)
  
gamma = 0.95 # discount factor (the template default was 0.99)
target_update_interval = 1000 # steps between target-network weight copies
batch_size = 32
n_episodes = 300
step = 0 # global training step counter; drives epsilon decay and target updates
init = tf.global_variables_initializer()

# Train the online network with epsilon-greedy exploration and experience
# replay, then run one greedy test episode with the trained network.
with tf.Session() as sess:
    init.run()
    copy_weights.run() # copy the initial weights into the target network
    for episode in range(n_episodes):
        state = env.reset()
        terminal = False

        total_reward = 0
        total_q_max = []
        while not terminal:
            q = q_original.eval(feed_dict={x_state: state[None]}) # compute Q-values
            total_q_max.append(np.max(q))

            eps = get_eps(step) # update epsilon
            if np.random.random() < eps:
                action = env.action_space.sample() # choose an action at random
            else:
                action = np.argmax(q) # choose the greedy action
            next_state, reward, terminal, _ = env.step(action) # get next state, reward and done flag
            reward = np.sign(reward) # keep only the sign of the reward
            total_reward += reward # accumulate the episode reward

            transition = {
                'state': state,
                'next_state': next_state,
                'reward': reward,
                'action': action,
                'terminal': int(terminal)
            }
            replay_memory.append(transition) # store the experience
            
            batch = replay_memory.sample(batch_size) # sample a batch of experiences
            q_target_next = q_target.eval(feed_dict={x_state: batch['next_state']}) # target-network Q-values for the next states
            # Bootstrapped target; the future term is zeroed for terminal transitions.
            t_value = batch['reward'] + (1 - batch['terminal']) * gamma * q_target_next.max(1)
            
            train_ops.run(feed_dict = {x_state: batch['state'], x_action: batch['action'], t: t_value}) # run one training step

            state = next_state

            if (step + 1) % target_update_interval == 0:
                copy_weights.run() # periodically copy the online weights to the target network

            step += 1

        if (episode + 1) % 10 == 0:
            print('Episode: {}, Reward: {}, Q_max: {:.4f}, eps: {:.4f}'.format(episode + 1, total_reward, np.mean(total_q_max), eps))
    
    # Test the trained network with a purely greedy policy
#     frames = []
    state = env.reset()
    terminal = False

    total_reward = 0
    while not terminal:
#         img = env.render(mode="rgb_array")
#         frames.append(img)

        q = q_original.eval(feed_dict={x_state: state[None]})
        action = np.argmax(q)

        next_state, reward, terminal, _ = env.step(action)
        total_reward += reward # raw environment reward (np.sign is not applied here)

        state = next_state
    
    print('Test Reward:', total_reward)
    
# plt.figure(figsize=(frames[0].shape[1]/72.0, frames[0].shape[0]/72.0), dpi=72)
# patch = plt.imshow(frames[0])
# plt.axis('off')
    
# anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
# HTML(anim.to_jshtml())


# Release the environment's resources once training and testing are done.
env.close()

Episode: 10, Reward: -200.0, Q_max: -1.9052, eps: 0.8021
Episode: 20, Reward: -200.0, Q_max: -3.6088, eps: 0.6041
Episode: 30, Reward: -200.0, Q_max: -5.0134, eps: 0.4061
Episode: 40, Reward: -200.0, Q_max: -6.2918, eps: 0.2081
Episode: 50, Reward: -200.0, Q_max: -7.5130, eps: 0.0126
Episode: 60, Reward: -200.0, Q_max: -8.6419, eps: 0.0100
Episode: 70, Reward: -140.0, Q_max: -9.1042, eps: 0.0100
Episode: 80, Reward: -200.0, Q_max: -10.1651, eps: 0.0100
Episode: 90, Reward: -200.0, Q_max: -10.9732, eps: 0.0100
Episode: 100, Reward: -200.0, Q_max: -11.7054, eps: 0.0100
Episode: 110, Reward: -200.0, Q_max: -12.2296, eps: 0.0100
Episode: 120, Reward: -200.0, Q_max: -13.1268, eps: 0.0100
Episode: 130, Reward: -200.0, Q_max: -13.4971, eps: 0.0100
Episode: 140, Reward: -200.0, Q_max: -14.1151, eps: 0.0100
Episode: 150, Reward: -200.0, Q_max: -14.0480, eps: 0.0100
Episode: 160, Reward: -200.0, Q_max: -14.9404, eps: 0.0100
Episode: 170, Reward: -200.0, Q_max: -15.0752, eps: 0.0100
Episode: 180,