In [201]:
import tensorflow as tf
import gym
from collections import deque
import random
import numpy as np

In [209]:
# 環境構築
env = gym.make('MountainCarContinuous-v0')

state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

print( env.action_space )

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Box(1,)


In [203]:
# actor
tf.reset_default_graph()

# 観測
a_state = tf.placeholder(tf.float32, [None, state_dim])

# Criticから渡される勾配
a_action_grads = tf.placeholder(tf.float32, [None, act_dim])

# ネットワーク構築
with tf.variable_scope("actor"):
    fc1 = tf.contrib.layers.fully_connected(a_state, 400 )
    fc2 = tf.contrib.layers.fully_connected(fc1, 300)
    fc3 = tf.contrib.layers.fully_connected(fc2, act_dim, activation_fn=tf.tanh )
    a_output = tf.multiply(fc3, 1)

# 勾配を計算
a_params = [ v for v in tf.trainable_variables() if "actor" in v.name ]
a_grads = tf.gradients(a_output, a_params, -a_action_grads)
a_grads = list(map(lambda x: tf.div(x, 20),  a_grads))

# 最適化
a_train_step = tf.train.AdamOptimizer(0.0001).apply_gradients(zip(a_grads,  a_params))

In [204]:
# critic
c_state = tf.placeholder(tf.float32, [None, state_dim])
c_action = tf.placeholder(tf.float32, [None, act_dim])
c_target_q = tf.placeholder(tf.float32, [None, act_dim])

fc1 = tf.contrib.layers.fully_connected(c_state, 400)
w1 = tf.Variable(tf.random_normal([400, 300], mean=0.0, stddev=0.05))
w2 = tf.Variable(tf.random_normal([act_dim, 300], mean=0.0, stddev=0.05))
b2 = tf.Variable(tf.zeros([300]))

# 二層目でactionと結合
fc2 = tf.nn.relu(tf.matmul(fc1, w1) + tf.matmul(c_action, w2) + b2)
c_output = tf.contrib.layers.fully_connected(fc2, act_dim, activation_fn=None)

c_loss = tf.reduce_mean(tf.square(c_target_q - c_output))

#c_loss = tf.losses.huber_loss(c_target_q, c_output)
c_train_step = tf.train.AdamOptimizer(0.001).minimize(c_loss)
c_action_grads = tf.gradients(c_output, c_action)


In [205]:
# Experinece Rplay用のメモリ
memory = deque(maxlen=10000)

def sample( memory, batch_size ):
    samp = random.sample( memory, batch_size )
    
    states = []
    actions = []
    rewards = []
    next_states = []
    
    for s, a, r, ns in samp:
        states.append( s )
        actions.append( a )
        rewards.append( r )
        next_states.append( ns )
    return np.array(states), np.array(actions), np.array(rewards), np.array(next_states)

In [206]:
# メモリにランダムの経験を入れる
memory.clear()

env.reset()
action = env.action_space.sample()
state, reward, done, _ = env.step(action)
for i in range(1, 100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    memory.append((state, action, reward, next_state))
    state = next_state
    if done:
        break

In [221]:
with tf.Session() as sess:
    sess.run( tf.global_variables_initializer() )
    for ep in range(1000):
        total_reward = 0
        env.reset()
        action = env.action_space.sample()
        
        state, reward, done, _ = env.step(action)
        # 学習
        for it in range(1000):
            # メモリから20個取り出す
            states, actions, rewards, next_states = sample( memory, 20 )
                        
            # 次の状態next_stateで取るべき行動next_actionを予測
            next_actions = sess.run( a_output, feed_dict={a_state: next_states} )
            
            # 次の状態の価値を計算
            next_q = sess.run( c_output, feed_dict={c_state:next_states, c_action: next_actions} )
            
            # critic学習
            # 今の状態と行動のQ値を更新
            # rewards: 今の状態と行動で得られた報酬
            # next_q: 遷移先のQ値
            feed_dict = {
                c_state: states, 
                c_action: actions, 
                c_target_q: rewards.reshape(20,1) + 0.99 * next_q
            }
            sess.run( c_train_step , feed_dict=feed_dict)
            
            # Q値が最大となるようなactionの勾配を計算
            feed_dict = {
                c_state: states,
                c_action: actions
            }
            action_grads = sess.run( c_action_grads, feed_dict=feed_dict )[0]
            
            # actor学習
            feed_dict = {
                a_state: states,
                a_action_grads: action_grads                
            }
            sess.run( a_train_step, feed_dict=feed_dict )
                        
            # 実際に行動
            if ep%10==0:
                env.render()
            action = sess.run( a_output, feed_dict={a_state: state.reshape(-1,state_dim)} )[0]
            action += np.random.randn()/1.0
            
            next_state, reward, done, _ = env.step(action)
            memory.append( (state.flatten(), action, reward, next_state) )

            total_reward += reward
            state = next_state
            if done:
                break

        print(ep, total_reward)

0 -194.46498162010894
1 -193.1384991781803
2 -186.90858367258895
3 -200.31313177618057
4 -146.60679714588588
5 -173.30712492785548
6 -153.94330900944902
7 -165.26276282763777
8 -195.688743343262
9 -185.31600588199487
10 -184.0525121419867
11 -168.33288204571673
12 -165.98387419280706
13 -172.44140156953628
14 -158.526974933941
15 -163.82903145585664
16 -159.75742756215496
17 -159.4248071254868
18 -174.10655074269513
19 -179.43940193693587
20 -206.72110257906024
21 -188.82175516314595
22 -189.42101787512328
23 -190.654174641023
24 -42.082720069770815
25 -197.55965811241873
26 -197.58623741389653
27 -183.81051259904226
28 -182.88170021715464
29 -36.58178904886523
30 -193.55772272708066
31 -187.27502485442037
32 -197.71038215637702
33 -191.13178171848745
34 -195.09807122234412
35 -205.47758082517777
36 -198.76777750167471
37 -204.3350423261247
38 -182.1672896344073
39 -196.05602919848482
40 -211.37165623054426
41 -215.32935249552744
42 -213.2715205274847
43 -207.08421659709435
44 -191.480

KeyboardInterrupt: 