In [29]:
import gym
import tensorflow as tf
import numpy as np
from collections import deque
import random

In [4]:
env = gym.make('CartPole-v0')
action_size = 2
state_size = 4

In [67]:
# ネットワーク構築
tf.reset_default_graph()

# 状態からQ値を予測するネットワーク
inputs_ = tf.placeholder(tf.float32,[None, state_size] )
fc1 = tf.contrib.layers.fully_connected(inputs_, 64)
fc2 = tf.contrib.layers.fully_connected(fc1, 64)
output = tf.contrib.layers.fully_connected(fc2, action_size, activation_fn=None)

# 実際に行った行動と教師となるQ値から学習するネットワーク
actions_ = tf.placeholder(tf.int32,[None] )
targetQs_ = tf.placeholder(tf.float32, [None] )
one_hot_actions = tf.one_hot(actions_, action_size)
Q = tf.reduce_sum( tf.multiply(output, one_hot_actions), axis=1)
loss = tf.reduce_mean( tf.square(targetQs_ - Q) )
opt = tf.train.AdamOptimizer(0.0001).minimize(loss)

In [68]:
# Experinece Rplay用のメモリ
memory = deque(maxlen=10000)

def sample( memory, batch_size ):
    samp = random.sample( memory, batch_size )
    
    states = []
    actions = []
    rewards = []
    next_states = []
    
    for s, a, r, ns in samp:
        states.append( s )
        actions.append( a )
        rewards.append( r )
        next_states.append( ns )
    return np.array(states), np.array(actions), np.array(rewards), np.array(next_states)

In [94]:
# メモリにランダムの経験を入れる
memory.clear()

env.reset()
action = env.action_space.sample()
state, reward, done, _ = env.step(action)
for i in range(1, 20):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    
    if done:
        # 失敗(done)したら次のstateに(0, 0)を入れる
        next_state = np.zeros( state_size )
        memory.append((state, action, reward, next_state))
        env.reset()
    else:
        memory.append((state, action, reward, next_state))
        state = next_state        

In [95]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    step = 0
    for ep in range(0, 1000):
        total_rewards = 0

        for it in range(200):
            # ε-greedy
            step += 1
            explore_p = 0.01 + (1.0 - 0.01) * np.exp(-0.0001*step)

            if explore_p > np.random.rand():
                # 一定の確率でランダムに行動
                action = env.action_space.sample()
            else:
                # Q値が最大の行動を選択
                Qs = sess.run(output, feed_dict={inputs_: state.reshape(-1,state_size)})
                action = np.argmax(Qs)
            
            if ep%100==0:
                env.render()
   
            # 実際に行動
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
        
            if done:
                # 失敗したらnext_stateに(0, 0)を入れる
                next_state = np.zeros( state_size )
                
                # メモリに追加
                memory.append((state, action, reward, next_state))
            else:
                # メモリに追加
                memory.append((state, action, reward, next_state))
                state = next_state
            
            # ネットワークの学習
            # メモリから20個取り出す
            states, actions, rewards, next_states = sample(memory, 20)

            # 各状態のQ値を予測
            next_Qs = sess.run(output, feed_dict={inputs_: next_states})
            
            # 次の状態が終了(0, 0)であれば，その状態のQ値を0にする
            episode_ends = (next_states == np.zeros(state[0].shape)).all(axis=1)
            next_Qs[episode_ends] = (0, 0)
            
            # Q値がtargetsになるようネットワークを学習
            targets = rewards + 0.99 * np.max(next_Qs, axis=1)
            sess.run([loss, opt], feed_dict={inputs_: states, targetQs_: targets, actions_: actions})
            
            if done:
                # 失敗したら環境をリセットして再開
                env.reset()
                action = env.action_space.sample()
                state, reward, done, _ = env.step(action)
                break
        if ep%10==0:
            print( "epoch:", ep, "  rewards:", total_rewards )

epoch: 0   rewards: 2.0
epoch: 10   rewards: 20.0
epoch: 20   rewards: 50.0
epoch: 30   rewards: 14.0
epoch: 40   rewards: 19.0
epoch: 50   rewards: 11.0
epoch: 60   rewards: 10.0
epoch: 70   rewards: 16.0
epoch: 80   rewards: 14.0
epoch: 90   rewards: 14.0
epoch: 100   rewards: 10.0
epoch: 110   rewards: 10.0
epoch: 120   rewards: 19.0
epoch: 130   rewards: 10.0
epoch: 140   rewards: 30.0
epoch: 150   rewards: 28.0
epoch: 160   rewards: 9.0
epoch: 170   rewards: 19.0
epoch: 180   rewards: 11.0
epoch: 190   rewards: 11.0
epoch: 200   rewards: 9.0
epoch: 210   rewards: 14.0
epoch: 220   rewards: 10.0
epoch: 230   rewards: 24.0
epoch: 240   rewards: 26.0
epoch: 250   rewards: 10.0
epoch: 260   rewards: 31.0
epoch: 270   rewards: 13.0
epoch: 280   rewards: 12.0
epoch: 290   rewards: 22.0
epoch: 300   rewards: 14.0
epoch: 310   rewards: 23.0
epoch: 320   rewards: 11.0
epoch: 330   rewards: 16.0
epoch: 340   rewards: 29.0
epoch: 350   rewards: 23.0
epoch: 360   rewards: 16.0
epoch: 370   re

KeyboardInterrupt: 