In [1]:
import numpy as np
import tensorflow as tf
import gym
import random
import keras
import os
import time
from collections import deque
from keras.layers import Dense,Flatten,Dropout,Input
from keras.models import Sequential,Model

Using TensorFlow backend.


In [2]:
#env=gym.make('CartPole-v0')
#env.reset()
#observation,_,_,_=env.step(0)
#print(observation)

In [16]:
#parameters
ENV_NAME='CartPole-v0'#これはstateを変数そのもので表現する感じ
MAX_EPISODES=100
RANDOM_RESETS=10
MAX_STEPS=1000
RENDER=False
INITIAL_EPSILON=1.0
FINAL_EPSILON=0.1
SAVE_PATH='saved_networks/'+ENV_NAME
LOAD=False
LEARNING_RATE=1.0e-4
MOMENTUM=0.95
MIN_GRAD=0.1
INITIALIZE_REPLAY_SIZE=80#学習が始まるまでにメモリにためておく
MEMORY_SIZE=1000
TARGET_NETWORK_UPDATES=1000#main&targetの共有重みの更新の頻度
SAVE_INTERVAL=10
BATCH_SIZE=32
TRAIN=True
MAX_EPSIODES_TEST=5
GAMMA=0.9


"""画像を入力データとするとき"""
FRAME_WIDTH=64
FRAME_HEIGHT=64
STATE_LENGTH=4#一回の学習に入れる画像の枚数

In [25]:
#agent class
class Agent():
    
    def __init__(self,num_actions,num_states):
        self.num_actions=num_actions
        self.num_states=num_states
        self.t=0#現在のステップ数
        self.epsilon=INITIAL_EPSILON
        self.epsilon_step=(INITIAL_EPSILON-FINAL_EPSILON)/MAX_STEPS
        
        #parameters
        self.total_q_max=0
        self.total_reward=0
        self.total_loss=0
        self.episode=0
        self.duration=0
        
        self.start=0
        
        #memory
        self.replay_memory=deque()#端っこから情報を捨てれる
        
        #Q_netwrokの作成
        self.s,self.q_values,q_network=self.build_network()#main
        q_network_weights=q_network.trainable_weights
        
        self.st,self.target_q_values,target_q_network=self.build_network()#target 最適化における教師信号
        target_q_network_weights=target_q_network.trainable_weights
        
        #mainとtargetのnetworkの重みを共有
        self.update_target_network=[target_q_network_weights[i].assign(q_network_weights[i]) for i in range(len(target_q_network_weights))]
        
        
        #最適化
        self.a,self.y,self.loss,self.optimizer=self.train_op(q_network_weights)#重みを更新
        
        #tensorflowの実行インスタンス
        self.sess=tf.InteractiveSession()
        
        self.saver=tf.train.Saver()
        
        #保存するdirectoryの作成
        if not os.path.exists(SAVE_PATH):
            os.mkdir(SAVE_PATH)
            
        
        #全てのパラメータの初期化
        self.sess.run(tf.global_variables_initializer())
        
        #学習済みのモデルを用いるときは
        if LOAD:
            self.load_network()
            
        #共有重みの更新
        self.sess.run(self.update_target_network)
        
        
    def build_network(self):
        s=tf.placeholder(tf.float32,[None,self.num_states])#batch_sizeを入れるテンソルを残しておく
        
        model=Sequential()
        model.add(Dense(16,activation='relu',input_dim=self.num_states))
        model.add(Dense(16,activation='relu'))
        model.add(Dense(self.num_actions,activation='linear'))
        
        q_values=model(s)
        
        return s,q_values,model
    
    
    def train_op(self,q_network_weights):
        a=tf.placeholder(tf.int64,[None])
        y=tf.placeholder(tf.float32,[None])
        
        #行動をベクトル化
        a_onehot=tf.one_hot(a,self.num_actions,1.0,0.0)
        q_value=tf.reduce_sum(tf.multiply(self.q_values,a_onehot),reduction_indices=1)
        
        #損失
        error=tf.abs(y-q_value)
        quardratic_part=tf.clip_by_value(error,0.0,1.0)#errorを1.0~0.0に
        linear_part=error-quardratic_part
        loss=tf.reduce_mean(0.5*tf.square(quardratic_part)+linear_part)
        
        #最適手法
        
        """勾配が消失しないようにepsilon入れておく"""
        optimizer=tf.train.RMSPropOptimizer(LEARNING_RATE,momentum=MOMENTUM,epsilon=MIN_GRAD)
        optimizer=optimizer.minimize(loss,var_list=q_network_weights)
        
        """prioritized experience replayならばerrorをここで返す"""
        return a,y,loss,optimizer
        
    
    #ε-greedyで行動選択
    def get_action(self,state):
        """画像データなど一気に入れるときにはstepの中でその数だけ同じ行動をとるようにする"""
        if np.random.rand()<self.epsilon:
            action=random.randrange(self.num_actions)
            
        else:
            action=np.argmax(self.q_values.eval(feed_dict={self.s:[np.float32(state)]}))
            
        if self.epsilon>=FINAL_EPSILON and self.t>=INITIALIZE_REPLAY_SIZE:
            self.epsilon-=self.epsilon_step
            
        return action
    
    def run(self,state,action,reward,next_state,terminal):
        
        #報酬
        reward=np.sign(reward)#符号だけ
        
        #memoryに保存
        self.replay_memory.append((state,action,reward,next_state,terminal))
        
        #memoryのサイズがいっぱいになったら捨てる
        if len(self.replay_memory)> MEMORY_SIZE:
            self.replay_memory.popleft()#deque()を使わないならばremove()関数を作るなどする
        
        """最適化の開始"""
        if self.t>=INITIALIZE_REPLAY_SIZE:
            self.replay()
            
            #共有パラメータの更新
            if self.t%TARGET_NETWORK_UPDATES==0:
                self.sess.run(self.update_target_network)
            #重みの保存    
            if self.t%SAVE_INTERVAL==0:
                save_path=self.saver.save(self.sess,SAVE_PATH+'/'+ENV_NAME,global_step=(self.t))
                
        self.total_reward+=reward
        self.total_q_max+=np.max(self.q_values.eval(feed_dict={self.s:[np.float32(state)]}))
        self.duration+=1
        
        #学習の途中で終了したときその時点での結果を出力し、合計パラメータをリセット
        if terminal:
            elapsed_time=time.time()-self.start
            #どの段階で終了したかを確認
            if self.t<INITIALIZE_REPLAY_SIZE:
                mode='random'
            elif INITIALIZE_REPLAY_SIZE<=self.t<INITIALIZE_REPLAY_SIZE+MAX_STEPS:
                mode='exploration'
            else:
                mode='exploit'
            
            print('episode:{0}/time_step:{1}/duration:{2}/total_reward:{3}/avg_loss:{4}/mode:{5}'
                 .format(self.episode+1,self.t,self.duration,self.total_reward,self.total_loss/float(self.duration),mode))
            
            
            self.total_reward=0
            self.total_q_max=0
            self.total_loss=0
            self.episode+=1
        
        
        
        self.t+=1
        
        return next_state
    
    """minibatch学習"""
    def replay(self):
        state_batch=[]
        action_batch=[]
        reward_batch=[]
        next_state_batch=[]
        terminal_batch=[]
        
        
        #memoryからサンプリングしてくる
        minibatch=random.sample(self.replay_memory,BATCH_SIZE)
        
        
        #ミニバッチの作成
        for data in minibatch:
            state_batch.append(data[0])
            action_batch.append(data[1])
            reward_batch.append(data[2])
            next_state_batch.append(data[3])
            terminal_batch.append(data[4])
            
        

        terminal_batch=np.array(terminal_batch)+0#termonal_batchの数値化
        
        
        target_q_values_batch=self.target_q_values.eval(feed_dict={self.st:np.float32(np.array(next_state_batch))})
        
        
        #DDQNではここでactionを(s_t+1)から推定してtarget_q_values_batchを変化させる
        #actions=np.argmax(self.q_values.eval(feed_dict={self.s:np.float32(np.array(next_state_batch))}))
        #targte_q_values_batch=[target_q_values[i][action] for i,action in enumerate(actions)]
        
        y_batch=reward_batch+(1-terminal_batch)*GAMMA*np.max(target_q_values_batch,axis=1)#教師バッチ
        
        #学習
        loss,_=self.sess.run([self.loss,self.optimizer],feed_dict={self.s:np.float32(np.array(state_batch)),
                                                                  self.a:action_batch,self.y:y_batch})
        
        self.total_loss+=loss
        
        
    """学習済みのネットワークを用いる時"""
    def load_network():
        checkpoint=tf.train.get_checkpoint_state(SAVE_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess,checkpoint.model_checkpoint_path)
            print('successfully loaded')
        else:
            print('please training')
            
            
    def get_action_at_test(self,state):
        if np.random.rand()<0.05:
            action=random.randrange(self.num_actions)
        else:
            action=np.argmax(self.q_values.eval(feed_dict={self.s:[np.float32(state)]}))
            
            
        self.t+=1
        return action
        
        
        
            
        
        
        
        
        
        

In [26]:
#main
env=gym.make(ENV_NAME)
NUM_ACTIONS=env.action_space.n
NUM_STATES=env.observation_space.shape[0]
agent=Agent(NUM_ACTIONS,NUM_STATES)#agentのインスタンス

"""observationが画像であるならば計算コストを考えてresizeする"""
def preprocessing(last_observation,observation):
    img=np.maximum(last_observation,observation)
    img=cv2.cvtColor(img,cv2.COLOR_RGB2GRAY)
    img=cv2.resize(img,(FRAME_WIDTH,FRAME_HEIGHT))
    img=np.uint8(img)#8 bit
    
    return np.reshape(img,(1,FRAME_WIDTH,FRAME_HEIGHT))#テンソル次元を増やす
    
#学習のスタート
if TRAIN:
    for eps in range(MAX_EPISODES):
        terminal=False#学習の途中で終了したかの判定
        observation=env.reset()#ゲームの初期化

        #ここで初期値をランダムにするために(何もしない)を何回か行う
        for _ in range(np.random.randint(0,RANDOM_RESETS)):
            observation,reward,terminal,_=env.step(0)#infoはいらないので削除　no actionのindexは0

        state=observation#初期状態(変数)
        agent.start=time.time()#学習時間の計測
        while not terminal:
            action=agent.get_action(state)#行動のインデックスを返す
            observation,reward,terminal,_=env.step(action)

            #描画
            if RENDER:
                env.render()

            state=agent.run(state,action,reward,observation,terminal)#学習の実行 状態の更新 
            
            
else:
    for _ in range(MAX_EPISODES_TEST):
        terminal=False
        observation=env.reset()
        
        for _ in range(np.random.randint(0,RANDOM_RESETS)):
            observation,_,terminal,_=env.step(0)#何も行動しない
            
        state=observation
        while not terminal:
            action=agent.get_action_at_test(state)
            observation,_,terminal,_=env.step(action)
            env.render()
            state=observation
        

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




episode:1/time_step:15/duration:16/total_reward:16.0/avg_loss:0.0/mode:random
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
episode:2/time_step:17/duration:18/total_reward:2.0/avg_loss:0.0/mode:random
episode:3/time_step:20/duration:21/total_reward:3.0/avg_loss:0.0/mode:random
episode:4/time_step:39/duration:40/total_reward:19.0/avg_loss:0.0/mode:random
episode:5/time_step:43/duration:44/total_reward:4.0/avg_loss:0.0/mode:random
episode:6/time_step:45/duration:46/total_reward:2.0/avg_loss:0.0/mode:random
episode:7/time_step:54/duration:55/total_reward:9.0/avg_loss:0.0/mode:random
episode:8/time_step:67/duration:68/total_reward:13.0/avg_loss:0.0/mode:random
episode:9/time_step:70/duration:71/total_reward:3.0/avg_loss:0.0/mode:random
episode:10/time_step:90/duration:91/total_reward:20.0/avg_loss:0.07575585619433896/mode:expl

KeyboardInterrupt: 