In [1]:
#Actor_critic
import tensorflow as tf
import numpy as np
import os
import gym

#環境はcar_envで書く
import pyglet

"""やり直す時には全てのグラフを消す必要あり"""

In [1]:
#Actor_critic
import tensorflow as tf
import numpy as np
import os
import gym

#環境はcar_envで書く
import pyglet


#class Environment
class CarEnv(object):
    n_sensor=5#センサーの数　距離などを測るためのもの
    action_dim=1
    state_dim=n_sensor
    viewer=None#観測者
    viewer_xy=(500,500)#観測者の位置
    sensor_max=150.0#センサーの感度
    start_point=[450,300]#初期値
    speed=50.0#車の速度
    dt=0.1#時間の刻み幅
    
    def __init__(self,discrete_action=False):
        self.is_discrete_action=discrete_action
        if discrete_action:
            self.actions=[-1,0,1]#行動の種類
        else:
            self.action_bound=[-1,1]
            
        self.terminal=False#終了の判定
        
        #車の座標 (x,y,θ,w,l)
        self.car_info=np.array([0,0,0,20,40],dtype=np.float64)
        
        #障害物の設置
        self.obstacle_points=np.array([[120,120],[380,120],[380,380],[120,380],])
        self.sensor_info=self.sensor_max+np.zeros((self.n_sensor,3)) #(distance,end_x,end_y)
        
        
    def step(self,action):
        """next_state,reward,terminal"""
        if self.is_discrete_action:
            action=self.actions[action]#actionのインデクス
        else:
            action=np.clip(action,*self.action_bound)[0]
        
        #θについて
        self.car_info[2]+=action*np.pi/30
        #x,y
        self.car_info[:2]=self.car_info[:2]+self.speed*self.dt*np.array([np.cos(self.car_info[2]),np.sin(self.car_info[2])])
        
        #センサーのupdate
        self.update_sensor()
        
        #状態と報酬
        s=self.get_state()
        r=-1 if self.terminal else 0
        
        return s,r,self.terminal
    
    def reset(self):
        """初期化"""
        self.terminal=False
        self.car_info[:3]=np.array([*self.start_point,-np.pi/2])
        self.update_sensor()
        
        return self.get_state()
        
        
    def render(self):
        """観測状態を描画するためにviewer classを定義"""
        if self.viewer is None:
            self.viewer=Viewer(*self.viewer_xy,self.car_info,self.sensor_info,self.obstacle_points)
            
        self.viewer.render()
        
    def sample_action(self):
        if self.is_discrete_action:
            a-np.random.choice(list(range(3)))#3つの行動のインデクスから取り出す
        else:
            a=np.random.uniform(*self.action_bound,size=self.action_dim)
            
        return a
            
 
        
    def update_sensor(self):
        cx,cy,rotate=self.car_info[:3]
        
        n_sensors=len(self.sensor_info)
        sensor_theta=np.linspace(-np.pi/2,np.pi/2,n_sensors)#sensorの視角
        xs=cx+(np.zeros((n_sensors,))+self.sensor_max)*np.cos(sensor_theta)
        ys=cy+(np.zeros((n_sensors,))+self.sensor_max)*np.sin(sensor_theta)
        
        #sensorの座標
        xy_s=np.array([[x_s,y_s] for x_s,y_s in zip(xs,ys)])
        
        tmp_x=xy_s[:,0]-cx
        tmp_y=xy_s[:,1]-cy
        
        rotated_x=tmp_x*np.cos(rotate)-tmp_y*np.sin(rotate)
        rotated_y=tmp_x*np.sin(rotate)+tmp_y*np.sin(rotate)
        
        #x,yを変換する
        self.sensor_info[:,-2:]=np.vstack([rotated_x+cx,rotated_y+cy]).T
        
        
        #ゲーム内の車同士の衝突
        q=np.array([cx,cy])
        
        for si in range(len(self.sensor_info)):
            #s
            s=self.sensor_info[si,-2:]-q
            possible_sensor_distance=[self.sensor_max]
            possible_intersections=[self.sensor_info[si,-2:]]#x,y
            
            for op in range(len(self.obstacle_points)):
                p=self.obstacle_points[op]
                r=self.obstacle_points[(op+1)%len(self.obstacle_points)]-self.obstacle_points[op]
                
                #衝突する可能性
                if np.cross(r,s)!=0:
                    #衝突確率を表示
                    t=np.cross((q-p),s)/np.cross(r,s)
                    u=np.cross((q-p),r)/np.cross(r,s)
                    
                    if 0<=t<=1 and 0<=u<=1:
                        intersection=q+u*s
                        possible_intersections.append(intersection)
                        possible_sensor_distance.append(np.linalg.norm(u*s))
                        
            #ゲーム空間から出る場合
            game_xy=np.array([[0,0],[self.viewer_xy[0],0],[*self.viewer_xy],[0,self.viewer_xy[1]],[0,0]])
            
            #四つの端がある
            for ed in range(4):
                p=game_xy[ed]
                r=game_xy[(ed+1)%len(game_xy)]-game_xy[ed]
                
                if np.cross(r,s)!=0:
                    t=np.cross((q-p),s)/np.cross(r,s)
                    u=np.cross((q-p),r)/np.cross(r,s)
                    
                    if 0<=u<=1 and 0<=t<=1:
                        intersection=p+t*r
                        possible_intersections.append(intersection)
                        possible_sensor_distance.append(np.linalg.norm(intersection-q))
                        
                        
            distance=np.min(possible_sensor_distance)
            distance_index=np.argmin(possible_sensor_distance)
            self.sensor_info[si,0]=distance
            self.sensor_info[si,-2:]=possible_intersections[distance_index]
            
            if distance<self.car_info[-1]/2:
                self.terminal=True
                
                        
        
        
        
        
    def get_state(self):
        s=self.sensor_info[:,0].flatten()/self.sensor_max
        return s
    
    
    
    
class Viewer(pyglet.window.Window):
    color = {
        'background': [1]*3 + [1]
    }
    fps_display = pyglet.clock.ClockDisplay()
    bar_thc = 5

    def __init__(self, width, height, car_info, sensor_info, obstacle_coords):
        super(Viewer, self).__init__(width, height, resizable=False, caption='2D car', vsync=False)  # vsync=False to not use the monitor FPS
        self.set_location(x=80, y=10)
        pyglet.gl.glClearColor(*self.color['background'])

        self.car_info = car_info
        self.sensor_info = sensor_info

        self.batch = pyglet.graphics.Batch()
        background = pyglet.graphics.OrderedGroup(0)
        foreground = pyglet.graphics.OrderedGroup(1)

        self.sensors = []
        line_coord = [0, 0] * 2
        c = (73, 73, 73) * 2
        for i in range(len(self.sensor_info)):
            self.sensors.append(self.batch.add(2, pyglet.gl.GL_LINES, foreground, ('v2f', line_coord), ('c3B', c)))

        car_box = [0, 0] * 4
        c = (249, 86, 86) * 4
        self.car = self.batch.add(4, pyglet.gl.GL_QUADS, foreground, ('v2f', car_box), ('c3B', c))

        c = (134, 181, 244) * 4
        self.obstacle = self.batch.add(4, pyglet.gl.GL_QUADS, background, ('v2f', obstacle_coords.flatten()), ('c3B', c))

    def render(self):
        pyglet.clock.tick()
        self._update()
        self.switch_to()
        self.dispatch_events()
        self.dispatch_event('on_draw')
        self.flip()

    def on_draw(self):
        self.clear()
        self.batch.draw()
        # self.fps_display.draw()

    def _update(self):
        cx, cy, r, w, l = self.car_info

        # sensors
        for i, sensor in enumerate(self.sensors):
            sensor.vertices = [cx, cy, *self.sensor_info[i, -2:]]

        # car
        xys = [
            [cx + l / 2, cy + w / 2],
            [cx - l / 2, cy + w / 2],
            [cx - l / 2, cy - w / 2],
            [cx + l / 2, cy - w / 2],
        ]
        r_xys = []
        for x, y in xys:
            tempX = x - cx
            tempY = y - cy
            # apply rotation
            rotatedX = tempX * np.cos(r) - tempY * np.sin(r)
            rotatedY = tempX * np.sin(r) + tempY * np.cos(r)
            # rotated x y
            x = rotatedX + cx
            y = rotatedY + cy
            r_xys += [x, y]
        self.car.vertices = r_xys
    
    
    
        
    
    
    
    
    

In [2]:
#2Dcar
np.random.seed(1)
tf.set_random_seed(1)
DISCRETE_ACTION=False
env=CarEnv(DISCRETE_ACTION)
#parameters
MAX_EPISODES=1000
MAX_STEPS=600

#2D car
ACTION_DIM=env.action_dim#行動の数
STATE_DIM=env.state_dim#状態の数
ACTION_BOUND=env.action_bound#行動の制限
LOAD=False
RENDER=True#描画するかどうか


TRAIN_INTERVAL_2=300
TRAIN_INTERVAL_1=400
BATCH_SIZE=16
LEARNING_RATE=0.0001
MEMORY_CAPACITY=2000
GAMMA=0.9
VAR_MIN=0.1

In [3]:
#実行版

#入力データ(s,a,next_s)
with tf.name_scope('S'):
    S=tf.placeholder(tf.float32,shape=[None,STATE_DIM],name='s')
with tf.name_scope('R'):
    R=tf.placeholder(tf.float32,shape=[None,1],name='r')
with tf.name_scope('S_'):
    S_=tf.placeholder(tf.float32,shape=[None,STATE_DIM],name='s_')
    
#memory
class Memory(object):
    """dims:人数"""
    def __init__(self,capacity,dims):
        self.capacity=capacity
        self.memory=np.zeros((capacity,dims))
        self.count=0
        
    def memorize(self,s,a,r,s_):
        #遷移に保存
        transition=np.hstack((s,a,r,s_))#行方向に結合
        index=self.count%self.capacity#新しい記憶に
        self.memory[index,:]=transition
        self.count+=1
        
    def sample(self,batch_size):
        #ランダムにメモリを返す
        indexes=np.random.choice(self.capacity,batch_size)
        
        return self.memory[indexes,:]
    
    
#actor
class Actor(object):
    
    def __init__(self,sess,action_dim,action_bound,learning_rate):
        self.sess=sess#実行インスタンス
        self.action_dim=action_dim
        self.action_bound=action_bound
        self.lr=learning_rate
        self.t=0
        
        #create network advを返す　行動確率に等しい
        with tf.variable_scope('Actor'):
            self.a=self._build_net(S, scope='eval_net', trainable=True)
            self.a_=self._build_net(S_, scope='target_net', trainable=False)
            
        #それぞれのネットワークのパラメタ
        self.main_network_params=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Actor/eval_net')
        self.target_network_params=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Actor/target_net')
            
            
    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
            #重みとバイアスの初期化
            init_w=tf.contrib.layers.xavier_initializer()
            init_b=tf.constant_initializer(0.001)
            #全結合層 重みを更新する層にはtrainableを設定
            net=tf.layers.dense(s,100,activation=tf.nn.relu,kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            net=tf.layers.dense(net,20,activation=tf.nn.relu,kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            with tf.variable_scope('a'):
                #adv
                actions=tf.layers.dense(net,self.action_dim,activation=tf.nn.tanh,kernel_initializer=init_w,name='a',trainable=trainable)
                a_sc=tf.multiply(actions,self.action_bound,name='a_sc')
                
                
        return a_sc
        
    def learning(self,s):
        #状態の入力
        self.sess.run(self.train_op,feed_dict={S:s})
            
        #parameterの更新
            
        if self.t%TRAIN_INTERVAL_1==0:
            self.sess.run([tf.assign(t,m) for m,t in zip(self.main_network_params,self.target_network_params)])
        self.t+=1
            
    def opt(self,a_grads):
        with tf.variable_scope('policy_grads'):
            self.policy_grads=tf.gradients(ys=self.a,xs=self.main_network_params,grad_ys=a_grads)
                
        with tf.variable_scope('actor_train'):
            optimizer=tf.train.RMSPropOptimizer(-self.lr)
            self.train_op=optimizer.apply_gradients(zip(self.policy_grads,self.main_network_params))
             
            
    def get_action(self,s):
        #stateの配列を一元化
        s=s[np.newaxis,:]
        return self.sess.run(self.a,feed_dict={S:s})[0]
    
    
class Critic(object):
    #評価するためにはs,aが必要
    def __init__(self,sess,a,a_):
        self.sess=sess
        self.num_actions=ACTION_DIM
        self.num_states=STATE_DIM
        self.t=0
        
        with tf.variable_scope('Critic'):
            #main
            self.a=a
            self.q_main=self._build_net(S,self.a,scope='eval_net',trainable=True)
            
            #target
            self.a_=a_
            self.q_target=self._build_net(S_,self.a_,scope='target_net',trainable=False)
            
            #weights_params
            self.main_network_weights=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Critic/eval_net')
            self.target_network_weights=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Critic/target_net')
            
            
        #最適化のインスタンス
        with tf.variable_scope('target_q'):
            self.target_q=self.q_target*GAMMA+R
            
        with tf.variable_scope('loss'):
            self.loss=tf.reduce_mean(tf.square(self.target_q-self.q_main))
            
        with tf.variable_scope('Critic_train'):
            optimizer=tf.train.RMSPropOptimizer(LEARNING_RATE)
            self.train_op=optimizer.minimize(self.loss)
            
        with tf.variable_scope('a_grads'):
            self.a_grads=tf.gradients(self.q_main,self.a)[0]
            
    def _build_net(self,s,a,scope,trainable):
        with tf.variable_scope(scope):
            init_w=tf.contrib.layers.xavier_initializer()
            init_b=tf.constant_initializer(0.01)
            
            #state,actionで別々に総入力関数を出すことがポイント
            with tf.variable_scope('l1'):
                n_l1=100
                w1_s=tf.get_variable('w1_s',[STATE_DIM,n_l1],initializer=init_w,trainable=trainable)
                w1_a=tf.get_variable('w1_a',[ACTION_DIM,n_l1],initializer=init_w,trainable=trainable)
                b1=tf.get_variable('b1',[1,n_l1],initializer=init_b,trainable=trainable)
                net=tf.nn.relu6(tf.matmul(s,w1_s)+tf.matmul(a,w1_a)+b1)
            net=tf.layers.dense(net,20,activation=tf.nn.relu,kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            
            with tf.variable_scope('q'):
                q=tf.layers.dense(net,1,kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
                
        return q
    
    def learning(self,s,a,r,s_):
        self.sess.run(self.train_op,feed_dict={S:s,self.a:a,R:r,S_:s_})
        if self.t%TRAIN_INTERVAL_2==0:
            self.sess.run([tf.assign(t,m) for m,t in zip(self.main_network_weights,self.target_network_weights)])
            
        self.t+=1
        
#main
sess=tf.Session()

actor=Actor(sess,ACTION_DIM,ACTION_BOUND[1],LEARNING_RATE)
critic=Critic(sess,actor.a,actor.a_)#actor側の行動確率を引数に
actor.opt(critic.a_grads)

Memo=Memory(MEMORY_CAPACITY,dims=2*STATE_DIM+ACTION_DIM+1)

#保存インスタンス
saver=tf.train.Saver()
#保存先のパス
path='./discrete'if DISCRETE_ACTION else './continuous'

if LOAD:
    #学習済みのデータを用いるときは
    saver.restor(sess,tf.train.latest_chekpoint(path))
else:
    #そうでないときはパラメタを初期化
    sess.run(tf.global_variables_initializer())


def train():
    var=2.0
    for ep in range(MAX_EPISODES):
        s=env.reset()
        step=0

        for t in range(MAX_STEPS):
            if RENDER:
                env.render()

            action=actor.get_action(s)
            action=np.clip(np.random.normal(action,var),*ACTION_BOUND)#random
            #環境にぶち込んでnext_satte,reward,terminalをもらう
            s_,r,terminal=env.step(action)

            Memo.memorize(s,action,r,s_)

            if Memo.count > MEMORY_CAPACITY:
                var=max([var*0.995,VAR_MIN])
                #バッチ生成
                minibatch=Memo.sample(BATCH_SIZE)
                s_batch=minibatch[:,:STATE_DIM]
                a_batch=minibatch[:,STATE_DIM:STATE_DIM+ACTION_DIM]
                r_batch=minibatch[:,-STATE_DIM-1:-STATE_DIM]
                ns_batch=minibatch[:,-STATE_DIM:]

                #actor,criticの双方のネットワークを学習
                critic.learning(s_batch,a_batch,r_batch,ns_batch)
                actor.learning(s_batch)

            #状態の更新
            s=s_
            step+=1

            #終了するかステップ数が最大になった時
            if terminal or t==MAX_STEPS-1:
                print('episode:{}/steps:{}'.format(ep,int(step)))

                break

    #保存
    if not os.path.exists(path):
        os.mkdir(path)
        ckpt_path=os.path.join(path,'DDPG.ckpt')
        save_path=saver.save(sess,ckpt_path)
        print('saved')
        
        
        
        
if __name__=='__main__':
    train()
                
        
        
    
            
            
        
        
            

    

    

episode:0/steps:35
episode:1/steps:42
episode:2/steps:31
episode:3/steps:55
episode:4/steps:34
episode:5/steps:61
episode:6/steps:57
episode:7/steps:63
episode:8/steps:60
episode:9/steps:57
episode:10/steps:24
episode:11/steps:18
episode:12/steps:23
episode:13/steps:22
episode:14/steps:57
episode:15/steps:58
episode:16/steps:33
episode:17/steps:38
episode:18/steps:58
episode:19/steps:23
episode:20/steps:22
episode:21/steps:55
episode:22/steps:58
episode:23/steps:61
episode:24/steps:58
episode:25/steps:69
episode:26/steps:25
episode:27/steps:49
episode:28/steps:58
episode:29/steps:27
episode:30/steps:58
episode:31/steps:36
episode:32/steps:24
episode:33/steps:30
episode:34/steps:60
episode:35/steps:58
episode:36/steps:30
episode:37/steps:59
episode:38/steps:60
episode:39/steps:62
episode:40/steps:34
episode:41/steps:22
episode:42/steps:21
episode:43/steps:64
episode:44/steps:23
episode:45/steps:60
episode:46/steps:57
episode:47/steps:26
episode:48/steps:58
episode:49/steps:27
episode:50

episode:395/steps:96
episode:396/steps:95
episode:397/steps:95
episode:398/steps:96
episode:399/steps:65
episode:400/steps:94
episode:401/steps:95
episode:402/steps:94
episode:403/steps:92
episode:404/steps:92
episode:405/steps:92
episode:406/steps:91
episode:407/steps:88
episode:408/steps:84
episode:409/steps:83
episode:410/steps:85
episode:411/steps:84
episode:412/steps:30
episode:413/steps:19
episode:414/steps:18
episode:415/steps:19
episode:416/steps:18
episode:417/steps:18
episode:418/steps:18
episode:419/steps:18
episode:420/steps:18
episode:421/steps:18
episode:422/steps:18
episode:423/steps:18
episode:424/steps:17
episode:425/steps:18
episode:426/steps:17
episode:427/steps:18
episode:428/steps:18
episode:429/steps:19
episode:430/steps:20
episode:431/steps:19
episode:432/steps:19
episode:433/steps:19
episode:434/steps:21
episode:435/steps:20
episode:436/steps:21
episode:437/steps:21
episode:438/steps:21
episode:439/steps:23
episode:440/steps:21
episode:441/steps:22
episode:442/s

episode:776/steps:31
episode:777/steps:86
episode:778/steps:91
episode:779/steps:87
episode:780/steps:89
episode:781/steps:89
episode:782/steps:87
episode:783/steps:90
episode:784/steps:85
episode:785/steps:88
episode:786/steps:89
episode:787/steps:88
episode:788/steps:92
episode:789/steps:92
episode:790/steps:89
episode:791/steps:91
episode:792/steps:92
episode:793/steps:90
episode:794/steps:91
episode:795/steps:90
episode:796/steps:89
episode:797/steps:90
episode:798/steps:90
episode:799/steps:88
episode:800/steps:38
episode:801/steps:88
episode:802/steps:87
episode:803/steps:89
episode:804/steps:86
episode:805/steps:88
episode:806/steps:87
episode:807/steps:89
episode:808/steps:87
episode:809/steps:84
episode:810/steps:39
episode:811/steps:87
episode:812/steps:85
episode:813/steps:84
episode:814/steps:86
episode:815/steps:85
episode:816/steps:87
episode:817/steps:87
episode:818/steps:84
episode:819/steps:84
episode:820/steps:83
episode:821/steps:83
episode:822/steps:40
episode:823/s

In [11]:
#memory
class Memory(object):
    """dims:人数"""
    def __init__(self,capacity,dims):
        self.capacity=capacity
        self.memory=np.zeros((capacity,dims))
        self.count=0
        
    def memorize(self,s,a,r,s_):
        #遷移に保存
        transition=np.hstack((s,a,r,s_))#行方向に結合
        index=self.count%self.capacity#新しい記憶に
        self.memory[index,:]=transition
        self.count+=1
        
    def sample(self,batch_size):
        #ランダムにメモリを返す
        indexes=np.random.choice(self,capacity,batch_size)
        
        return self.memory[indexes,:]

In [12]:
#actor
class Actor(object):
    
    def __init__(self,sess,action_dim,action_bound,learning_rate,train_interval):
        self.sess=sess#実行インスタンス
        self.action_dim=action_dim
        self.action_bound=ACTION_BOUND
        self.lr=learning_rate
        self.train_interval=train_interval
        self.t=0
        
        #create network advを返す　行動確率に等しい
        with tf.variable_scope('Actor'):
            self.a=self._build_net(S, scope='eval_net', trainable=True)
            self.a_=self._build_net(S_, scope='target_net', trainable=False)
            
        #それぞれのネットワークのパラメタ
        self.main_network_params=tf.get_collection(tf.GraphKeys.GLOABAL_VARIABLES,scope='Actor/eval_net')
        self.target_network_params=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Actor/target_net')
            
            
    def _build_net(self, s, scope, trainable):
        #これでパラメータの共有化が可能
        with tf.variable_scope(scope):
            #重みとバイアスの初期化
            init_w=tf.contrib.layers.xavier_initializer()
            init_b=tf.constant_initializer(0)
            #全結合層 重みを更新する層にはtrainableを設定
            net=tf.layers.dense(s,100,activation='relu',kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            net=tf.layers.dense(net,20,activation='relu',kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            with tf.variable_scope('a'):
                #adv
                actions=tf.layers.dense(net,self.action_dim,activation=tf.nn.tanh,kernel_initializer=init_w,name='a',trainable=trainable)
                a_sc=tf.multiply(actions,self.action_bound,name='a_sc')
                
                
        return a_sc
        
    def learning(self,s):
        #状態の入力
        self.sess.run(self.train_op,feed_dict={S:s})
            
        #parameterの更新
            
        if self.t%self.train_interval==0:
            self.sess.run([tf.assign(m,t) for m,t in zip(self.main_network_params,self.target_network_params)])
        self.t+=1
            
    def opt(self,a_grads):
        with tf.variable_scope('policy_grads'):
            self.policy_grads=tf.gradients(ys=self.a,xs=self.main_network_params,grad_ys=a_grads)
                
        with tf.variable_scope('actor_train'):
            optimizer=tf.train.RMSPropOptimizer(-self.lr)
            self.train_op=optimizer.apply_gradients(zip(self.policy_grads,self.main_network_params))
             
            
    def get_action(self,s):
        #stateの配列を一元化
        s=s[np.newaxis,:]
        return self.sess.run(self.a,feed_dict={S:s})[0]
        
        
            
            
            

In [13]:
class Critic(object):
    #評価するためにはs,aが必要
    def __init__(self,sess,a,a_):
        self.sess=sess
        self.num_actions=ACTION_DIM
        self.num_states=STATE_DIM
        self.t=0
        
        with tf.variable_scope('Critic'):
            #main
            self.a=a
            self.q_main=self.net(S,self.a,scope='eval_net',trainable=True)
            
            #target
            self.a_=a_
            self.q_target=self.net(S_,self.a_,scope='target_net',trainable=False)
            
            #weights_params
            self.main_network_weights=tf.get_collection(tf.GraphKeys.GLOABAL_VARIABLES,scope='Critic/eval_net')
            self.target_network_weights=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,scope='Critic/target_net')
            
            
        #最適化のインスタンス
        with tf.variable_scope('target_q'):
            self.target_q=self.q_target*GAMMA+R
            
        with tf.variable_scope('loss'):
            self.loss=tf.reduce_mean(tf.square(self.target_q-self.q_main))
            
        with tf.variable_scope('Critic_train'):
            optimizer=tf.train.RMSPropOptimizer(LEARNING_RATE)
            self.train_op=optimizer.minimize(self.loss)
            
        with tf.variable_scope('a_grads'):
            self.a_grads=tf.gradients(self.q_main,self.a)[0]
            
    def net(self,s,scope,trainable):
        with tf,variable_scope(scope):
            init_w=tf.contrib.layers.xavier_initializer()
            init_b=tf.constant_initializer(0)
            
            net=tf.layers.dense(s,100,activation='relu',kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            net=tf.layers.dense(net,20,activation='relu',kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
            
            with tf.variable_scope('q'):
                q=tf.layers.dense(net,1,kernel_initializer=init_w,bias_initializer=init_b,trainable=trainable)
                
        return q
    
    def learning(self,s,a,r,s_):
        self.sess.run(self.train_op,feed_dict={S:s,self.a:a,R:r,S_:s_})
        if self.t%TRAIN_INTERVAL==0:
            self.sess.run([tf.assign(m,t) for m,t in zip(self.main_network_weights,self.target_network_weights)])
            
        self.t+=1
        
        
    
            
            

In [14]:
#main
sess=tf.Session()

actor=Actor(sess,ACTION_DIM,ACTION_BOUND,LEARNING_RATE,TRAIN_INTERVAL)
critic=Critic(sess,actor.a,actor.a_)#actor側の行動確率を引数に
actor.opt(critic.a_grads)

Memo=Memory(MEMORY_CAPACITY,dims=2*STATE_DIM+ACTION_DIM+1)

#保存インスタンス
saver=tf.train.Saver()
#保存先のパス
path='./discrete'if DISCRETE_ACTION else './continuous'

if LOAD:
    #学習済みのデータを用いるときは
    saver.restor(sess,tf.train.latest_chekpoint(path))
else:
    #そうでないときはパラメタを初期化
    sess.run(tf.global_variables_initializer())


def train():
    var=2.0
    for ep in range(MAX_EPISODES):
        s=env.reset()
        step=0

        for t in range(MAX_STEPS):
            if RENDER:
                env.render()

            action=actor.get_action(s)
            action=np.clip(np.random.normal(a,var),*ACTION_BOUND)#random
            #環境にぶち込んでnext_satte,reward,terminalをもらう
            s_,r,terminal=env.step(a)

            Memo.memorize(s,a,r,s_)

            s_batch=[]
            a_batch=[]
            r_batch=[]
            ns_batch=[]


            if Memo.count > MEMORY_CAPACITY:
                #バッチ生成
                minibatch=Memo.sample(BATCH_SIZE)

                for bt in minibatch:
                    s_batch.append(bt[0])
                    a_batch.append(bt[1])
                    r_batch.append(bt[2])
                    ns_batch.append(bt[3])

                #actor,criticの双方のネットワークを学習
                critic.learning(s_batch,a_batch,r_batch,ns_batch)
                actor.learning(s_batch)

            #状態の更新
            s=s_
            step+=1

            #終了するかステップ数が最大になった時
            if terminal or t==MAX_STEPS-1:
                print('episode:{}/steps:{}'.format(ep,int(step)))

                break

    #保存
    if not os.path.exists(path):
        os.mkdir(path)
        ckpt_path=os.path.join(path,'DDPG.ckpt')
        save_path=saver.save(sess,ckpt_path)
        print('saved')
                
            
                



AttributeError: __enter__