# Reinforcement Learning Exercises

# Exercise 1

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from the vertical, or the cart moves more than 2.4 units from the center.

Task: Create a reinforcement learning algorithm
> - Using Q leaning
> - Design interfaces to OpenAIGym

CartPole code: http://github.com/openai/mish/blob/master/gym/gym/envs/classic_control_cartpole.py

# Environment

In [3]:
import gym # pip install gym
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample())
env.close()

# Q-learning

In [8]:
import gym
import numpy as np
import time


#create discretization samples
def bins(clip_min, clip_max, num):
    return np.linspace(clip_min, clip_max, num + 1)[1:-1]


#discretize the state
def digitize_state(observation):
    cart_pos, cart_v, pole_angle, pole_v = observation
    digitized = [
        np.digitize(cart_pos, bins=bins(-2.4, 2.4, num_dizitized)),
        np.digitize(cart_v, bins=bins(-3.0, 3.0, num_dizitized)),
        np.digitize(pole_angle, bins=bins(-0.5, 0.5, num_dizitized)),
        np.digitize(pole_v, bins=bins(-2.0, 2.0, num_dizitized))
    ]



    return sum([x * (num_dizitized ** i) for i, x in enumerate(digitized)]) #state




#choose action by using epsilon greedy policy
def get_action(next_state, episode):

    epsilon = 0.5 * (1 / (episode + 1))
    if epsilon <= np.random.uniform(0, 1):
        next_action = np.argmax(q_table[next_state])
    else:
        next_action = np.random.choice([0, 1])
    return next_action

#building Q table: Q(s,a)= Q(s,a)+alpha[r+gamma maxQ(s_,a_)-Q(s,a)]
def update_Qtable(q_table, state, action, reward, next_state):
    gamma = 0.99
    alpha = 0.5
    next_Max_Q = max(q_table[next_state][0], q_table[next_state][1])
    q_table[state, action] = (1 - alpha) * q_table[state, action] + \
                             alpha * (reward + gamma * next_Max_Q)

    return q_table


env = gym.make('CartPole-v0')
max_number_of_steps = 200
num_consecutive_iterations = 100
num_episodes = 2000
goal_average_reward = 195

num_dizitized = 6
q_table = np.random.uniform(
    low=-1, high=1, size=(num_dizitized ** 4, env.action_space.n))

total_reward_vec = np.zeros(num_consecutive_iterations)
final_x = np.zeros((num_episodes, 1))
islearned = 0
isrender = 0

#training
for episode in range(num_episodes):
    observation = env.reset()
    state = digitize_state(observation)
    action = np.argmax(q_table[state])
    episode_reward = 0

    for t in range(max_number_of_steps):
        if islearned == 1:
            env.render()
            time.sleep(0.1)
            print(observation[0])


        observation, reward, done, info = env.step(action)


        if done:
            if t < 195:
                reward = -200
            else:
                reward = 1
        else:
            reward = 1       # define reward as the task description

        episode_reward += reward


        next_state = digitize_state(observation)
        q_table = update_Qtable(q_table, state, action, reward, next_state)


        action = get_action(next_state, episode)

        state = next_state


        if done:
            print('%d Episode finished after %f time steps / mean %f' %
                  (episode, t + 1, total_reward_vec.mean()))
            total_reward_vec = np.hstack((total_reward_vec[1:],
                                          episode_reward))
            if islearned == 1:
                final_x[episode, 0] = observation[0]
            break

    if (total_reward_vec.mean() >=
            goal_average_reward):
        print('Episode %d train agent successfuly!' % episode)
        islearned = 1
        if isrender == 0:
            isrender = 1


if islearned:
    np.savetxt('final_x.csv', final_x, delimiter=",")

0 Episode finished after 19.000000 time steps / mean 0.000000
1 Episode finished after 12.000000 time steps / mean -1.820000
2 Episode finished after 17.000000 time steps / mean -3.710000
3 Episode finished after 16.000000 time steps / mean -5.550000
4 Episode finished after 15.000000 time steps / mean -7.400000
5 Episode finished after 52.000000 time steps / mean -9.260000
6 Episode finished after 47.000000 time steps / mean -10.750000
7 Episode finished after 27.000000 time steps / mean -12.290000
8 Episode finished after 24.000000 time steps / mean -14.030000
9 Episode finished after 12.000000 time steps / mean -15.800000
10 Episode finished after 34.000000 time steps / mean -17.690000
11 Episode finished after 48.000000 time steps / mean -19.360000
12 Episode finished after 28.000000 time steps / mean -20.890000
13 Episode finished after 51.000000 time steps / mean -22.620000
14 Episode finished after 13.000000 time steps / mean -24.120000
15 Episode finished after 29.000000 time s

126 Episode finished after 200.000000 time steps / mean -34.050000
127 Episode finished after 200.000000 time steps / mean -31.520000
128 Episode finished after 200.000000 time steps / mean -28.990000
129 Episode finished after 133.000000 time steps / mean -25.270000
130 Episode finished after 200.000000 time steps / mean -24.070000
131 Episode finished after 200.000000 time steps / mean -20.200000
132 Episode finished after 177.000000 time steps / mean -20.200000
133 Episode finished after 200.000000 time steps / mean -18.600000
134 Episode finished after 175.000000 time steps / mean -15.840000
135 Episode finished after 126.000000 time steps / mean -15.430000
136 Episode finished after 172.000000 time steps / mean -14.290000
137 Episode finished after 200.000000 time steps / mean -13.130000
138 Episode finished after 138.000000 time steps / mean -9.220000
139 Episode finished after 91.000000 time steps / mean -7.940000
140 Episode finished after 154.000000 time steps / mean -7.270000

252 Episode finished after 144.000000 time steps / mean 48.780000
253 Episode finished after 106.000000 time steps / mean 46.210000
254 Episode finished after 133.000000 time steps / mean 43.260000
255 Episode finished after 144.000000 time steps / mean 42.640000
256 Episode finished after 51.000000 time steps / mean 40.070000
257 Episode finished after 127.000000 time steps / mean 36.570000
258 Episode finished after 109.000000 time steps / mean 33.830000
259 Episode finished after 143.000000 time steps / mean 30.910000
260 Episode finished after 157.000000 time steps / mean 31.360000
261 Episode finished after 11.000000 time steps / mean 28.920000
262 Episode finished after 200.000000 time steps / mean 25.020000
263 Episode finished after 200.000000 time steps / mean 25.020000
264 Episode finished after 133.000000 time steps / mean 25.020000
265 Episode finished after 200.000000 time steps / mean 24.940000
266 Episode finished after 140.000000 time steps / mean 24.940000
267 Episode 

382 Episode finished after 194.000000 time steps / mean 85.510000
383 Episode finished after 137.000000 time steps / mean 83.450000
384 Episode finished after 200.000000 time steps / mean 83.290000
385 Episode finished after 149.000000 time steps / mean 83.290000
386 Episode finished after 200.000000 time steps / mean 82.840000
387 Episode finished after 187.000000 time steps / mean 82.840000
388 Episode finished after 166.000000 time steps / mean 80.700000
389 Episode finished after 158.000000 time steps / mean 80.800000
390 Episode finished after 200.000000 time steps / mean 80.960000
391 Episode finished after 200.000000 time steps / mean 80.960000
392 Episode finished after 153.000000 time steps / mean 83.180000
393 Episode finished after 200.000000 time steps / mean 82.760000
394 Episode finished after 137.000000 time steps / mean 82.760000
395 Episode finished after 200.000000 time steps / mean 82.730000
396 Episode finished after 162.000000 time steps / mean 84.960000
397 Episod

512 Episode finished after 200.000000 time steps / mean 38.980000
513 Episode finished after 197.000000 time steps / mean 38.980000
514 Episode finished after 189.000000 time steps / mean 38.950000
515 Episode finished after 179.000000 time steps / mean 36.830000
516 Episode finished after 174.000000 time steps / mean 34.610000
517 Episode finished after 200.000000 time steps / mean 34.480000
518 Episode finished after 164.000000 time steps / mean 34.480000
519 Episode finished after 140.000000 time steps / mean 34.340000
520 Episode finished after 163.000000 time steps / mean 31.730000
521 Episode finished after 129.000000 time steps / mean 29.350000
522 Episode finished after 136.000000 time steps / mean 29.040000
523 Episode finished after 200.000000 time steps / mean 26.390000
524 Episode finished after 200.000000 time steps / mean 26.390000
525 Episode finished after 180.000000 time steps / mean 28.810000
526 Episode finished after 200.000000 time steps / mean 26.600000
527 Episod

KeyboardInterrupt: 

# Exercise 2

A 6 x 6 simulator, where a blue circle can move right, left, up and down is given. That blue circle targets to reach to the yellow circle. Along the way there are black holes and green square objects. Since, those black holes and green square objects are obstacles, the blue circle should learn to reach to the target by avoiding collision with those  obstacles.

Task 1: Change the environment method in order to assign the reward function to the agent as follows:

> - -20 when blue circle falls into the black hole
> - -10 when blue circle falls into the green square
> -  100 when blue circle reaches to the yellow circle
> -  0 else

Task 2: Provide a Q learning algorithm


# Environment

In [2]:
import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk


UNIT = 40   # pixels
Grid_H = 6  # grid height
Grid_W = 6  # grid width


class Grid_Environmnet(tk.Tk, object):
    def __init__(self):
        super(Grid_Environmnet, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('Grid Environmnet')
        self.geometry('{0}x{1}'.format(Grid_H * UNIT, Grid_H * UNIT))
        self._build_Grid_Environmnet()


    def _build_Grid_Environmnet(self):
        self.canvas = tk.Canvas(self, bg='white',
                           height=Grid_H * UNIT,
                           width=Grid_W * UNIT)

        # create grids
        for c in range(0, Grid_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, Grid_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, Grid_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, Grid_H * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin
        origin = np.array([20, 20])

        # hell
        hell1_center = origin + np.array([UNIT *4, UNIT*3])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='green')
        # hell
        hell2_center = origin + np.array([UNIT*3, UNIT * 4])
        self.hell2 = self.canvas.create_rectangle(
            hell2_center[0] - 15, hell2_center[1] - 15,
            hell2_center[0] + 15, hell2_center[1] + 15,
            fill='green')
        # black hole
        hell3_center = origin + np.array([0, UNIT*2])
        self.hell3 = self.canvas.create_rectangle(
            hell3_center[0] - 15, hell3_center[1] - 15,
            hell3_center[0] + 15, hell3_center[1] + 15,
            fill='black')

        hell4_center = origin + np.array([UNIT*3, UNIT])
        self.hell4 = self.canvas.create_rectangle(
            hell4_center[0] - 15, hell4_center[1] - 15,
            hell4_center[0] + 15, hell4_center[1] + 15,
            fill='black')

        # create oval
        oval_center = origin + UNIT * 4
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')


        oval_center = origin + UNIT * 2
        self.oval_blue = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='blue')
        # pack all
        self.canvas.pack()

    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.oval_blue)
        origin = np.array([20, 20])
        self.oval_blue = self.canvas.create_oval(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='blue')
        # return observation
        return self.canvas.coords(self.oval_blue)

    def step(self, action):
        s = self.canvas.coords(self.oval_blue)
        base_action = np.array([0, 0])
        if action == 0:   # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:   # down
            if s[1] < (Grid_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:   # right
            if s[0] < (Grid_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:   # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.oval_blue, base_action[0], base_action[1])  # move agent

        s_ = self.canvas.coords(self.oval_blue)  # next state

        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 100
            done = True
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -10
            done = True
        elif s_ in [self.canvas.coords(self.hell3)]:
            reward = -20
            done = True
        elif s_ in [self.canvas.coords(self.hell4)]:
            reward = -20
            done = True
        else:
            reward = 0
            done = False

        return s_, reward, done


    def render(self):
        time.sleep(0.1)
        self.update()


def update():
    for t in range(1):
        s = env.reset()
        while True:
            env.render()
            a = 1
            s, r, done = env.step(a)
            if done:
                break

if __name__ == '__main__':
    env = Grid_Environmnet()
    env.after(100, update)
    env.mainloop()

# Q learning for the Grid Environment

In [2]:
import numpy as np
import pandas as pd
#from Environment import Grid_Environmnet


class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def chooseAction(self, observation):
        self.checkStateExist(observation)

        # action selection
        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.q_table.ix[observation, :]
            state_action = state_action.reindex(np.random.permutation(state_action.index))     # some actions have same value
            action = state_action.argmax()
        else:
            # choose random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.checkStateExist(s_)
        q_predict = self.q_table.ix[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.ix[s_, :].max()  # next state is not terminal
        else:
            q_target = r  # next state is terminal
        self.q_table.ix[s, a] += self.lr * (q_target - q_predict)  # update
        print(self.q_table.ix[s,a])
        #print(s)

    def checkStateExist(self, state):
        if state not in self.q_table.index:
            # append new state to q table
            self.q_table = self.q_table.append(
                pd.Series(
                    [0]*len(self.actions),
                    index=self.q_table.columns,
                    name=state,

                )
            )
            #print(self.q_table)





def update():
    for episode in range(1000):
        # initial observation
        observation = env.reset()

        while True:
            # fresh env#
            env.render()

            # RL choose action based on observation
            action = RL.chooseAction(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('Search finished')
    env.destroy()

if __name__ == "__main__":
    env = Grid_Environmnet()
    RL = QLearningTable(actions=list(range(env.n_actions)))

    env.after(1000, update)
    env.mainloop()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#

0.0
0.0
-0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.199
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\MEC\Anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "C:\Users\MEC\Anaconda3\lib\tkinter\__init__.py", line 749, in callit
    func(*args)
  File "<ipython-input-2-d03f54f9a27a>", line 69, in update
    observation_, reward, done = env.step(action)
  File "<ipython-input-1-75065677ede2>", line 95, in step
    s = self.canvas.coords(self.oval_blue)
  File "C:\Users\MEC\Anaconda3\lib\tkinter\__init__.py", line 2469, in coords
    self.tk.call((self._w, 'coords') + args))]
_tkinter.TclError: invalid command name ".!canvas"


# Exercise 3

Environement EnvRob contains two robots, which have mission of approaching the black box. This is happen, whenever the black box appears to be in the area of robots.

Task: Design and implement the learning algorithm, which will enable the robots to accomplish the mission stated.

# EnvRob

In [2]:
import numpy as np
import pyglet
D = 0


pyglet.clock.set_fps_limit(10000)
# pyglet: Load images, sound, music and video in almost any format

class ArmEnv(object):
    action_bound = [-1, 1]
    action_dim = 2
    state_dim = 7
    dt = .1  # refresh rate
    arm1l = 100
    arm2l = 100
    arm1lb = 100
    arm2lb = 100
    viewer = None
    viewer_xy = (400, 400)
    #viewer = (800, 800)

    vers = (100, 100)
    get_point = False
    mouse_in = np.array([False])
    point_l = 15
    grab_counter = 0

    def __init__(self, mode='easy'):
        self.mode = mode
        self.arm_info = np.zeros((2, 4))
        self.arm_infob = np.zeros((2, 4))
        self.arm_info[0, 0] = self.arm1l
        self.arm_info[1, 0] = self.arm2l
        self.arm_infob[0, 0] = self.arm1lb
        self.arm_infob[1, 0] = self.arm2lb
        self.point_info = np.array([250, 303])
        self.point_info_init = self.point_info.copy()
        self.center_coord = np.array(self.viewer_xy)/2


    def step1(self, action):
        #action = (node1 angular v, node2 angular v)
        action = np.clip(action, *self.action_bound)
        self.arm_info[:, 1] += action * self.dt
        self.arm_info[:, 1] %= np.pi * 2

        arm1rad = self.arm_info[0, 1]
        arm2rad = self.arm_info[1, 1]
        arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
        arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
        self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy  # (x1, y1)
        self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy  # (x2, y2)

        s, arm2_distance = self._get_state1()
        r1= self._r_func1(arm2_distance, center_coord=self.center_coord)
        return s, r1, self.get_point


    def step2(self, action):
        # action = (node1 angular v, node2 angular v)
        action = np.clip(action, *self.action_bound)
        self.arm_infob[:, 1] += action * self.dt
        self.arm_infob[:, 1] %= np.pi * 2

        arm1radb = self.arm_infob[0, 1]
        arm2radb = self.arm_infob[1, 1]
        arm1dx_dyb = np.array([self.arm_infob[0, 0] * np.cos(arm1radb), self.arm_infob[0, 0] * np.sin(arm1radb)])
        arm2dx_dyb = np.array([self.arm_infob[1, 0] * np.cos(arm2radb), self.arm_infob[1, 0] * np.sin(arm2radb)])
        self.arm_infob[0, 2:4] = self.center_coord + arm1dx_dyb  # (x1, y1)
        self.arm_infob[1, 2:4] = self.arm_infob[0, 2:4] + arm2dx_dyb  # (x2, y2)


        s, arm2b_distance = self._get_state2()
        r2 = self._r_func2(arm2b_distance, center_coord = -self.center_coord)
        return s, r2, self.get_point

    def reset1(self):
        self.get_point = False
        self.grab_counter = 0

        if self.mode == 'hard':
            pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 150, 250)
            self.point_info[:] = pxy
        else:
            arm1rad, arm2rad = np.random.rand(2) * np.pi * 2
            self.arm_info[0, 1] = arm1rad
            self.arm_info[1, 1] = arm2rad
            arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
            arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
            self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy  # (x1, y1)
            self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy  # (x2, y2)


            self.point_info[:] = self.point_info_init
        return self._get_state1()[0]

    def reset2(self):
        self.get_point = False
        self.grab_counter = 0

        if self.mode == 'hard':
            pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 150, 250)
            self.point_info[:] = pxy
        else:
            arm1rad, arm2rad = np.random.rand(2) * np.pi * 2
            self.arm_info[0, 1] = arm1rad
            self.arm_info[1, 1] = arm2rad
            arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
            arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
            self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy  # (x1, y1)
            self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy  # (x2, y2)


            arm1radb, arm2radb = np.random.rand(2) * np.pi * 2
            self.arm_infob[0, 1] = arm1radb
            self.arm_infob[1, 1] = arm2radb
            arm1dx_dyb = np.array([self.arm_infob[0, 0] * np.cos(arm1radb), self.arm_infob[0, 0] * np.sin(arm1radb)])
            arm2dx_dyb = np.array([self.arm_infob[1, 0] * np.cos(arm2radb), self.arm_infob[1, 0] * np.sin(arm2radb)])
            self.arm_infob[0, 2:4] = self.center_coord + arm1dx_dyb  # (x1, y1)
            self.arm_infob[1, 2:4] = self.arm_infob[0, 2:4] + arm2dx_dyb  # (x2, y2)

            self.point_info[:] = self.point_info_init
        return self._get_state2()[0]

    def render(self):
        if self.viewer is None:
            self.viewer = Viewer(*self.viewer_xy, self.arm_info, self.arm_infob, self.point_info, self.point_l, self.mouse_in)
        self.viewer.render()



    def sample_action(self):
        return np.random.uniform(*self.action_bound, size=self.action_dim)

    def set_fps(self, fps=30):
        pyglet.clock.set_fps_limit(fps)

    def _get_state1(self):
        # return the distance (dx, dy) between arm finger point with blue point
        arm_end = self.arm_info[:, 2:4]
        t_arms = np.ravel(arm_end - self.point_info)
        center_dis = (self.center_coord - self.point_info) / 200
        in_point = 1 if self.grab_counter > 0 else 0
        return np.hstack([in_point, t_arms / 200, center_dis,
                          ]), t_arms[-2:]

    def _get_state2(self):
        # return the distance (dx, dy) between arm finger point with blue point
        arm_end = self.arm_info[:, 2:4]
        arm_endb = self.arm_infob[:, 2:4]
        t_arms = np.ravel(arm_end - self.point_info)
        t_armsb = np.ravel(arm_endb - self.point_info)
        center_dis = (self.center_coord - self.point_info)/200
        in_point = 1 if self.grab_counter > 0 else 0
        return np.hstack([in_point, t_armsb/200, center_dis,
                          ]), t_armsb[-2:]



    def _r_func1(self, distance, center_coord):
        t = 50
        abs_distance = np.sqkrt(np.sum(np.square(distance+center_coord/2)))
        r1 = -abs_distance/200
        if abs_distance < self.point_l and (not self.get_point):
            r1 += 1.
            self.grab_counter += 1
            if self.grab_counter > t:
                r1 += 10.
                self.get_point = True
        elif abs_distance > self.point_l:
            self.grab_counter = 0
            self.get_point = False
        return r1

    def _r_func2(self, distance, center_coord):
        t = 50
        abs_distance = np.sqrt(np.sum(np.square(distance + center_coord / 2)))
        r2 = -abs_distance / 200
        if abs_distance < self.point_l and (not self.get_point):
            r2 += 1.
            self.grab_counter += 1
            if self.grab_counter > t:
                r2 += 10.
                self.get_point = True
        elif abs_distance > self.point_l:
            self.grab_counter = 0
            self.get_point = False
        return r2


class Viewer(pyglet.window.Window):
    color = {
        'background': [1]*3 + [1]
    }
    fps_display = pyglet.clock.ClockDisplay()
    bar_thc = 5

    def __init__(self, width, height, arm_info, arm_infob, point_info, point_l, mouse_in):
        super(Viewer, self).__init__(width, height, resizable=False, caption='Arm', vsync=False)  # vsync=False to not use the monitor FPS
        self.set_location(x=80, y=10)
        pyglet.gl.glClearColor(*self.color['background'])

        self.arm_info = arm_info
        self.arm_infob = arm_infob
        self.point_info = point_info
        self.mouse_in = mouse_in
        self.point_l = point_l

        self.center_coord = np.array((min(width, height)/2, ) * 2)
        self.batch = pyglet.graphics.Batch()

        arm1_box, arm2_box, point_box, arm1b_box, arm2b_box = [0]*8, [0]*8, [0]*8, [0]*8, [0]*8
        c1, c2, c3 = (255, 127, 36) * 4, (0, 0, 0) * 4, (0, 0, 0) * 4
        self.point = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', point_box), ('c3B', c2))
        self.arm1 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1_box), ('c3B', c1))
        self.arm2 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2_box), ('c3B', c1))
        self.arm1b = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1b_box), ('c3B', c1))
        self.arm2b = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2b_box), ('c3B', c1))

    def render(self):
        pyglet.clock.tick()
        self._update_arm()
        self.switch_to()
        self.dispatch_events()
        self.dispatch_event('on_draw')
        self.flip()

    def on_draw(self):
        self.clear()
        self.batch.draw()


    def _update_arm(self):
        point_l = self.point_l
        #center_coord = self.center_coord
        point_box = (self.point_info[0] - point_l, self.point_info[1] - point_l,
                     self.point_info[0] + point_l, self.point_info[1] - point_l,
                     self.point_info[0] + point_l, self.point_info[1] + point_l,
                     self.point_info[0] - point_l, self.point_info[1] + point_l)
        self.point.vertices = point_box

        arm1_coord = (*self.center_coord + self.center_coord / 2, *(self.arm_info[0, 2:4]) + self.center_coord / 2)  # (x0, y0, x1, y1)
        arm2_coord = (*(self.arm_info[0, 2:4]) + self.center_coord / 2,
            *(self.arm_info[1, 2:4]) + self.center_coord / 2)  # (x1, y1, x2, y2)


        arm1_thick_rad = np.pi / 2 - self.arm_info[0, 1]
        x01, y01 = arm1_coord[0] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] + np.sin(
            arm1_thick_rad) * self.bar_thc
        x02, y02 = arm1_coord[0] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] - np.sin(
            arm1_thick_rad) * self.bar_thc
        x11, y11 = arm1_coord[2] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] - np.sin(
            arm1_thick_rad) * self.bar_thc
        x12, y12 = arm1_coord[2] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] + np.sin(
            arm1_thick_rad) * self.bar_thc
        arm1_box = (x01, y01, x02, y02, x11, y11, x12, y12)
        arm2_thick_rad = np.pi / 2 - self.arm_info[1, 1]
        x11_, y11_ = arm2_coord[0] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] - np.sin(
            arm2_thick_rad) * self.bar_thc
        x12_, y12_ = arm2_coord[0] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] + np.sin(
            arm2_thick_rad) * self.bar_thc
        x21, y21 = arm2_coord[2] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] + np.sin(
            arm2_thick_rad) * self.bar_thc
        x22, y22 = arm2_coord[2] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] - np.sin(
            arm2_thick_rad) * self.bar_thc
        arm2_box = (x11_, y11_, x12_, y12_, x21, y21, x22, y22)
        self.arm1.vertices = arm1_box
        self.arm2.vertices = arm2_box

        arm1b_coord = (*self.center_coord - self.center_coord/2, *(self.arm_infob[0, 2:4]) - self.center_coord/2)  # (x0, y0, x1, y1)
        arm2b_coord = (
        *(self.arm_infob[0, 2:4]) - self.center_coord/2, *(self.arm_infob[1, 2:4]) - self.center_coord/2)  # (x1, y1, x2, y2)


        arm1b_thick_rad = np.pi / 2 - self.arm_infob[0, 1]
        x01b, y01b = arm1b_coord[0] - np.cos(arm1b_thick_rad) * self.bar_thc, arm1b_coord[1] + np.sin(
            arm1b_thick_rad) * self.bar_thc
        x02b, y02b = arm1b_coord[0] + np.cos(arm1b_thick_rad) * self.bar_thc, arm1b_coord[1] - np.sin(
            arm1b_thick_rad) * self.bar_thc
        x11b, y11b = arm1b_coord[2] + np.cos(arm1b_thick_rad) * self.bar_thc, arm1b_coord[3] - np.sin(
            arm1b_thick_rad) * self.bar_thc
        x12b, y12b = arm1b_coord[2] - np.cos(arm1b_thick_rad) * self.bar_thc, arm1b_coord[3] + np.sin(
            arm1b_thick_rad) * self.bar_thc
        arm1b_box = (x01b, y01b, x02b, y02b, x11b, y11b, x12b, y12b)
        arm2b_thick_rad = np.pi / 2 - self.arm_infob[1, 1]
        x11b_, y11b_ = arm2b_coord[0] + np.cos(arm2b_thick_rad) * self.bar_thc, arm2b_coord[1] - np.sin(
            arm2b_thick_rad) * self.bar_thc
        x12b_, y12b_ = arm2b_coord[0] - np.cos(arm2b_thick_rad) * self.bar_thc, arm2b_coord[1] + np.sin(
            arm2b_thick_rad) * self.bar_thc
        x21b, y21b = arm2b_coord[2] - np.cos(arm2b_thick_rad) * self.bar_thc, arm2b_coord[3] + np.sin(
            arm2b_thick_rad) * self.bar_thc
        x22b, y22b = arm2b_coord[2] + np.cos(arm2b_thick_rad) * self.bar_thc, arm2b_coord[3] - np.sin(
            arm2b_thick_rad) * self.bar_thc
        arm2b_box = (x11b_, y11b_, x12b_, y12b_, x21b, y21b, x22b, y22b)
        self.arm1b.vertices = arm1b_box
        self.arm2b.vertices = arm2b_box

    def on_key_press(self, symbol, modifiers):
        if symbol == pyglet.window.key.UP:
            self.arm_info[0, 1] += .1
            print(self.arm_info[:, 2:4] - self.point_info)
            self.arm_infob[0, 1] += .1
            print(self.arm_infob[:, 2:4] - self.point_info)
        elif symbol == pyglet.window.key.DOWN:
            self.arm_info[0, 1] -= .1
            print(self.arm_info[:, 2:4] - self.point_info)
            self.arm_infob[0, 1] -= .1
            print(self.arm_infob[:, 2:4] - self.point_info)
        elif symbol == pyglet.window.key.LEFT:
            self.arm_info[1, 1] += .1
            print(self.arm_info[:, 2:4] - self.point_info)
            self.arm_infob[1, 1] += .1
            print(self.arm_infob[:, 2:4] - self.point_info)
            self.arm_infob[1, 1] += .1
            print(self.arm_infob[:, 2:4] - self.point_info)
            self.arm_infob[1, 1] += .1
            print(self.arm_infob[:, 2:4] - self.point_info)
        elif symbol == pyglet.window.key.RIGHT:
            self.arm_info[1, 1] -= .1
            print(self.arm_info[:, 2:4] - self.point_info)
            self.arm_infob[1, 1] -= .1
            print(self.arm_infob[:, 2:4] - self.point_info)
        elif symbol == pyglet.window.key.Q:
            pyglet.clock.set_fps_limit(1000)
        elif symbol == pyglet.window.key.A:
            pyglet.clock.set_fps_limit(30)

    def on_mouse_motion(self, x, y, dx, dy):
        self.point_info[:] = [x, y]

    def on_mouse_enter(self, x, y):
        self.mouse_in[0] = True

    def on_mouse_leave(self, x, y):
        self.mouse_in[0] = False

# Learning Algorithm

In [None]:
import tensorflow as tf
import numpy as np
import os
import shutil # copy(Mw)
#from arm_env import ArmEnv



np.random.seed(1)# seed ...sequence of numbers(mw)
tf.set_random_seed(1)

MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4  # learning rate for actor
LR_C = 1e-4  # learning rate for critic
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = False
#LOAD =True
MODE = ['easy', 'hard']
n_model1 = 1
n_model2 = 1

env = ArmEnv(mode=MODE[n_model1])
env = ArmEnv(mode=MODE[n_model2])

STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

STATE_DIM2 = env.state_dim
ACTION_DIM2 = env.action_dim
ACTION_BOUND2 = env.action_bound

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R1'):
    R1 = tf.placeholder(tf.float32, [None, 1], name='r1')

with tf.name_scope('R2'):
    R2 = tf.placeholder(tf.float32, [None, 1], name='r2')

with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')


class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
        self.sess = sess # A session allows to execute graphs
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0
        # It allows us to create our operations and build our computation graph, without needing the data.
        with tf.variable_scope('Actor'):
            # input s, output a
            self.a = self._build_net(S, scope='eval_net', trainable=True)

            # input s_, output a, get a_ for critic
            self.a_ = self._build_net(S_, scope='target_net', trainable=False)





        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')



    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            init_b = tf.constant_initializer(0.001)
            net = tf.layers.dense(s, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1',
                                  trainable=trainable)
            net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            net = tf.layers.dense(net, 10, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          name='a', trainable=trainable)

                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
        return scaled_a

    def learn(self, s):   # batch update
        self.sess.run(self.train_op, feed_dict={S: s})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1


    def choose_action(self, s):
        s = s[np.newaxis, :]  # single state
        return self.sess.run(self.a, feed_dict={S: s})[0]  # single action

    def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

        with tf.variable_scope('A_train'):
            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
            self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))


class Actor2(object):
    def __init__(self, sess2, action_dim, action_bound, learning_rate, t_replace_iter):
        self.sess2 = sess2 # A session allows to execute graphs(mw)
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0
        # It allows us to create our operations and build our computation graph, without needing the data.(mw)
        with tf.variable_scope('Actor2'):
            # input s, output a
            self.a2 = self._build_net2(S, scope='eval_net2', trainable=True)

            # input s_, output a, get a_ for critic
            self.a2_ = self._build_net2(S_, scope='target_net2', trainable=False)





        self.e_params2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor2/eval_net2')
        self.t_params2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor2/target_net2')



    def _build_net2(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            init_b = tf.constant_initializer(0.001)
            net = tf.layers.dense(s, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1',
                                  trainable=trainable)
            net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            net = tf.layers.dense(net, 10, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            with tf.variable_scope('a2'):
                actions2 = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          name='a2', trainable=trainable)

                scaled_a2 = tf.multiply(actions2, self.action_bound, name='scaled_a2')  # Scale output to -action_bound to action_bound

        return scaled_a2

    def learn2(self, s):   # batch update
        self.sess2.run(self.train_op2, feed_dict={S: s})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess2.run([tf.assign(t, e) for t, e in zip(self.t_params2, self.e_params2)])
        self.t_replace_counter += 1


    def choose_action(self, s):
        s = s[np.newaxis, :]    # single state
        return self.sess2.run(self.a2, feed_dict={S: s})[0]  # single action

    def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            self.policy_grads = tf.gradients(ys=self.a2, xs=self.e_params2, grad_ys=a_grads)

        with tf.variable_scope('A_train'):
            opt = tf.train.RMSPropOptimizer(-self.lr)  # (- learning rate) for ascent policy
            self.train_op2 = opt.apply_gradients(zip(self.policy_grads, self.e_params2))






class Critic1(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0

        with tf.variable_scope('Critic'):
            # Input (s, a), output q
            self.a = a
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

            # Input (s_, a_), output q_ for q_target
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)    # target_q is based on a_ from Actor's target_net

            self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
            self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')

        with tf.variable_scope('target_q'):
            self.target_q = R1 + self.gamma * self.q_

        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))

        with tf.variable_scope('C_train'):
            self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            init_b = tf.constant_initializer(0.01)

            with tf.variable_scope('l1'):
                n_l1 = 200
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            net = tf.layers.dense(net, 10, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)
        return q



    def learn(self, s, a, r1, s_):
        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R1: r1, S_: s_})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1


class Critic2(object):
    def __init__(self, sess2, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a2, a2_):
        self.sess2 = sess2
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0

        with tf.variable_scope('Critic'):
            # Input (s, a), output q
            self.a2 = a2
            self.q2 = self._build_net2(S, self.a2, 'eval_net2', trainable=True)

            # Input (s_, a_), output q_ for q_target
            self.q2_ = self._build_net2(S_, a2_, 'target_net2', trainable=False)    # target_q is based on a_ from Actor's target_net

            self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net2')
            self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net2')

        with tf.variable_scope('target_q'):
            self.target_q2 = R2 + self.gamma * self.q2_

        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_q2, self.q2))

        with tf.variable_scope('C_train'):
            self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.q2, a2)[0]   # tensor of gradients of each sample (None, a_dim)

    def _build_net2(self, s, a2, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            init_b = tf.constant_initializer(0.01)

            with tf.variable_scope('l1'):
                n_l1 = 200
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a2, w1_a) + b1)
            net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            net = tf.layers.dense(net, 10, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            with tf.variable_scope('q'):
                q2 = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)

        return q2


    def learn2(self, s, a2, r2, s_):
        self.sess2.run(self.train_op, feed_dict={S: s, self.a2: a2, R2: r2, S_: s_})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess2.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1







class Memory1(object):
    def __init__(self, capacity, dims):
        self.capacity = capacity
        self.data = np.zeros((capacity, dims))
        self.pointer = 0

    def store_transition(self, s, a, r1, s_):
        transition = np.hstack((s, a, [r1], s_))
        index = self.pointer % self.capacity  # replace the old memory with new memory
        self.data[index, :] = transition
        self.pointer += 1

    def sample(self, n):
        assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
        indices = np.random.choice(self.capacity, size=n)
        return self.data[indices, :]





class Memory2(object):
    def __init__(self, capacity, dims):
        self.capacity = capacity
        self.data = np.zeros((capacity, dims))
        self.pointer = 0

    def store_transition2(self, s, a2, r2, s_):
        transition = np.hstack((s, a2, [r2], s_))
        index = self.pointer % self.capacity  # replace the old memory with new memory
        self.data[index, :] = transition
        self.pointer += 1

    def sample2(self, n):
        assert self.pointer >= self.capacity, 'Memory2 has not been fulfilled'
        indices = np.random.choice(self.capacity, size=n)
        return self.data[indices, :]


sess2 = tf.Session()

# Create actor and critic.
actor = Actor(sess2, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
critic = Critic1(sess2, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M1 = Memory1(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)



saver = tf.train.Saver(max_to_keep=100)
path = './'+MODE[1]

if LOAD:
    saver.restore(sess2, tf.train.latest_checkpoint(path))
else:
    sess2.run(tf.global_variables_initializer())

# Create actor and critic.
actor2 = Actor2(sess2, ACTION_DIM2, ACTION_BOUND2[1], LR_A, REPLACE_ITER_A)
critic2 = Critic2(sess2, STATE_DIM2, ACTION_DIM2, LR_C, GAMMA, REPLACE_ITER_C, actor2.a2, actor2.a2_)
actor2.add_grad_to_graph(critic2.a_grads)

M2 = Memory2(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)



saver2 = tf.train.Saver(max_to_keep=100)
path2 = './'+MODE[1]

if LOAD:
    saver2.restore(sess2, tf.train.latest_checkpoint(path2))
else:
    sess2.run(tf.global_variables_initializer())









def train():
    var = 2.  # control exploration

    for ep1 in range(MAX_EPISODES):
        ep2 = ep1
        s1 = env.reset1()
        s2 = env.reset2()
        ep_reward1 = 0
        ep_reward2 = 0


        for t in range(MAX_EP_STEPS):
            # while True:
            if RENDER:
                env.render()



            # Added exploration noise
            a = actor.choose_action(s1)
            a2 = actor2.choose_action(s2)
            a = np.clip(np.random.normal(a, var), *ACTION_BOUND)
            a2 = np.clip(np.random.normal(a2, var), *ACTION_BOUND2)  # add randomness to action selection for exploration
            s1_, r1, done = env.step1(a)
            s2_, r2, done = env.step2(a2)
            M1.store_transition(s1, a, r1, s1_)
            M2.store_transition2(s2, a2, r2, s2_)

            if M1.pointer > MEMORY_CAPACITY:
                var = max([var * .9999, VAR_MIN])  # decay the action randomness
                b_M = M1.sample(BATCH_SIZE)
                b_s = b_M[:, :STATE_DIM2]
                b_a = b_M[:, STATE_DIM2: STATE_DIM2 + ACTION_DIM2]
                b_r1 = b_M[:, -STATE_DIM - 1: -STATE_DIM]
                b_s_ = b_M[:, -STATE_DIM2:]

                critic.learn(b_s, b_a, b_r1, b_s_)
                actor.learn(b_s)




            if M2.pointer > MEMORY_CAPACITY:
                var = max([var * .9999, VAR_MIN])  # decay the action randomness
                b_M = M2.sample2(BATCH_SIZE)
                b_s = b_M[:, :STATE_DIM2]
                b_a = b_M[:, STATE_DIM2: STATE_DIM2 + ACTION_DIM2]
                b_r2 = b_M[:, -STATE_DIM2 - 1: -STATE_DIM2]
                b_s_ = b_M[:, -STATE_DIM2:]

                critic2.learn2(b_s, b_a, b_r2, b_s_)
                actor2.learn2(b_s)

            s1 = s1_
            s2 = s2_
            ep_reward1 += r1
            ep_reward2 += r2

            if t == MAX_EP_STEPS - 1 or done:
                # if done:
                result = '| done' if done else '| ----'
                print('Ep1:', ep1,
                      result,
                      '| R1: %i' % int(ep_reward1),
                      '| Explore: %.2f' % var,
                      )


            if t == MAX_EP_STEPS - 1 or done:
                # if done:
                result = '| done' if done else '| ----'
                print('Ep2:', ep2,
                      result,
                      '| R2: %i' % int(ep_reward2),
                      '| Explore2: %.2f' % var,
                      )
                break

    if os.path.isdir(path): shutil.rmtree(path)
    os.mkdir(path)
    if os.path.isdir(path2): shutil.rmtree(path2)
    os.mkdir(path2)
    ckpt_path = os.path.join('./' + MODE[n_model1], 'Actor Critic.ckpt1')
    ckpt_path2 = os.path.join('./' + MODE[n_model2], 'Actor Critic.ckpt2')
    save_path = saver.save(sess2, ckpt_path, write_meta_graph=False)
    save_path2 = saver2.save(sess2, ckpt_path2, write_meta_graph=False)
    print("\nSave Model1 %s\n" % save_path)
    print("\nSave Model2 %s\n" % save_path2)





def eval():
    for ep1 in range(MAX_EPISODES):
        env.set_fps(100)

        s1 = env.reset1()
        s2 = env.reset2()




        for t in range(MAX_EP_STEPS):

               if RENDER:
                  env.render()
               a1 = actor.choose_action(s1)
               s1_, r1, done = env.step1(a1)
               s1 = s1_

               a2 = actor2.choose_action(s2)
               s2_, r2, done = env.step2(a2)
               s2 = s2_
               print("Program is still running......")




if __name__ == '__main__':
    if LOAD:
        eval()
    else:

        train()



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Ep1: 0 | ---- | R1: -381 | Explore: 2.00
Ep2: 0 | ---- | R2: -144 | Explore2: 2.00
Ep1: 1 | ---- | R1: -360 | Explore: 2.00
Ep2: 1 | ---- | R2: -23 | Explore2: 2.00
Ep1: 2 | ---- | R1: -259 | Explore: 2.00
Ep2: 2 | ---- | R2: -84 | Explore2: 2.00
Ep1: 3 | ---- | R1: -204 | Explore: 2.00
Ep2: 3 | ---- | R2: -89 | Explore2: 2.00
Ep1: 4 | ---- | R1: -191 | Explore: 2.00
Ep2: 4 | ---- | R2: -135 | Explore2: 2.00
Ep1: 5 | ---- | R1: -135 | Explore: 2.00
Ep2: 5 | ---- | R2: -128 | Explore2: 2.00
Ep1: 6 | ---- | R1: -214 | Explore: 2.00
Ep2: 6 | ---- | R2: -127 | Explore2: 2.00
Ep1: 7 | ---- | R1: -71 | Explore: 2.00
Ep2: 7 | ---- |