In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import load_model

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
def main(seconde, position):
    """Return the negated offset of ``position`` from a time-dependent reference point.

    The reference point at time ``seconde`` is
    ``(10 * seconde, 20 * seconde - 2 * seconde ** 2)``.

    Args:
        seconde: time value used to evaluate the reference point.
        position: indexable with at least two entries (x at index 0, y at index 1).

    Returns:
        A 2-element ``np.ndarray`` holding ``-(position - reference)`` per axis.
    """
    reference_x = 10. * seconde
    reference_y = 20. * seconde - 2 * seconde ** 2
    offset_x = position[0] - reference_x
    offset_y = position[1] - reference_y
    return np.array([-offset_x, -offset_y])

In [2]:
class RBuffer():
    """Fixed-capacity circular replay buffer for off-policy training.

    Transitions are written into pre-allocated NumPy arrays; once ``maxsize``
    transitions have been stored, the oldest entries are overwritten.
    """

    def __init__(self, maxsize, statedim, naction):
        """Allocate storage.

        Args:
            maxsize: maximum number of transitions kept.
            statedim: iterable giving the shape of a single state (e.g. ``[3]``).
            naction: number of components of an action vector.
        """
        self.cnt = 0  # total number of transitions ever stored (not capped)
        self.maxsize = maxsize
        self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
        self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
        self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        # ``np.bool`` was only an alias of the builtin ``bool``; it was deprecated
        # in NumPy 1.20 and removed in 1.24. ``np.bool_`` is the same dtype and
        # exists in every NumPy version.
        # NOTE: despite its name this array holds the *not-done* mask
        # (True while the episode continues), see ``storexp``.
        self.done_memory = np.zeros((maxsize,), dtype=np.bool_)

    def storexp(self, state, next_state, action, done, reward):
        """Store one transition, overwriting the oldest slot when the buffer is full.

        ``done`` is stored inverted (``1 - int(done)``), so the saved flag is
        True for non-terminal transitions and False for terminal ones.
        """
        index = self.cnt % self.maxsize
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.next_state_memory[index] = next_state
        self.done_memory[index] = 1 - int(done)
        self.cnt += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, next_states, rewards, actions, dones)`` where
            ``dones`` is the boolean not-done mask described in ``storexp``.

        Raises:
            ValueError: from ``np.random.choice`` when ``batch_size`` exceeds
                the number of stored transitions (sampling is without
                replacement).
        """
        max_mem = min(self.cnt, self.maxsize)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        next_states = self.next_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        dones = self.done_memory[batch]
        return states, next_states, rewards, actions, dones

In [3]:
class Critic(tf.keras.Model):
    """State-action value network.

    Two 128-unit ReLU dense layers followed by a single linear output unit.
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.f1 = dense(128, activation='relu')
        self.f2 = dense(128, activation='relu')
        self.v = dense(1, activation=None)

    def call(self, inputstate, action):
        """Return the network output for ``inputstate`` and ``action`` concatenated along axis 1."""
        joint_input = tf.concat([inputstate, action], axis=1)
        hidden = self.f2(self.f1(joint_input))
        return self.v(hidden)

In [4]:
class Actor(tf.keras.Model):
    """Deterministic policy network.

    Two 40-unit ReLU dense layers whose kernels are drawn from a truncated
    normal (mean 0, stddev 0.2), followed by a linear layer with ``no_action``
    outputs. The output is not bounded by the network itself.
    """

    def __init__(self, no_action):
        super().__init__()
        initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=0.2)
        # Both hidden layers share the same initializer object and activation.
        hidden_options = dict(kernel_initializer=initializer, activation='relu')
        self.f1 = tf.keras.layers.Dense(40, **hidden_options)
        self.f2 = tf.keras.layers.Dense(40, **hidden_options)
        self.mu = tf.keras.layers.Dense(no_action, activation=None)

    def call(self, state):
        """Map a batch of states to a batch of raw (unclipped) actions."""
        return self.mu(self.f2(self.f1(state)))

In [5]:
class Agent():
    """Twin-critic, delayed-actor (TD3-style) agent built on ``Actor`` and ``Critic``.

    Holds a main and a target copy of one actor and two critics, a replay
    buffer, and one optimizer per main network. Nothing in ``__init__`` copies
    the main weights into the targets, so every target network starts from its
    own random initialisation and only moves towards its main network through
    the soft updates performed by ``update_target``.
    """

    def __init__(self, n_action= 2):
        """Create networks, optimizers, replay buffer and hyper-parameters.

        Args:
            n_action: size of the action vector produced by the actor and
                stored in the replay buffer.
        """
        self.actor_main = Actor(n_action)
        self.actor_target = Actor(n_action)
        self.critic_main = Critic()
        self.critic_main2 = Critic()
        self.critic_target = Critic()
        self.critic_target2 = Critic()
        self.batch_size = 128
        # Was hard-coded to 2, which made the exploration noise in ``act``
        # disagree with the actor output for any other ``n_action``.
        self.n_actions = n_action
        self.a_opt = tf.keras.optimizers.Adam(0.001)
        self.c_opt1 = tf.keras.optimizers.Adam(0.002)
        self.c_opt2 = tf.keras.optimizers.Adam(0.002)
        # State shape is fixed to 3 components here.
        self.memory = RBuffer(100000, [3], n_action)
        self.trainstep = 0  # number of critic updates performed so far
        self.gamma = 0.99  # discount factor
        self.min_action = -100  # lower clip bound applied to every action component
        self.max_action = 100  # upper clip bound applied to every action component
        self.actor_update_steps = 20  # the actor is updated once every this many critic updates
        self.warmup = 200  # exploration noise is disabled once trainstep exceeds this
        self.actor_target.compile(optimizer=self.a_opt)
        self.critic_target.compile(optimizer=self.c_opt1)
        self.critic_target2.compile(optimizer=self.c_opt2)
        self.tau = 0.005  # default soft-update mixing coefficient

    def savexp(self,state, next_state, action, done, reward):
        """Store one transition in the replay buffer."""
        self.memory.storexp(state, next_state, action, done, reward)

    @staticmethod
    def _soft_update(main_net, target_net, tau):
        """Set ``target_net`` weights to ``tau * main + (1 - tau) * target``."""
        main_weights = main_net.weights
        target_weights = target_net.weights
        if len(main_weights) != len(target_weights):
            raise ValueError(
                "main and target networks expose a different number of weights; "
                "both must have been called at least once before a soft update")
        blended = [w_main * tau + w_target * (1 - tau)
                   for w_main, w_target in zip(main_weights, target_weights)]
        target_net.set_weights(blended)

    def update_target(self, tau=None):
        """Soft-update the three target networks towards their main networks.

        Args:
            tau: mixing coefficient; ``self.tau`` is used when ``None``.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.actor_main, self.actor_target, tau)
        self._soft_update(self.critic_main, self.critic_target, tau)
        self._soft_update(self.critic_main2, self.critic_target2, tau)

    def train(self):
        """Run one training step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Otherwise it updates both critics, updates the actor every
        ``actor_update_steps`` calls, and soft-updates all target networks.
        """
        if self.memory.cnt < self.batch_size:
            return

        states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(states, dtype= tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype= tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
        actions = tf.convert_to_tensor(actions, dtype= tf.float32)
        # The buffer returns a boolean not-done mask; cast it explicitly so the
        # arithmetic below does not rely on implicit bool -> float conversion.
        not_done = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))

        # The bootstrap target only involves the target networks, so it is
        # computed outside the gradient tapes (no gradient flows through it).
        target_actions = self.actor_target(next_states)
        smoothing_noise = tf.random.normal(shape=tf.shape(target_actions), mean=0.0, stddev=0.2)
        target_actions += tf.clip_by_value(smoothing_noise, -0.5, 0.5)
        target_actions = tf.clip_by_value(target_actions, self.min_action, self.max_action)

        target_next_state_values = tf.squeeze(self.critic_target(next_states, target_actions), 1)
        target_next_state_values2 = tf.squeeze(self.critic_target2(next_states, target_actions), 1)

        # NOTE(review): the element-wise minimum is the usual pessimistic choice
        # when the signal is a reward to maximise. The actor update below
        # *minimises* the critic output, which suggests the signal is a cost;
        # in that case a maximum would be the pessimistic choice - confirm.
        next_state_target_value = tf.math.minimum(target_next_state_values, target_next_state_values2)

        target_values = rewards + self.gamma * next_state_target_value * not_done

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            critic_value = tf.squeeze(self.critic_main(states, actions), 1)
            critic_value2 = tf.squeeze(self.critic_main2(states, actions), 1)
            critic_loss1 = tf.keras.losses.MSE(target_values, critic_value)
            critic_loss2 = tf.keras.losses.MSE(target_values, critic_value2)

        grads1 = tape1.gradient(critic_loss1, self.critic_main.trainable_variables)
        grads2 = tape2.gradient(critic_loss2, self.critic_main2.trainable_variables)

        self.c_opt1.apply_gradients(zip(grads1, self.critic_main.trainable_variables))
        self.c_opt2.apply_gradients(zip(grads2, self.critic_main2.trainable_variables))

        self.trainstep +=1

        if self.trainstep % self.actor_update_steps == 0:

            with tf.GradientTape() as tape3:
                new_policy_actions = self.actor_main(states)
                # The loss is the mean critic output itself (no sign flip), so
                # gradient descent drives the actor towards actions with a
                # *lower* critic value.
                # NOTE(review): presumably intentional because the environment
                # signal is a cost - verify against the environment.
                actor_loss = self.critic_main(states, new_policy_actions)
                actor_loss = tf.math.reduce_mean(actor_loss)

            grads3 = tape3.gradient(actor_loss, self.actor_main.trainable_variables)
            self.a_opt.apply_gradients(zip(grads3, self.actor_main.trainable_variables))

        # Targets are soft-updated on every training step.
        self.update_target()

    def act(self, state, evaluate=False):
        """Return the clipped action of the main actor for a single state.

        Gaussian noise (stddev 0.1) is added unless ``evaluate`` is true.
        ``evaluate`` is forced to true once ``trainstep`` exceeds ``warmup``,
        so exploration noise is only applied during the warm-up phase.

        Returns:
            A 1-D tensor with ``n_actions`` components.
        """
        if self.trainstep > self.warmup:
            evaluate = True
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor_main(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions[0]

In [6]:
# Training cell: runs the agent on the project-local AngryBird environment.
from Angrybird import AngryBird
# Per-component divisor applied to the raw 3-component state before it is fed
# to the agent or stored in the replay buffer.
scale = np.array([10., 100., 50.])
with tf.device('GPU:0'):
    tf.random.set_seed(336699)
    agent = Agent(2)
    env = AngryBird()
    episods = 100000
    ep_reward = []   # episode totals, appended only for every 50th episode
    total_avgr = []  # running mean of the last 100 entries of ep_reward
    target = False   # set once the stopping criterion below is met

    for s in range(episods):
        if target == True:
            break
        total_reward = 0
        state = env.reset()
        done = False
        # `cost` is assigned but never read in this cell.
        cost = 0

        while not done:
            hand_made_state = state / scale
            # NOTE(review): when the first state component equals exactly 11 a
            # zero action is used instead of the policy; the meaning of this
            # value is not visible here - confirm against Angrybird.py.
            if state[0] == 11.:
                action = np.zeros(2)
            else:
                action = agent.act(hand_made_state)
            next_state, reward, done, _ = env.step(action)
            hand_made_next_state = next_state / scale
            agent.savexp(hand_made_state, hand_made_next_state, action, done, reward)
            agent.train()
            #print(state, reward, done)
            state = next_state
            total_reward += reward
        # Only every 50th episode is recorded, so the "average" below spans the
        # last 100 *recorded* episodes; `s` in the message is the episode index.
        if done and s % 50 == 0:
            ep_reward.append(total_reward)
            avg_reward = np.mean(ep_reward[-100:])
            total_avgr.append(avg_reward)
            print("total reward after {} steps is {} and avg reward is {}".format(s, total_reward, avg_reward))
            # Training stops once the average drops BELOW 50.
            # NOTE(review): this implies the "reward" is treated as a cost - confirm.
            if int(avg_reward) < 50:
                target = True
        # Checkpoint the main actor's weights every 1000 episodes.
        if (s + 1) % 1000 == 0:
            agent.actor_main.save_weights("td3_actor_{}".format(s+1))

total reward after 0 steps is 1796.354822134577 and avg reward is 1796.354822134577
total reward after 50 steps is 86.17583132122475 and avg reward is 941.2653267279009
total reward after 100 steps is 1300.8097986948412 and avg reward is 1061.1134840502143
total reward after 150 steps is 7494.521086758426 and avg reward is 2669.465384727267
total reward after 200 steps is 4360.491921577725 and avg reward is 3007.6706920973584
total reward after 250 steps is 7448.684128763389 and avg reward is 3747.839598208364
total reward after 300 steps is 350.0106812027828 and avg reward is 3262.435467207567
total reward after 350 steps is 443.94151622776707 and avg reward is 2910.123723335091
total reward after 400 steps is 3347.249207657332 and avg reward is 2958.693221593118
total reward after 450 steps is 765.4834687217527 and avg reward is 2739.372246305981
total reward after 500 steps is 124.26823440229357 and avg reward is 2501.6355179511006
total reward after 550 steps is 11590.894980093151 

total reward after 4700 steps is 1211.9116347355084 and avg reward is 2703.8376052212275
total reward after 4750 steps is 2213.7237265928757 and avg reward is 2698.7322523188486
total reward after 4800 steps is 1191.2564630137158 and avg reward is 2683.191264800239
total reward after 4850 steps is 2739.3192945461624 and avg reward is 2683.763999797646
total reward after 4900 steps is 1165.1966862994589 and avg reward is 2668.4249360249373
total reward after 4950 steps is 1299.8142951546363 and avg reward is 2654.7388296162344
total reward after 5000 steps is 6155.963556808764 and avg reward is 2698.334916962976
total reward after 5050 steps is 1027.4779563766824 and avg reward is 2707.747938213531
total reward after 5100 steps is 3481.6608991499334 and avg reward is 2729.556449218082
total reward after 5150 steps is 1067.8293680417248 and avg reward is 2665.2895320309144
total reward after 5200 steps is 1986.242853607795 and avg reward is 2641.5470413512153
total reward after 5250 step

total reward after 9400 steps is 2200.6974273159444 and avg reward is 1706.2514888608512
total reward after 9450 steps is 735.0021912503853 and avg reward is 1693.0588620169656
total reward after 9500 steps is 7856.854493986842 and avg reward is 1722.1506421117197
total reward after 9550 steps is 124.51764106923102 and avg reward is 1700.6113741104919
total reward after 9600 steps is 5172.268690230819 and avg reward is 1741.3753827760522
total reward after 9650 steps is 953.3635828511551 and avg reward is 1720.4359207368
total reward after 9700 steps is 1266.3674339562633 and avg reward is 1720.9804787290077
total reward after 9750 steps is 13735.082811229997 and avg reward is 1836.194069575379
total reward after 9800 steps is 5568.083932473171 and avg reward is 1879.9623442699738
total reward after 9850 steps is 488.67646642683803 and avg reward is 1857.4559159887804
total reward after 9900 steps is 1618.3542268850397 and avg reward is 1861.9874913946362
total reward after 9950 steps 

total reward after 14050 steps is 104.0277440850333 and avg reward is 1860.1434643501607
total reward after 14100 steps is 412.6979851187088 and avg reward is 1823.3455386559012
total reward after 14150 steps is 712.7898372745296 and avg reward is 1828.3507696605216
total reward after 14200 steps is 1076.9958978848356 and avg reward is 1781.2771817365926
total reward after 14250 steps is 363.2687235777553 and avg reward is 1759.7957789093118
total reward after 14300 steps is 304.10880061754267 and avg reward is 1761.5655548657794
total reward after 14350 steps is 1861.495015299723 and avg reward is 1778.39346290644
total reward after 14400 steps is 237.80602383969966 and avg reward is 1758.7645488716773
total reward after 14450 steps is 253.1556111062878 and avg reward is 1753.9460830702365
total reward after 14500 steps is 135.6901184104894 and avg reward is 1676.7344393144729
total reward after 14550 steps is 99.7665894793129 and avg reward is 1676.4869287985734
total reward after 14

total reward after 18700 steps is 7786.684871683115 and avg reward is 776.432663705787
total reward after 18750 steps is 86.7123197704136 and avg reward is 773.2717691883845
total reward after 18800 steps is 488.5170480263465 and avg reward is 772.9112894489328
total reward after 18850 steps is 1072.6023055283185 and avg reward is 775.8692180592852
total reward after 18900 steps is 552.509827060284 and avg reward is 775.9101029028566
total reward after 18950 steps is 183.57912706904594 and avg reward is 775.0752135968502
total reward after 19000 steps is 483.43483641771695 and avg reward is 771.2584731578557
total reward after 19050 steps is 1242.3934699290346 and avg reward is 782.6421304162958
total reward after 19100 steps is 226.54801489817544 and avg reward is 780.7806307140904
total reward after 19150 steps is 2341.685183119394 and avg reward is 797.0695841725391
total reward after 19200 steps is 521.1907994310666 and avg reward is 791.5115331880015
total reward after 19250 steps

total reward after 23350 steps is 182.22402267639364 and avg reward is 1001.2125510248624
total reward after 23400 steps is 147.27194672598455 and avg reward is 995.0320936977076
total reward after 23450 steps is 253.00451993810003 and avg reward is 996.5816226133503
total reward after 23500 steps is 357.0031778603422 and avg reward is 997.9645121571526
total reward after 23550 steps is 719.4733968206156 and avg reward is 988.4766876262679
total reward after 23600 steps is 279.51352641797894 and avg reward is 988.9361283700683
total reward after 23650 steps is 632.0539438864475 and avg reward is 991.8725056916977
total reward after 23700 steps is 221.84412776100675 and avg reward is 916.2240982524766
total reward after 23750 steps is 293.4848699780142 and avg reward is 918.2918237545526
total reward after 23800 steps is 321.4895066989935 and avg reward is 916.6215483412791
total reward after 23850 steps is 413.2246254162346 and avg reward is 910.0277715401583
total reward after 23900 s

total reward after 28000 steps is 1116.3981614835707 and avg reward is 492.6791362021729
total reward after 28050 steps is 148.32799128330186 and avg reward is 492.3292376621425
total reward after 28100 steps is 628.3601126052445 and avg reward is 495.69860657699877
total reward after 28150 steps is 144.1622664684406 and avg reward is 496.39073837028985
total reward after 28200 steps is 496.7734541882068 and avg reward is 498.48518836838673
total reward after 28250 steps is 807.9552219256457 and avg reward is 502.3318838817934
total reward after 28300 steps is 332.611923907606 and avg reward is 495.85305708277883
total reward after 28350 steps is 814.3501279968007 and avg reward is 502.1743181359829
total reward after 28400 steps is 473.1234838239684 and avg reward is 505.4328335069627
total reward after 28450 steps is 779.1196480618778 and avg reward is 510.69398478820045
total reward after 28500 steps is 478.2753822037463 and avg reward is 511.9067068316345
total reward after 28550 s

total reward after 32650 steps is 704.4635218069856 and avg reward is 409.09505361557336
total reward after 32700 steps is 663.768108296684 and avg reward is 413.61585680153695
total reward after 32750 steps is 158.07840945874995 and avg reward is 410.9282654347775
total reward after 32800 steps is 1016.2753871379728 and avg reward is 414.6755696815478
total reward after 32850 steps is 169.44662990671583 and avg reward is 413.2271640266868
total reward after 32900 steps is 490.0625604441802 and avg reward is 412.72329792914445
total reward after 32950 steps is 210.72244677196767 and avg reward is 411.1307403665669
total reward after 33000 steps is 210.80049290874604 and avg reward is 402.07476368081865
total reward after 33050 steps is 142.6478173521697 and avg reward is 402.0179619415074
total reward after 33100 steps is 970.4740473961638 and avg reward is 405.43910128941656
total reward after 33150 steps is 1071.9608000785042 and avg reward is 414.7170866255172
total reward after 332

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/pgi/anaconda3/envs/map556/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-ca5c109f76fa>", line 27, in <module>
    next_state, reward, done, _ = env.step(action)
  File "/home/pgi/map556/challenge2/peng-wei/Angrybird.py", line 65, in step
    self.X[1:] = self.dynamique_pos(action)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/pgi/anaconda3/envs/map556/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2045, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/pgi/anaconda3/envs/map556/lib/python3.8/site-packages/IPytho

TypeError: object of type 'NoneType' has no len()

In [11]:
# Placeholder loop over the main actor's weights: the body is a no-op and the
# print that would dump each weight is commented out.
for weight in agent.actor_main.weights:
    pass
    #print(tf.Variable(weight.numpy()))

In [None]:
# Plot the running average stored in total_avgr. The x axis is the index of
# each logged entry, not the raw episode number.
ep = list(range(len(total_avgr)))
plt.plot(ep, total_avgr, 'b')
# Title and x-label typos fixed ("Aeward" -> "Reward", "Episods" -> "Episodes").
plt.title("Avg Test Reward Vs Test Episodes")
plt.xlabel("Test Episodes")

plt.ylabel("Average Test Reward")
plt.grid(True)
plt.show()

In [None]:
# Roll the current policy out for a fixed 10 steps from a fresh reset and plot
# the trajectory recorded by the environment.
# NOTE(review): the loop does not check `done`, and the zero-action special
# case used in the training cell is not applied here - confirm this is intended.
state = env.reset()
for i in range(10):
    hand_made_state = state / scale 
    action = agent.act(hand_made_state)
    next_state, reward, done, _ = env.step(action)
    print(action, reward)
    state = next_state
# Columns 1 and 2 of env.trajectoire are plotted against each other;
# presumably they are the x/y position - verify against Angrybird.py.
traj = np.array(env.trajectoire)
plt.plot(traj[:, 1], traj[:, 2])  # no label is set for this line
# Mark the final recorded point of the trajectory.
plt.scatter(traj[-1][1], traj[-1][2])