In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import load_model

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
class RBuffer():
    def __init__(self, maxsize, statedim, naction):
        self.cnt = 0
        self.maxsize = maxsize
        self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
        self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
        self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.done_memory = np.zeros((maxsize,), dtype= np.bool)

    def reset(self):
        self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
        self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
        self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
        self.done_memory = np.zeros((maxsize,), dtype= np.bool)

    def storexp(self, state, next_state, action, done, reward):
        index = self.cnt % self.maxsize
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.next_state_memory[index] = next_state
        self.done_memory[index] = 1- int(done)
        self.cnt += 1

    def sample(self, batch_size):
        max_mem = min(self.cnt, self.maxsize)
        batch = np.random.choice(max_mem, batch_size, replace= False)
        states = self.state_memory[batch]
        next_states = self.next_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        dones = self.done_memory[batch]
        return states, next_states, rewards, actions, dones

In [3]:
from tensorflow.math import logical_not, greater
from tensorflow import gather
class Critic(tf.keras.Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.f1 = tf.keras.layers.Dense(128, activation='relu')
        self.f2 = tf.keras.layers.Dense(128, activation='relu')
        self.v =  tf.keras.layers.Dense(1, activation=None)

    def call(self, inputstate, action):
        t = gather(inputstate, [0], axis=1)
        x1 = gather(inputstate, [1], axis=1) * 100
        x2 = gather(inputstate, [2], axis=1) * 50
        cost1 = x1 - x2 - 100.
        cost2 = x1 - 100. + x2
        cost1 *= tf.cast(greater(cost1, 0.), tf.float32) + 1.
        cost1 *= cost1
        cost2 *= cost2
        cost = (cost1 + cost2) / 2
        #print(t, cost, tf.norm(action, axis=1)**2)
        return self.v(self.f2(self.f1(tf.concat([t, -cost, -tf.reshape(tf.norm(action, axis=1) ** 2, (128, 1))], axis=1))))
    def fit(self, X, Z, Y):
        X = tf.convert_to_tensor(X, dtype= tf.float32)
        Z = tf.convert_to_tensor(Z, dtype= tf.float32)
        Y = tf.convert_to_tensor(Y, dtype= tf.float32)
        a_opt = tf.keras.optimizers.Adam(0.001)
        with tf.GradientTape() as tape:
            action = self(X, Z)
            cost = tf.keras.losses.MSE(Y, action)
            grad = tape.gradient(cost, self.trainable_variables)
            a_opt.apply_gradients(zip(grad, self.trainable_variables))
        return tf.math.reduce_mean(cost)

In [4]:
class Actor(tf.keras.Model):
    def __init__(self, no_action):
        super(Actor, self).__init__()
        initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=0.1)
        self.f1 = tf.keras.layers.Dense(40, kernel_initializer=initializer, activation='relu')
        self.f2 = tf.keras.layers.Dense(40, kernel_initializer=initializer, activation='relu')
        self.mu =  tf.keras.layers.Dense(no_action, activation=None)

    def call(self, state):
        x = self.f1(state)
        x = self.f2(x)
        x = self.mu(x)
        return x
    def fit(self, X, Y):
        X = tf.convert_to_tensor(X, dtype= tf.float32)
        Y = tf.convert_to_tensor(Y, dtype= tf.float32)
        a_opt = tf.keras.optimizers.Adam(0.001)
        with tf.GradientTape() as tape:
            action = self(X)
            cost = tf.keras.losses.MSE(Y, action)
            grad = tape.gradient(cost, self.trainable_variables)
            a_opt.apply_gradients(zip(grad, self.trainable_variables))
        return tf.math.reduce_mean(cost)

In [5]:
class Agent():
    def __init__(self, n_action= 2):
        self.actor_main = Actor(n_action)
        self.actor_target = Actor(n_action)
        self.critic_main = Critic()
        self.critic_main2 = Critic()
        self.critic_target = Critic()
        self.critic_target2 = Critic()
        self.batch_size = 128
        self.n_actions = 2
        self.a_opt = tf.keras.optimizers.Adam(0.001)
        # self.actor_target = tf.keras.optimizers.Adam(.001)
        self.c_opt1 = tf.keras.optimizers.Adam(0.002)
        self.c_opt2 = tf.keras.optimizers.Adam(0.002)
        # self.critic_target = tf.keras.optimizers.Adam(.002)
        self.memory = RBuffer(100000, [3], n_action)
        self.trainstep = 0
        #self.replace = 5
        self.gamma = 0.99
        self.min_action = -10
        self.max_action = 10
        self.actor_update_steps = 2
        self.warmup = 200
        self.actor_target.compile(optimizer=self.a_opt)
        self.critic_target.compile(optimizer=self.c_opt1)
        self.critic_target2.compile(optimizer=self.c_opt2)
        self.tau = 0.005

    def savexp(self,state, next_state, action, done, reward):
        self.memory.storexp(state, next_state, action, done, reward)

    def update_target(self, tau=None):

        if tau is None:
            tau = self.tau

        weights1 = []
        targets1 = self.actor_target.weights
        for i, weight in enumerate(self.actor_main.weights):
            weights1.append(weight * tau + targets1[i]*(1-tau))
        self.actor_target.set_weights(weights1)

        weights2 = []
        targets2 = self.critic_target.weights
        for i, weight in enumerate(self.critic_main.weights):
            weights2.append(weight * tau + targets2[i]*(1-tau))
        self.critic_target.set_weights(weights2)


        weights3 = []
        targets3 = self.critic_target2.weights
        for i, weight in enumerate(self.critic_main2.weights):
            weights3.append(weight * tau + targets3[i]*(1-tau))
        self.critic_target2.set_weights(weights3)

  
    def train(self):
        if self.memory.cnt < self.batch_size:
            return
        states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(states, dtype= tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype= tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
        actions = tf.convert_to_tensor(actions, dtype= tf.float32)
        #dones = tf.convert_to_tensor(dones, dtype= tf.bool)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:

            target_actions = self.actor_target(next_states)
            target_actions += tf.clip_by_value(tf.random.normal(shape=[*np.shape(target_actions)], mean=0.0, stddev=0.2), -0.5, 0.5)
            target_actions = self.max_action * (tf.clip_by_value(target_actions, self.min_action, self.max_action))


            target_next_state_values = tf.squeeze(self.critic_target(next_states, target_actions), 1)
            target_next_state_values2 = tf.squeeze(self.critic_target2(next_states, target_actions), 1)

            critic_value = tf.squeeze(self.critic_main(states, actions), 1)
            critic_value2 = tf.squeeze(self.critic_main2(states, actions), 1)

            next_state_target_value = tf.math.minimum(target_next_state_values, target_next_state_values2)

            target_values = rewards + self.gamma * next_state_target_value * dones
            critic_loss1 = tf.keras.losses.MSE(target_values, critic_value)
            critic_loss2 = tf.keras.losses.MSE(target_values, critic_value2)
        grads1 = tape1.gradient(critic_loss1, self.critic_main.trainable_variables)
        grads2 = tape2.gradient(critic_loss2, self.critic_main2.trainable_variables)
        self.c_opt1.apply_gradients(zip(grads1, self.critic_main.trainable_variables))
        self.c_opt2.apply_gradients(zip(grads2, self.critic_main2.trainable_variables))
        self.trainstep +=1
        if self.trainstep % self.actor_update_steps == 0:
            with tf.GradientTape() as tape3:
                new_policy_actions = self.actor_main(states)
                actor_loss = -self.critic_main(states, new_policy_actions)
                actor_loss = tf.math.reduce_mean(actor_loss)
            grads3 = tape3.gradient(actor_loss, self.actor_main.trainable_variables)
            self.a_opt.apply_gradients(zip(grads3, self.actor_main.trainable_variables))

        #if self.trainstep % self.replace == 0:
        self.update_target()

    def act(self, state, evaluate=False):
        if self.trainstep > self.warmup:
            evaluate = True
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor_main(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions[0]

In [6]:
def main(seconde, position):
    return np.array([-(position[0]-10.*seconde),-(position[1]-(20.*seconde-2*seconde** 2))])

In [None]:
from Angrybird import AngryBird
scale = np.array([10., 100., 50.])
with tf.device('GPU:0'):
    tf.random.set_seed(336699)
    agent = Agent(2)
    env = AngryBird()
    episods = 20000
    ep_reward = []
    total_avgr = []
    target = False

    for s in range(episods):
        if target == True:
            break
        total_reward = 0
        state = env.reset()
        done = False
        ## model of wing
        cost = 0

        while not done:
            hand_made_state = state / scale
            if state[0] == 11.:
                action = np.zeros(2)
            else:
                action = agent.act(hand_made_state)
            action_local = action + main(state[0], state[1:])
            next_state, reward, done, _ = env.step(action_local)
            hand_made_next_state = next_state / scale
            reward *= -1
            agent.savexp(hand_made_state, hand_made_next_state, action, done, reward)
            agent.train()
            state = next_state
            total_reward += reward
        if done and s % 50 == 0:
            ep_reward.append(total_reward)
            avg_reward = np.mean(ep_reward[-100:])
            total_avgr.append(avg_reward)
            print("total reward after {} steps is {} and avg reward is {}".format(s, total_reward, avg_reward))
            if int(avg_reward) < 0.1 and False:
                target = True
        if (s+1)%5000 == 0:
            print("Saving model...")
            agent.actor_main.save_weights("td3_actor_{}".format(s+1))

total reward after 0 steps is -203.66074148644202 and avg reward is -203.66074148644202
total reward after 50 steps is -23.204048400815886 and avg reward is -113.43239494362895
total reward after 100 steps is -69.05250564844638 and avg reward is -98.63909851190142
total reward after 150 steps is -123.7394603692791 and avg reward is -104.91418897624584
total reward after 200 steps is -12.602475882822269 and avg reward is -86.45184635756112
total reward after 250 steps is -664.8414169278736 and avg reward is -182.85010811927987
total reward after 300 steps is -293.2870137029711 and avg reward is -198.62680891695004
total reward after 350 steps is -9.241229053163902 and avg reward is -174.9536114339768
total reward after 400 steps is -495.81024413466344 and avg reward is -210.60434840071974
total reward after 450 steps is -402.58453241802886 and avg reward is -229.80236680245065
total reward after 500 steps is -218.5981947520105 and avg reward is -228.7838057069561
total reward after 550 

total reward after 4600 steps is -408.2278854472787 and avg reward is -474.9947270984542
total reward after 4650 steps is -394.31233958872053 and avg reward is -474.1364038270741
total reward after 4700 steps is -597.3105428944348 and avg reward is -475.43297371199367
total reward after 4750 steps is -462.9240003243933 and avg reward is -475.3026719058727
total reward after 4800 steps is -395.4949614313009 and avg reward is -474.479912004073
total reward after 4850 steps is -413.44806641192133 and avg reward is -473.85713806945915
total reward after 4900 steps is -377.5447645233337 and avg reward is -472.88428581141756
total reward after 4950 steps is -396.44015264605855 and avg reward is -472.1198444797639
Saving model...
total reward after 5000 steps is -885.8868801620788 and avg reward is -478.94210586652036
total reward after 5050 steps is -434.1843310242894 and avg reward is -483.0519086927551
total reward after 5100 steps is -385.64313242843946 and avg reward is -486.217814960555

total reward after 9200 steps is -431.5834592896543 and avg reward is -528.0551162645628
total reward after 9250 steps is -645.3385737385149 and avg reward is -529.0765464128604
total reward after 9300 steps is -596.1831702975992 and avg reward is -520.2404770871959
total reward after 9350 steps is -670.878249703464 and avg reward is -520.2399172176503
total reward after 9400 steps is -650.4115417929681 and avg reward is -522.8054058975871
total reward after 9450 steps is -427.01434180566037 and avg reward is -523.2319524741297
total reward after 9500 steps is -711.2452546022748 and avg reward is -525.8772338318109
total reward after 9550 steps is -565.7043041895495 and avg reward is -527.1824223150962
total reward after 9600 steps is -446.73057604272014 and avg reward is -527.5674492210505
total reward after 9650 steps is -416.68447849560727 and avg reward is -527.7911706101195
total reward after 9700 steps is -750.8223092759313 and avg reward is -529.3262882739343
total reward after 

total reward after 13750 steps is -546.7299983494588 and avg reward is -583.6313684830661
total reward after 13800 steps is -386.9342700884399 and avg reward is -582.5511950682906
total reward after 13850 steps is -405.1713750913928 and avg reward is -576.3743101745594
total reward after 13900 steps is -710.1465664504722 and avg reward is -579.7144839537417
total reward after 13950 steps is -453.0201528420688 and avg reward is -575.8807534103646
total reward after 14000 steps is -684.321311634692 and avg reward is -578.6333305735022
total reward after 14050 steps is -636.9625040944502 and avg reward is -577.4500832824626
total reward after 14100 steps is -446.9323959723551 and avg reward is -577.6754791420299
total reward after 14150 steps is -383.89087844263236 and avg reward is -574.5881169087254
total reward after 14200 steps is -662.8386285337685 and avg reward is -576.9006686011667
total reward after 14250 steps is -409.65648811915906 and avg reward is -574.543847744973
total rewa

In [None]:
ep = [i  for i in range(len(total_avgr))]
plt.plot( range(len(total_avgr)),total_avgr,'b')
plt.title("Avg Test Aeward Vs Test Episods")
plt.xlabel("Test Episods")

plt.ylabel("Average Test Reward")
plt.grid(True)
plt.show()

In [None]:
state = env.reset()
for i in range(10):
    hand_made_state = state / scale 
    action = agent.act(hand_made_state)
    next_state, reward, done, _ = env.step(action)
    print(action, reward)
    state = next_state
traj = np.array(env.trajectoire)
plt.plot(traj[:, 1], traj[:, 2])#, label="{}".format())
plt.scatter(traj[-1][1], traj[-1][2])