In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import gym
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt

In [28]:
class GymEnvironment:
    """Wraps a Gym environment and drives PPO rollouts for training and evaluation.

    The environment is expected to follow the classic Gym API used in this file:
    ``reset()`` returns an observation and ``step(action)`` returns the 4-tuple
    ``(next_state, reward, done, info)``.
    """

    def __init__(self, env_id, monitor_dir, max_timesteps=400):
        """Create the environment.

        Args:
            env_id: Gym environment id passed to ``gym.make``.
            monitor_dir: Directory name for results. It is only stored on the
                instance; no monitor wrapper is attached here.
            max_timesteps: Upper bound on the number of steps per rollout
                (default 400; use a small value such as 10 for quick tests).
        """
        self.max_timesteps = max_timesteps
        self.monitor_dir = monitor_dir

        self.env = gym.make(env_id)

    def trainPPO(self, agent, no_episodes):
        """Run ``no_episodes`` training iterations and return the per-rollout rewards."""
        return self.runPPO(agent, no_episodes, training=True)

    def runPPO(self, agent, no_episodes, training=False):
        """Collect rollouts with ``agent`` and, when training, update its networks.

        Each episode collects one rollout per actor (``agent.actors``). A rollout
        ends when the environment reports ``done`` or after ``max_timesteps``
        steps, in both training and evaluation mode.

        Args:
            agent: Object providing ``actors``, ``state_size``, ``select_action``
                and, for training, ``critic``, ``calc_advantage``,
                ``update_policy_parameters`` and ``update_value_parameters``.
            no_episodes: Number of episodes (update iterations) to run.
            training: If True, store transitions and update the agent after
                every episode; if False, only play and record rewards.

        Returns:
            List with the total reward of every rollout, in collection order.
        """
        obs_dim = self.env.observation_space.shape[0]
        buffer_size = agent.actors * self.max_timesteps

        rew = []
        for episode in range(no_episodes):
            states = np.zeros((buffer_size, agent.state_size), dtype=np.float32)
            actions = np.zeros(buffer_size, dtype=np.int32)
            logprobs = np.zeros(buffer_size, dtype=np.float32)
            advantage_chunks = []
            return_chunks = []
            storage_counter = 0

            for n in range(agent.actors):
                state = self.env.reset().reshape(1, obs_dim)
                tot_rew = 0
                values = np.zeros(self.max_timesteps)
                rewards = np.zeros(self.max_timesteps + 1)

                for t in range(self.max_timesteps):
                    logit, action = agent.select_action(state)
                    action_index = action.numpy()[0]
                    next_state, reward, done, _ = self.env.step(action_index)
                    next_state = next_state.reshape(1, obs_dim)
                    tot_rew += reward

                    if training:
                        # Store everything needed later for the advantage
                        # estimate and the clipped policy update. Scalars are
                        # extracted explicitly instead of relying on implicit
                        # size-1 array -> scalar conversion.
                        states[storage_counter] = state
                        actions[storage_counter] = action_index
                        rewards[t] = reward
                        values[t] = float(agent.critic(state)[0, 0])
                        logprobs[storage_counter] = policy_probabilities(logit, action).numpy()[0]
                        storage_counter += 1

                    state = next_state

                    if done or t == self.max_timesteps - 1:
                        if training:
                            # `state` is already the successor state here, so
                            # it is the one to bootstrap from if the rollout
                            # was cut off by the step limit.
                            adv, G_T = agent.calc_advantage(state, rewards, values, done, t)
                            advantage_chunks.append(adv)
                            return_chunks.append(G_T)
                            print(f'Episode {episode} actor {n}: reward: {tot_rew} /{self.max_timesteps}')
                        # Stop in evaluation mode as well: stepping an
                        # environment that has reported `done` is invalid and
                        # would inflate the recorded reward.
                        break

                rew.append(tot_rew)

            if training and return_chunks:
                G_lams = np.concatenate(return_chunks).astype(np.float32)
                advantages = np.concatenate(advantage_chunks).astype(np.float32)
                # Normalise advantages; the epsilon avoids division by zero
                # when all advantages are equal.
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
                advantages = advantages.astype(np.float32)

                # Only the first `n_samples` buffer rows were actually filled.
                n_samples = len(G_lams)
                batch_states = states[:n_samples]
                agent.update_policy_parameters(
                    batch_states, actions[:n_samples], logprobs[:n_samples], advantages
                )
                agent.update_value_parameters(G_lams, batch_states)
                # TODO: Evaluate the agent every x episodes (average total reward
                #  over 100 runs) and stop training once the objective is reached.

        print(f'total reward: {sum(rew)} /{self.max_timesteps*agent.actors*no_episodes}')
        return rew

In [29]:
def policy_probabilities(logit, action):
    """Return the log-probability of each chosen action under the given logits.

    Args:
        logit: Unnormalised action scores; the last axis indexes the actions.
        action: Integer action indices, one per row of ``logit``.

    Returns:
        Tensor with one log-probability per row of ``logit``.
    """
    logit = tf.convert_to_tensor(logit)
    logprobs = tf.nn.log_softmax(logit)
    # Take the number of actions from the logits instead of hard-coding 2, so
    # the helper works for any discrete action space.
    num_actions = logit.shape[-1]
    if num_actions is None:
        num_actions = tf.shape(logit)[-1]
    chosen = tf.one_hot(action, num_actions, dtype=logprobs.dtype)
    logprob = tf.reduce_sum(chosen * logprobs, axis=1)
    return logprob


def discounted_cumulative_sums(x, discount):
    """Return the discounted suffix sums of ``x`` along axis 0.

    Element ``t`` of the result is ``x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...``,
    which is the form needed for advantage estimates and discounted returns.

    Args:
        x: Sequence of values ordered in time (first step first).
        discount: Discount factor applied per step.

    Returns:
        Array with the same length as ``x`` holding the discounted sums.
    """
    numerator = [1]
    denominator = [1, float(-discount)]
    # The filter realises y[k] = x[k] + discount * y[k-1]; running it over the
    # time-reversed input accumulates from the last step backwards.
    accumulated = scipy.signal.lfilter(numerator, denominator, x[::-1], axis=0)
    return accumulated[::-1]

In [30]:
class PPO_Agent:
    """PPO agent with separate actor (policy) and critic (value) networks.

    All methods operate on ``self``; the class does not depend on a
    module-level variable holding the agent instance.
    """

    def __init__(self, no_of_states, no_of_actions):
        """Build both networks and their optimizers.

        Args:
            no_of_states: Dimension of the observation vector.
            no_of_actions: Number of discrete actions.
        """
        self.state_size = no_of_states
        self.action_size = no_of_actions

        # TODO: Set hyperparameters and vary them
        self.gamma = 0.99  # discount rate
        self.lam = 0.97  # lambda used for the advantage estimate
        self.clip_ratio = 0.2  # clipping ratio for calculating L_clip
        self.lr = 0.0003  # learning rate
        self.actors = 10  # number of rollouts collected per update

        self.policy_iterations = 20  # number of policy updates per update call
        self.value_iterations = 20  # number of value updates per update call

        self.actor = self.nn_model(self.state_size, self.action_size)
        self.actor_optimizer = keras.optimizers.Adam(learning_rate=self.lr)
        self.critic = self.nn_model(self.state_size, 1)
        self.critic_optimizer = keras.optimizers.Adam(learning_rate=self.lr)

    def select_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Batch of observations accepted by the actor network.

        Returns:
            Tuple ``(logit, action)``: the actor output for ``state`` and one
            sampled action index per row.
        """
        logit = self.actor(state)
        action = tf.squeeze(tf.random.categorical(logit, 1), axis=1)
        return logit, action

    def calc_advantage(self, state, rewards, values, done, t):
        """Compute advantages and return targets for one finished rollout.

        Args:
            state: State to bootstrap from when the rollout did not terminate;
                indexed as a single-row batch.
            rewards: Per-step rewards; only the first ``t + 1`` entries are used.
            values: Per-step critic values; only the first ``t + 1`` entries are used.
            done: Whether the environment reported termination.
            t: Index of the last step taken.

        Returns:
            Tuple ``(adv, G_T)`` of arrays with ``t + 1`` entries each: the
            advantages discounted with ``gamma * lam`` and the discounted
            returns (bootstrapped with the critic value if not ``done``).
        """
        rewards = rewards[:t + 1]
        values = values[:t + 1]

        # A terminal state has value 0; otherwise bootstrap from the critic.
        # The value is evaluated once and reused for both outputs.
        bootstrap_value = 0.0 if done else float(self.critic(state)[0, 0])

        values = np.append(values, bootstrap_value)
        TD_delta = rewards + self.gamma * values[1:] - values[:-1]
        adv = discounted_cumulative_sums(TD_delta, self.gamma * self.lam)

        # Appending the bootstrap value as a pseudo-reward and dropping the
        # last entry yields G_t = r_t + gamma * G_{t+1} with G_{T+1} = bootstrap.
        G_T = discounted_cumulative_sums(np.append(rewards, bootstrap_value), self.gamma)[:-1]

        return adv, G_T

    def nn_model(self, state_size, output_size, output_activation=None):
        """Build a fully connected network with tanh hidden layers.

        Args:
            state_size: Size of the input vector.
            output_size: Number of output units (number of actions for the
                actor, 1 for the critic).
            output_activation: Activation of the output layer. Defaults to
                None (linear), so the actor emits unbounded logits and the
                critic can represent returns outside [-1, 1].

        Returns:
            The constructed ``keras.Model``.
        """
        input_layer = layers.Input(shape=(state_size,))
        hidden = input_layer
        for units in (64, 64, 64, 128):
            hidden = layers.Dense(units, activation="tanh")(hidden)
        output_layer = layers.Dense(output_size, activation=output_activation)(hidden)
        model = keras.Model(inputs=input_layer, outputs=output_layer)

        return model

    # tf.function compiles the method into a graph, which can improve speed
    # when the body consists only of TensorFlow operations.
    @tf.function
    def update_policy_parameters(self, states, actions, logprobs, advantages):
        """Run ``policy_iterations`` gradient steps on the clipped PPO objective.

        Args:
            states: Batch of visited states.
            actions: Actions taken in ``states``.
            logprobs: Log-probabilities of ``actions`` under the policy that
                collected the data.
            advantages: Advantage estimate for every sample.
        """
        for _ in range(self.policy_iterations):

            with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
                new_logprobs = policy_probabilities(self.actor(states), actions)
                ratio = tf.exp(new_logprobs - logprobs)
                clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
                # L_clip: the pessimistic choice between the unclipped and the
                # clipped surrogate, negated because the optimizer minimises.
                pol_loss = -tf.reduce_mean(
                    tf.minimum(ratio * advantages, clipped_ratio * advantages)
                )

            pol_grads = tape.gradient(pol_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(pol_grads, self.actor.trainable_variables))

    # tf.function compiles the method into a graph, which can improve speed
    # when the body consists only of TensorFlow operations.
    @tf.function
    def update_value_parameters(self, G_lams, states):
        """Run ``value_iterations`` gradient steps fitting the critic to ``G_lams``.

        Args:
            G_lams: Return target for every sample, one entry per row of ``states``.
            states: Batch of visited states.
        """
        for _ in range(self.value_iterations):

            with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
                # The critic returns one column per sample; drop that axis so
                # predictions line up element-wise with the targets instead
                # of broadcasting every prediction against every target.
                predicted = tf.squeeze(self.critic(states), axis=1)
                val_loss = tf.reduce_mean(tf.square(G_lams - predicted))
            val_grads = tape.gradient(val_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(zip(val_grads, self.critic.trainable_variables))

In [31]:
if __name__ == "__main__":
    # Problem dimensions for CartPole.
    # Observation: [position of cart, velocity of cart, angle of pole, rotation rate of pole]
    no_of_states = 4
    # Actions: [left, right]
    no_of_actions = 2

    # Run lengths. TODO: play around with these (defaults: 500 training, 100 evaluation).
    no_episodes = 10
    no_episodes_run = 10

    # Set up the environment first, then the agent.
    environment = GymEnvironment(env_id='CartPole-v0', monitor_dir='gymresults/cartpole-v0')
    agent = PPO_Agent(no_of_states, no_of_actions)

    # Training phase.
    environment.trainPPO(agent, no_episodes)

    # Evaluation phase: a single actor is enough, several are only needed
    # while collecting training data.
    agent.actors = 1
    rew = environment.runPPO(agent, no_episodes_run)

    # TODO: Visualize/plot the agent's performance over the number of
    # training episodes (not implemented yet).

Episode 0 actor 0: reward: 13.0 /400
Episode 0 actor 1: reward: 39.0 /400
Episode 0 actor 2: reward: 25.0 /400
Episode 0 actor 3: reward: 31.0 /400
Episode 0 actor 4: reward: 31.0 /400
Episode 0 actor 5: reward: 13.0 /400
Episode 0 actor 6: reward: 16.0 /400
Episode 0 actor 7: reward: 14.0 /400
Episode 0 actor 8: reward: 19.0 /400
Episode 0 actor 9: reward: 20.0 /400
Episode 1 actor 0: reward: 35.0 /400
Episode 1 actor 1: reward: 53.0 /400
Episode 1 actor 2: reward: 37.0 /400
Episode 1 actor 3: reward: 17.0 /400
Episode 1 actor 4: reward: 20.0 /400
Episode 1 actor 5: reward: 19.0 /400
Episode 1 actor 6: reward: 24.0 /400
Episode 1 actor 7: reward: 23.0 /400
Episode 1 actor 8: reward: 22.0 /400
Episode 1 actor 9: reward: 19.0 /400
Episode 2 actor 0: reward: 14.0 /400
Episode 2 actor 1: reward: 27.0 /400
Episode 2 actor 2: reward: 50.0 /400
Episode 2 actor 3: reward: 15.0 /400
Episode 2 actor 4: reward: 33.0 /400
Episode 2 actor 5: reward: 20.0 /400
Episode 2 actor 6: reward: 27.0 /400
E

In [23]:
# Sanity check of the discounted-sum filter on a constant reward signal.
rew = np.ones(5)
lam = 0.97
print(rew)
# Filter the flipped signal and flip the result back, so entry t holds
# rew[t] + lam * rew[t+1] + lam**2 * rew[t+2] + ...
print(np.flip(scipy.signal.lfilter([1], [1, -float(lam)], np.flip(rew), axis=0)))

[1. 1. 1. 1. 1.]
[4.70886581 3.823573   2.9109     1.97       1.        ]
