# Homework 3 (15 Pts)

All homeworks are self-contained. They can be completed in their respective notebooks.
To edit and re-run code, you can therefore simply edit and restart the code cells below.
There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window).
This file should automatically be synced with your Google Drive. We also save all recordings and logs in it by default so that you won't lose your work in the event of an instance timeout.
 However, you will need to re-mount your Google Drive and re-install packages with every new instance.

In [1]:
# Your work will be stored in a folder called `drl_ws21` by default to prevent Colab
# instance timeouts from deleting your edits.
# We do this by mounting your google drive on the virtual machine created in this colab
# session. For this, you will likely need to sign in to your Google account and copy a
# passcode into a field below

import os
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Create paths in your google drive
DRIVE_PATH = '/content/gdrive/My\ Drive/drl_ws21'
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
if not os.path.exists(DRIVE_PYTHON_PATH):
    % mkdir $DRIVE_PATH

# the space in `My Drive` causes some issues,
# make a symlink to avoid this
SYM_PATH = '/content/drl_ws21'
if not os.path.exists(SYM_PATH):
    !ln -s $DRIVE_PATH $SYM_PATH
% cd $SYM_PATH

/content/gdrive/My Drive/drl_ws21


In [3]:
# Install **python** packages

!pip install matplotlib numpy tqdm torch



In this homework we will be mainly working on Policy gradients (Lecture 5) 
and Natural Policy Gradients (Lecture 6). We are going to implement the 
REINFORCE, the Policy Gradient Theorem and the Natural Gradient algorithms. 

We start by importing all the necessary python modules and defining some helper
functions which you do not need to change. Still, make sure you are aware of
what they do.

In [4]:
# Imports and utility
# Progress bar
from copy import deepcopy
import time, os, tqdm

# Plotting
import matplotlib.pyplot as plt

# numerical python
import numpy as np

# Set random seeds
np.random.seed(0)

fig_dict = None


class ProgressBar:
    def __init__(self, num_iterations: int, verbose: bool = True):
        if verbose:  # create a nice little progress bar
            self.scalar_tracker = tqdm.tqdm(total=num_iterations, desc="Scalars", 
                                            bar_format="{desc}",
                                            position=0, leave=True)
            progress_bar_format = '{desc} {n_fmt:' + str(
                len(str(num_iterations))) + '}/{total_fmt}|{bar}|{elapsed}<{remaining}'
            self.progress_bar = tqdm.tqdm(total=num_iterations, desc='Iteration', 
                                          bar_format=progress_bar_format,
                                          position=1, leave=True)
        else:
            self.scalar_tracker = None
            self.progress_bar = None

    def __call__(self, **kwargs):
        if self.progress_bar is not None:
            formatted_scalars = {key: [f"{v:.3e}" for v in value] if isinstance(value, list) 
                                 else f"{value:.3e}"
                                 for key, value in kwargs.items()}
            description = ("Scalars: " + "".join([str(key) + "=" + value + ", "
                                                  for key, value in formatted_scalars.items()]))[:-2]
            self.scalar_tracker.set_description(description)
            self.progress_bar.update(1)


# specify the path to save the recordings of this run to.
data_path = '/content/drl_ws21/exercise_3'
data_path = os.path.join(data_path, time.strftime("%d-%m-%Y_%H-%M"))
if not (os.path.exists(data_path)):
    os.makedirs(data_path)


# this function will automatically save your figure into your google drive folder (if correctly mounted!)
def save_figure(save_name: str) -> None:
    assert save_name is not None, "Need to provide a filename to save to"
    plt.savefig(os.path.join(data_path, save_name + ".png"))

def save_plots_from_fig_dict(fig_dict, name):
    plt.figure(fig_dict["Rewards"].number)
    save_figure(name +'_rewards')
    
    plt.figure(fig_dict["Estimated Gradients"].number)
    save_figure(name +'_estimated_gradients')
    
    plt.figure(fig_dict["Contour Plot"].number)
    save_figure(name +'_contour_plot')
    

def do_contour_plot(env):
    contour_plot_policy = LinPolicy()
    b_max = 0
    b_min = -10
    n_points = 50
    x = np.linspace(start=b_min, stop=b_max, num=n_points)
    y = np.linspace(start=b_min, stop=b_max, num=n_points)
    X, Y = np.meshgrid(x, y)
    policy_params = np.vstack([X.reshape(-1), Y.reshape(-1)]).T
    policy_params = np.concatenate((policy_params, np.ones((policy_params.shape[0], 1))), 
                                   axis=1)
    rews = np.zeros(policy_params.shape[0])
    for i in range(policy_params.shape[0]):
        c_params = policy_params[i, :]
        contour_plot_policy.update_params(c_params)
        rews[i] = PG.test_policy(policy=contour_plot_policy, env=env, n_trials=1)
    fig, ax = plt.subplots()
    Z = np.clip(rews, -1500, 1500).reshape(X.shape)
    CS = ax.contour(X, Y, Z)
    ax.clabel(CS, inline=True, fontsize=10)
    plt.xlabel('k1')
    plt.ylabel('k2')
    return fig


def plot_params_in_contour_plot(fig, parameters, color):
    plt.figure(fig.number)
    plt.plot(parameters[:, 0], parameters[:, 1], color=color, alpha=0.5)
    plt.plot(parameters[:, 0], parameters[:, 1], 'x', color=color, alpha=0.5)
    plt.plot(parameters[-1, 0], parameters[-1, 1], 'x', color='r')
    return fig


def create_fig_dict():
    figures_dict = {}
    figures_dict["Rewards"] = plt.figure('Rewards')
    plt.xlabel('Iterations')
    plt.ylabel('Mean Rewards')

    figures_dict["Estimated Gradients"] = plt.figure('Estimated Gradients')
    plt.xlabel('Iterations')
    plt.ylabel('Gradients of Parameters')

    figures_dict["Contour Plot"] = None
    return figures_dict


def finish_training(estimated_grads, parameters, test_rewards, color, fig_dict, 
                    legend_label=None):
    plt.figure(fig_dict["Rewards"].number)
    plt.plot(test_rewards, color, label=legend_label)
    plt.legend()

    plt.figure(fig_dict["Estimated Gradients"].number)
    plt.plot(estimated_grads, color, label=legend_label)

    plot_params_in_contour_plot(fig_dict["Contour Plot"], parameters, color)

## **Continuous Control with Linear Controller**

We are going to consider a linear dynamical system with quadratic reward 
function. For the continous systems part, we will optimize for the parameters of a linear controller. 
The learning agent does not have any information about the system dynamics and the reward function. 

### **Linear System and Quadratic Reward Function** 

The linear dynamics are described as follows:

\begin{align}
      \boldsymbol{s'} = 
       \boldsymbol{As} + \boldsymbol{Ba},
\end{align}
where $\boldsymbol{s'}$ denotes the state in the next time step and $\boldsymbol{a}$ is the action input to the system. The identites of the system are given as 
\begin{align}
    \boldsymbol{A} = \begin{bmatrix}
                        1 & 0.1\\
                        0 & 0.99
                      \end{bmatrix}, ~~~~~
     \boldsymbol{B} = \begin{bmatrix}
                        0 \\
                        0.1 
                       \end{bmatrix}.
\end{align}
Thus, we have a two dimensional state-space and a one dimensional action space.

The immediate reward function, is given as
\begin{align}
    r(\boldsymbol{s_t}, a_t) = -\boldsymbol{s_t}^T\boldsymbol{M}\boldsymbol{s_t} - a_t^2Q,
\end{align}
resulting in an episode reward
\begin{align}
    R(\tau) = \sum_t r(\boldsymbol{s_t}, a_t)
\end{align}

The code block which describes the linear system and its corresponding quadratic reward function 
is given below.

In [5]:
class QuadReward:
    def __init__(self):
        self._M = np.array([[10, 0], [0, 1]])
        self._Q = np.array([1])

    def get_rew(self, s, a):
        return -s.T @ self._M @ s - a.T * self._Q * a


class LinEnv:
    def __init__(self):
        self.s_dim = 2
        self.a_dim = 1
        self.T = 50
        self.t = 0
        self._A = np.array([[1, 0.1], [0, 0.99]])
        self._B = np.array([0, 0.1])
        self._s = np.array([2, 1])
        self._s_init = np.array([2, 1])
        self.s_max = np.ones(2) * 12
        self.s_min = -np.ones(2) * 12
        self.a_max = 8
        self.a_min = -8
        self.rew = QuadReward()

    def reset(self):
        self._s = self._s_init
        self.t = 0
        return self._s.copy()

    def step(self, a):
        a_used = np.clip(a, self.a_min, self.a_max)
        s_prime = self._A @ self._s + self._B * a_used
        r = self.rew.get_rew(self._s, a)
        s_prime = np.clip(s_prime, self.s_min, self.s_max)
        self._s = np.copy(s_prime)
        self.t += 1
        return s_prime, r

### **Linear Controller**
We consider a linear, stochastic controller of the form
\begin{align}
    \pi(a|\boldsymbol{s}) = \mathcal{N}(a|\boldsymbol{Ks}, \sigma^2) =\frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{1}{2}\frac{(a-\boldsymbol{Ks})^2}{\sigma^2}},
\end{align}
where $\boldsymbol{K} = [k_1, k_2]$ and $\sigma$ are the learnable parameters. Our learning agents will optimize
for those parameters.

The following code defines the Linear controller

In [6]:
class LinPolicy:
    def __init__(self):
        self.s_dim = 2
        self.a_dim = 1
        self._K = np.zeros(self.s_dim)
        self._var = None  # variance, i.e. sigma**2
        self._std = None  # standard deviation, i.e. sigma
        self.params = np.array([-10, -10, 1])
        self.n_params = self.params.shape[0]
        self.update_params(self.params)

    def update_params(self, params):
        self._K[:(self.n_params - self.a_dim)] = params[:-self.a_dim]
        self._var = np.diag(params[-self.a_dim:] ** 2)
        self._std = np.diag(params[-self.a_dim:])
        self.params = params

    def get_mean(self, s):
        return np.atleast_1d(self._K @ s)

    def sample(self, s):
        return np.random.multivariate_normal(mean=np.atleast_1d(self.get_mean(s)), 
                                             cov=np.atleast_2d(self._var))

    def grad_log_pi(self, s, a):
        grad = np.zeros(self.n_params)
        std_inv = 1 / (self._std + 1e-20)
        diff = a - self.get_mean(s)
        aff_state = s[:(self.n_params - self.a_dim)]
        grad_K = diff * aff_state.T * (std_inv ** 2)
        grad_sigma = -std_inv + (diff ** 2) * (std_inv ** 3)
        grad[:(self.n_params - self.a_dim)] = grad_K
        grad[-self.a_dim:] = grad_sigma.squeeze()
        return grad

## **Policy Gradients**
Next, we will have a closer look to the individual learning algorithms. Policy Gradient follow the same algorithm scheme, which is shown in the followin **pseudo code**

---

- **Repeat**  For $k=1, 2, \dots$
    - run policy: sample trajectories {$\tau_i$} {$i=1,...N$} from $\pi_{\boldsymbol{\theta}}(a|s) $
    - Estimate the gradient $\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})$ : estimate_grad()
    - Update the parameters: grad_ascent_step()

- **Until convergence**

---

Based on this general structure, in the following we define a base class **PG**, which contains shared attributes and functions. The function `train` implements the main algorithm loop. Please note that we only change the learning rate and the baseline flags in the following. The parameter `n_it` and `n_traj_samples` stay the same for the algorithms.

Also note that for the continuous control problem the discount factor $\gamma$ is ommited since we set it to 1.


In [9]:
class PG:
    def __init__(self, env, policy, n_it=150, n_traj_samples=25, lr=1e-4, baseline=False):
        self.env = env                            # current environment
        self.policy = policy                      # current policy object
        self.n_it = n_it                          # number of total iterations 
        self.n_traj_samples = n_traj_samples      # number of traj. samples per iteration
        self.lr = lr                              # learning rate
        self.baseline = baseline                  # wether to use baseline or not

    def estimate_grad(self, rews, state_traj, action_traj):
        raise NotImplementedError

    def train(self):
        """
        This function will perform the main loop of the policy gradient algorithms.
        For plotting and saving purposes, it will return the estimated gradients,
        the test rewards and the parameters of each iteration.
        :return: tuple of estimated_grads, test_rewards, parameters
                estimated_grads: np.ndarray [n_it x n_params]
                test_rewards: np.ndarray [n_it]
                parameters: np.ndarray[n_it+1 x n_params]
        """
        estimated_grads = np.zeros((self.n_it, self.policy.n_params))
        training_rewards = np.zeros(self.n_it)
        test_rewards = np.zeros(self.n_it)
        parameters = np.zeros((self.n_it + 1, self.policy.n_params)) # +1 means initialization?
        parameters[0, :] = self.policy.params.copy()
        progress_bar = ProgressBar(num_iterations=self.n_it)
        for k in range(self.n_it):
            rewards = np.zeros((self.n_traj_samples, self.env.T))
            states_trajs = np.zeros((self.n_traj_samples, self.env.T, self.env.s_dim))
            action_trajs = np.zeros((self.n_traj_samples, self.env.T, self.env.a_dim))
            for j in range(self.n_traj_samples):
                s = self.env.reset()
                for t in range(self.env.T):
                    a = self.policy.sample(s)
                    action_trajs[j, t] = a
                    states_trajs[j, t] = s
                    s, r = self.env.step(a)
                    rewards[j, t] = r
            grad_estimate = self.estimate_grad(rewards, states_trajs, action_trajs)
            estimated_grads[k, :] = grad_estimate
            new_params = self.grad_ascent_step(grad_estimate, states_trajs, action_trajs)
            parameters[k + 1, :] = new_params
            self.policy.update_params(new_params)
            training_rewards[k] = np.mean(np.sum(rewards, axis=1))
            test_rewards[k] = PG.test_policy(policy=self.policy, env=self.env)
            progress_bar(test_reward=test_rewards[k])
        return estimated_grads, test_rewards, parameters

    def grad_ascent_step(self, grad_estimate: np.ndarray, state_traj=None, action_traj=None):
        """
        This function performs the gradient ascent step.
        :param grad_estimate: The estimated gradient of the return: np.ndarray [n_params]
        :param state_traj: None (needed for Natural Policy Gradient Update)
        :param action_traj: None (needed for Natural Policy Gradient Update)
        :return: new parameters: np.ndarray [n_params]
        """
        return self.policy.params + self.lr * grad_estimate

    @staticmethod
    def test_policy(policy, env, n_trials=10):
        use_policy = policy
        ep_rewards = np.zeros((n_trials, env.T))
        for i in range(n_trials):
            s = env.reset()
            for t in range(env.T):
                a = use_policy.get_mean(s)
                s, r = env.step(a)
                ep_rewards[i, t] = r
        return np.mean(np.sum(ep_rewards, axis=1))

# **REINFORCE**
We start with the most basic **REINFORCE** algorithm. The **pseudocode** is given as

---

- **Repeat**  For $k=1, 2, \dots$
    - run policy: sample trajectories {$\tau_i$}$_{i=1,...N}$ from $\pi_{\boldsymbol{\theta}}(a|\boldsymbol{s})$
    - Estimate the gradient:
        - $\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta}) \approx \frac{1}{N}\sum_i \left(\sum_t\nabla_{\boldsymbol{\theta}}\log \pi_\boldsymbol{\theta}(a_{i,t}|\boldsymbol{s}_{i,t})\right)\left(\sum_t\gamma^tr(\boldsymbol{s}_{i,t}a_{i,t})\right)$
    - Update the parameters:
        - $\boldsymbol{\theta}\leftarrow \boldsymbol{\theta} + \alpha \nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})$

- **Until convergence**

---

The following class inherits from the basic Polciy Gradient class and only needs to override the function
'estimate_grad' according to the pseudo code given above. This function estimates the gradient of the return
with respect to the parameters of the policy. 

## **TASK 1: REINFORCE** (3 Points)
Your task is to implement the `estimate_grad` function according to the pseudo code shown above (**Don't implement the policy update!**). Through the class attribute `baseline` we can choose if we would like to estimate the gradient with or without a baseline. 
Make sure that both options are working in your code. 
Use the following baseline
\begin{align}
    b = \frac{1}{N}\sum_i\sum_tr(\boldsymbol{s}_{i,t}, a_{i,t})
\end{align}

After you completed the implementation, run the **Execute REINFORCE** cell. This cell will save three different figures which you will need to submit.

*Hint: To get the gradient of the log policy for current state $\boldsymbol{s}_{i,t}$ and current action $a_{i,t}$ from the rollout i at time step t, you will need to call `self.policy.grad_log_pi(current_state, current_action)`, where current_state is $\boldsymbol{s}_{i,t}$ and current action is $a_{i,t}$.*

In [8]:
class REINFORCE(PG):
    def estimate_grad(self, rews, state_traj, action_traj):
        """
        This function returns the gradient estimate of the return.
        :param rews: all training rewards of the last iteration: np.ndarray 
                     [n_traj_samples x T], where T is the horizon length
        :param state_traj: all states of the last iteration: np.ndarray 
                     [n_traj_samples x T x s_dim], where T is the horizon length 
                     and s_dim is the state dimension
        :param action_traj: all taken actions of the last iteration: np.ndarray 
                     [n_traj_samples x T x a_dim], where T is the horizon length 
                     and a_dim is the action dimension
        :return: grad_estimate: the estimated gradient: np.ndarray [n_params]
        """
        ####### TODO ######
        
        if self.baseline:
            baseline = .....
        else:
            baseline = 0
        return 0

SyntaxError: ignored

### **Execute REINFORCE**
Given that **REINFORCE** can be executed with and without baseline, we can now compare both versions of the algorithm. The following cell will executed both versions and plot the rewards, gradient estimates as well as the
update history of the parameters $k_1$ and $k_2$ on the contour plot of the return depending on the policy parameters.

If you have implemented the algorithm correctly, you should clearly see that the baseline approach is propperly converging and the version without the baseline does not manage to solve the problem.

*Hint: The hyperparameters are already tuned. You do not need to tune them.* 

In [None]:
# this will create new plt figures 
fig_dict = create_fig_dict()
fig_dict["Contour Plot"] = do_contour_plot(LinEnv())


def execute_reinforce(fig_dict):
    np.random.seed(0)

    reinforce = REINFORCE(env=LinEnv(), policy=LinPolicy(), lr=1e-6, baseline=False)
    grads_reinfocre, test_rewards_reinforce, parameters_reinforce = reinforce.train()
    finish_training(grads_reinfocre, parameters_reinforce, test_rewards_reinforce, color='cornflowerblue',
                    fig_dict=fig_dict,
                    legend_label='REINFORCE')

    
    np.random.seed(0)
    reinforce_w_baseline = REINFORCE(env=LinEnv(), policy=LinPolicy(), lr=1e-3, baseline=True)
    grads_reinforce_w_baseline, test_rewards_reinforce_w_baseline, parameters_reinforce_w_baseline = reinforce_w_baseline.train()
    finish_training(grads_reinforce_w_baseline, parameters_reinforce_w_baseline, test_rewards_reinforce_w_baseline,
                    color='blue', fig_dict=fig_dict, legend_label='REINFORCE_WITH_BASELINE')

    return fig_dict


fig_dict = execute_reinforce(fig_dict)
save_plots_from_fig_dict(fig_dict, name='reinforce')

# **Policy Gradient Theorem**

The basic version of **REINFORCE** suffers from high variance in the gradient estimates. One fix to this high variance is to exploit the temporal structure in the reward estimate. This leads to use the Monte-Carlo estimate of the reward to come (Q-function) instead of the return. The pseudo code is given as

---

- **Repeat**  For $k=1, 2, \dots$
    - run policy: sample trajectories {$\tau_i$}$_{i=1,...N}$ from $\pi_{\boldsymbol{\theta}}(a|\boldsymbol{s})$
    - Estimate the gradient:
        - $\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta}) \approx \frac{1}{N}\sum_i \sum_t\nabla_{\boldsymbol{\theta}}\log \pi_\boldsymbol{\theta}(a_{i,t}|\boldsymbol{s}_{i,t})\left(\sum_{k=t}\gamma^{k-t}r(\boldsymbol{s}_{i,k}a_{i,k}\right)$
    - Update the parameters:
        - $\boldsymbol{\theta}\leftarrow \boldsymbol{\theta} + \alpha \nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})$

- **Until convergence**

---

The following class inherits from the basic Polciy Gradient class and only needs to override the function
`estimate_grad` according to the pseudo code given above. This function estimates the gradient of the return
with respect to the parameters of the policy. 

## **TASK 2: Policy Gradient Theorem** (3 Points)
Your task is to implement the *'estimate_grad'* function according to the pseudo code shown above (**Don't implement the policy update!**). Through the class attribute `baseline` we can choose if we would like to estimate the gradient with or without a baseline. 
Make sure that both options are working in your code. 

Use the following time-dependent baseline when you are in time-step *t*
\begin{align}
    b_t = \frac{1}{N}\sum_i\sum_{k=t}r(\boldsymbol{s}_{i,k}, a_{i,k})
\end{align}

After you completed the implementation, run the **Execute Policy Gradient Theorem** cell. This cell will save three different figures which you will need to submit.

*Hint: To get the gradient of the log policy for current state $\boldsymbol{s}_{i,t}$ and current action $a_{i,t}$ from the rollout i at time step t, you will need to call `self.policy.grad_log_pi(current_state, current_action)`, where current_state is $\boldsymbol{s}_{i,t}$ and current action is $a_{i,t}$.*

In [None]:
class PGTheorem(PG):

    def estimate_grad(self, rews, state_traj, action_traj):
        """
        This function returns the gradient estimate of the return.
        :param rews: all training rewards of the last iteration: np.ndarray 
                     [n_traj_samples x T], where T is the horizon length
        :param state_traj: all states of the last iteration: np.ndarray 
                     [n_traj_samples x T x s_dim],where T is the horizon length 
                     and s_dim is the state dimension
        :param action_traj: all taken actions of the last iteration: np.ndarray 
                     [n_traj_samples x T x a_dim], where T is the horizon length 
                     and a_dim is the action dimension
        :return: grad_estimate: the estimated gradient: np.ndarray [n_params]
        """
        ####### TODO ######
        # Hint: You might need to use if self.baseline: ..... in order to 
        #       to distinguish wether to use or not use a baseline
        return 0

### **Execute Policy Gradient Theorem**
Given that **REINFORCE** with the **Polciy Gradient Theorem** extension can be executed with and without baseline, we can now compare both versions of the algorithm. The following cell will executed both versions and plot the rewards, gradient estimates as well as the
update history of the parameters $k_1$ and $k_2$ on the contour plot of the return depending on the policy parameters.

If you have implemented the algorithm correctly, you should clearly see that the baseline approach is propperly converging and the version without the baseline is very slow and ends in a very low performance quality.

*Hint: The hyperparameters are already tuned. You do not need to tune them.* 

In [None]:
fig_dict = create_fig_dict()
fig_dict["Contour Plot"] = do_contour_plot(LinEnv())


def execute_pgt(fig_dict):
    np.random.seed(0)

    pgt = PGTheorem(env=LinEnv(), policy=LinPolicy(), lr=1e-5, baseline=False)
    grads_pgt, test_rewards_pgt, parameters_pgt = pgt.train()
    finish_training(grads_pgt, parameters_pgt, test_rewards_pgt, color='olive', fig_dict=fig_dict,
                    legend_label='PGTheorem')

    np.random.seed(0)
    pgt_with_baseline = PGTheorem(env=LinEnv(), policy=LinPolicy(), lr=1e-3, baseline=True)
    grads_pgt_with_baseline, test_rewards_pgt_with_baseline, parameters_pgt_with_baseline = pgt_with_baseline.train()
    finish_training(grads_pgt_with_baseline, parameters_pgt_with_baseline, test_rewards_pgt_with_baseline,
                    color='green', fig_dict=fig_dict, legend_label='PGTheorem with baseline')
    return fig_dict


fig_dict = execute_pgt(fig_dict)
save_plots_from_fig_dict(fig_dict, name='pgt')

# **Natural Policy Gradient**

A common approach in policy search is to apply a trust region constraint, where the policy update is bounded. It has been shown that trust regions highly stabilize the learning process. 

A trust region approach which can be applied to continous control problems with parametric policy distributions is **Natural Policy Gradient**. Here, the KL-constraint is approximated with the second order Taylor approximation, which results in the **Fisher Information** matrix **F**.

Concretely, we can easily calculate **F** as 
\begin{align}
    \boldsymbol{F} = \frac{1}{TN}\sum_i^N\sum_t^T \left[\nabla_{\theta}\log \pi_{\theta}(a_{i,t}|s_{i,t})\nabla_{\theta}\log \pi_{\theta}(a_{i,t}|s_{i,t})^T\right].
\end{align}

The policy's parameter update is then given as
\begin{align}
    \boldsymbol{\theta} \leftarrow \boldsymbol{\theta} + \alpha\boldsymbol{F}^{-1}\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta}),
\end{align}
where for the gradient estimate $\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})$ standard techniques like the policy gradient theorem is used.

In Natural Policy Gradient, the learning rate parameter $\alpha ~~(\eta^{-1}\text{ in the slides})$ can be solved in closed-form by finding the optimal solution of the dual function to the according Lagrangian (see Task 4). More specifically, $\alpha$ is given as 
\begin{align}
    \alpha = \sqrt{\frac{4\epsilon}{\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})^{T}\boldsymbol{F}^{-1}\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})}},
\end{align}
where $\epsilon$ is the hyperparameter, bounding the expected KL between the new and the old policy.

Since we will use the gradient estimate from the **Policy Gradient Theorem**, the following class inherits from the **PGTheorem** class, i.e. you need to have properly solved Task 2 in order to be able to solve this task.


## **TASK 3: Natural Policy Gradient** (4 Points)

- Implement the function `get_F`, which will return the Fisher information matrix given state-action trajectories. Apply the equations mentioned above. 
- Implement the function `grad_ascent_step`, which calculates and returns the new parameters of the policy according to the update rule mentioned above. When implementing the update, please also use the closed-form solution to $\alpha$ to scale the update. We will use $\epsilon=0.5$.

*Note: You will need to have properly solved TASK 2 in order to be able to solve this task.*

After you completed the implementation, run the **Natural Policy Gradient** cell. This cell will save three different figures which you will need to submit.

*Hint: To get the gradient of the log policy for current state $\boldsymbol{s}_{i,t}$ and current action $a_{i,t}$ from the rollout i at time step t, you will need to call `self.policy.grad_log_pi(current_state, current_action)`, where current_state is $\boldsymbol{s}_{i,t}$ and current action is $a_{i,t}$.*

In [None]:
class NPG(PGTheorem):
    def __init__(self, env, policy, n_it=150, n_traj_samples=25, eps=0.5, baseline=True):
        super(NPG, self).__init__(env, policy, n_it=n_it, n_traj_samples=n_traj_samples, 
                                  lr=None, baseline=baseline)
        self.eps = eps
        
    def get_F(self, state_traj, action_traj):
        """
        This function calculates the Fisher Information matrix.
        :param state_traj: all states of the last iteration: np.ndarray [n_traj_samples x T x s_dim],
                     where T is the horizon length and s_dim is the state dimension
        :param action_traj: all taken actions of the last iteration: np.ndarray [n_traj_samples x T x a_dim],
                     where T is the horizon length and a_dim is the action dimension
        :return: F: returns the Fisher information matrix: np.ndarray [n_params x n_params]
        """
        ####### TODO ######
        # implement here the sample based Fisher information matrix F described in the 
        # equations above
        return 0

    def grad_ascent_step(self, grad_estimate, state_traj, action_traj):
        """
        Performs an updated on the parameters of the policy according to the Natural Policy Gradient Rule by using
        the current gradient estimate (grad_estimate) of the Policy Gradient theorem with baseline, the
        state trajectories and the action trajectories.
        :param grad_estimate: np.ndarray [n_params]
        :param state_traj: np.ndarray [n_traj_samples x T x s_dim]
        :param action_traj: np.ndarray [n_traj_samples x T x a_dim]
        :return: updated policy parameters: np.ndarray [n_params]
        """
        ####### TODO ######
        # implement here the gradient update of the parameters described in the equations
        # above. Also implement the optimal step size parameter alpha described in the 
        # equations above.
        return 0

### **Execute Natural Policy Gradient**
The following cell will executed the natural policy gradient implementation which uses the Policy gradient Theorem with baseline to estimate the vanilla gradient and plot the rewards, gradient estimates as well as the
update history of the parameters $k_1$ and $k_2$ on the contour plot of the return depending on the policy parameters.

*Hint: The hyperparameters are already tuned. You do not need to tune them.* 

In [None]:

fig_dict = create_fig_dict()
fig_dict["Contour Plot"] = do_contour_plot(LinEnv())


def execute_natural_pg(fig_dict):
    np.random.seed(0)
    npg = NPG(env=LinEnv(), policy=LinPolicy(), eps=0.5, baseline=True)
    grads_npg, test_rewards_npg, parameters_npg = npg.train()
    finish_training(grads_npg, parameters_npg, test_rewards_npg, color='orange', fig_dict=fig_dict,
                    legend_label='NPG')
    return fig_dict


fig_dict = execute_natural_pg(fig_dict)
save_plots_from_fig_dict(fig_dict, name='npg')



**The following cell will execute all algorithms and plot the results into one plot. It will furthermore save the figures to your google drive. You will need to submit the plots together with the ipython notebook in a compressed file.**


In [None]:
fig_dict = create_fig_dict()
fig_dict["Contour Plot"] = do_contour_plot(LinEnv())
fig_dict = execute_reinforce(fig_dict)
fig_dict = execute_pgt(fig_dict)
fig_dict = execute_natural_pg(fig_dict)
save_plots_from_fig_dict(fig_dict, name='all')

## **Task 4: Natural Policy Gradient Step Size** (4 Points)

Recall that Natural Gradients use a Taylor approximation of the trust region problem, i.e., the objective is given as 
$$\boldsymbol{g}^* = \underset{\boldsymbol{g}}{\textrm{argmax}} ~~ \boldsymbol{g}^T\nabla_{\boldsymbol{\theta}} \boldsymbol{J} ~~ \textrm{s.t.} ~~ \boldsymbol{g}^T \boldsymbol{F} \boldsymbol{g} \leq \epsilon.$$
By introducing a Lagrangian multiplier $\eta$ we can construct the corresponding Lagrangian
$$ L(\boldsymbol{g}, \eta) = \boldsymbol{g}^T \nabla_{\boldsymbol{\theta}} \boldsymbol{J} + \eta \left(\epsilon - \boldsymbol{g}^T\boldsymbol{F}\boldsymbol{g}\right).$$

**Exercise:** Derive $\boldsymbol{g}^*$. Also solve the dual, your solution should not depend on $\eta$ anymore! You can use the markdown cell below to answer this task.

*Note:You can also submit any other file format (photo, pdf, ...) as long as we are able to identify your solution.*




## **Discrete Control with Deep Neurel Net Controller**

Next, we will consider training Deep Neural Net policies to solve the discrete pole balancing environment called 'CartPole'. We will use the 'CartPole-v1' environment from OpenAI Gym, which you can find here:

https://gym.openai.com/envs/CartPole-v1/. 

**Policy gradients With Neural Network Policies.**

We will use the **REINFORCE** algorithm **with baselines** and **policy gradient theorem** as shown below. 

---

- **Repeat**  For $k=1, 2, \dots$
    - run policy: sample trajectories {$\tau_i$}$_{i=1,...N}$ from $\pi_{\boldsymbol{\theta}}(a|\boldsymbol{s})$
    - Estimate the gradient:
        - $\nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta}) \approx \frac{1}{N}\sum_i \left(\sum_t\nabla_{\boldsymbol{\theta}}\log \pi_\boldsymbol{\theta}(a_{i,t}|\boldsymbol{s}_{i,t})\right)\left(Q_t-b_t\right)$
    - Update the parameters:
        - $\boldsymbol{\theta}\leftarrow \boldsymbol{\theta} + \alpha \nabla_{\boldsymbol{\theta}}J(\boldsymbol{\theta})$

- **Until convergence**

---
$Q_t$ is the Q-value at time t, $Q^{\pi}(s_t, a_t)$, and $b_t$ is a baseline.
The baseline is given as
\begin{align}
    b_t = \frac{1}{N}\sum_i\sum_{k=t}r(\boldsymbol{s}_{i,k}, a_{i,k}).
\end{align}
However we will now replace the linear policies with deep neural networks and compute policy gradients with automatic differentiation. To do this we create a graph in such a way that its gradient is the policy gradient. 

We first import the necessary packages.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions
import gym

from typing import Tuple

In [None]:
train_env = gym.make('CartPole-v1')
test_env = gym.make('CartPole-v1')

SEED = 1234

train_env.seed(SEED);
test_env.seed(SEED+1);
np.random.seed(SEED);
torch.manual_seed(SEED);

### **Building Neural Network in PyTorch** 
    
This part of the code creates a feed forward neural network using pytorch library, which will form the policy  $\pi_{\boldsymbol{\theta}}(a|\boldsymbol{s})$.

We use pytorch modules torch.nn.Linear and torch.nn.ReLU() to do so.

In [None]:
class MLP(nn.Module):
    def __init__(self, state_dim: int, act_dim: int, hidden_units=[60,60,60]):
        """
        :param state_dim: dimension of the state space
        :param state_dim: dimension of the actions
        :param hidden_units: list of integers corresponding to hidden units
        """
        super(MLP, self).__init__()

        self._state_dim = state_dim
        self._act_dim = act_dim
        self._hidden_units = hidden_units
        # Define network
        layers = []
        last_hidden = self._state_dim
        # hidden layers
        for hidden_dim in self._hidden_units:
            layers.append(nn.Linear(in_features=last_hidden, out_features=hidden_dim))
            layers.append(nn.ReLU())
            last_hidden = hidden_dim
        self._hidden_layers, size_last_hidden = nn.ModuleList(layers), last_hidden


        self._mean_layer = nn.Linear(in_features=size_last_hidden, out_features=self._act_dim)
        

    def forward(self, input: torch.Tensor) \
            -> Tuple[torch.Tensor]:
        """ forward pass of decoder
        :param input:
        :return: output mean
        """
        h = input
        for layer in self._hidden_layers:
            h = layer(h)

        mean = self._mean_layer(h)


        return mean

### **Computing Q-values**
The code block computes numpy arrays for Q-values which will be used to compute advantages.

Recall that the expression for the policy gradient PG is

\begin{align}
    J = \text{E}_{\tau\sim p(\tau)}\left[\sum_{t=0}^T\nabla_{\boldsymbol{\theta}}\log \pi(a_t|s_t)(Q_t-b_t)\right],
\end{align}
where $ \tau=(s_0, a_0, ...)$ is a trajectory, $Q_t$ is the Q-value at time $t$ and $b_t$ is a baseline.


  We can obtain four different cases, controlled by the flag 'pg_theorem' and 'baselines' flag:

**Case 1: trajectory-based Policy Gradients (vanilla REINFORCE)**
**(pg_theorem = False)**

Instead of $Q^{\pi}(s_t, a_t)$, we use the total discounted reward summed over
entire trajectory (regardless of which time step the Q-value should be for).
For this case, the policy gradient estimator is $ \text{E}_{\tau\sim p(\tau)}\left[ \sum_{t=0}^T \nabla_{\theta} \log \pi(a_t|s_t) \cdot \text{R}(\tau)\right]$,  where $ \text{R}(\tau) = \sum_{k=0}^T \gamma^{k} r_{k} $.

**Case 2: PG Theorem applied**
**(pg_theroem = True)**

Here, you estimate $Q^{\pi}(s_t, a_t)$ by the discounted sum of rewards starting from time step t. Thus, you should compute $Q_t = \sum_{k=t}^T \gamma^{(k-t)} r_{k}$

**Case 3: No baselines**
**(baselines = False)**

Here we set b_t, the baselines to be 0. This is the 'vanilla' PG without baselines.

 \begin{align}
    PG =\text{E}_{\tau\sim p(\tau)} \sum_{t=0}^T \nabla_{\boldsymbol{\theta}} \log \pi(a_t|s_t)  (Q_t )
\end{align}

**Case 4: PG with baselines**
**(baselines = True)**

Here we use one of the baselines discussed in the lecture. Simplest one being the mean of the rewards.

 \begin{align}
    PG =\text{E}_{\tau\sim p(\tau)} \sum_{t=0}^T \nabla_{\boldsymbol{\theta}} \log \pi(a_t|s_t)  (Q_t - b_t )
\end{align}




In [None]:
def calculate_returns(rewards, discount_factor, baselines, pg_theorem, device):
    
    returns = []
    R = 0
    
    if pg_theorem:
      for r in reversed(rewards):
          R = r + R * discount_factor
          returns.insert(0, R)
    else:
      for r in reversed(rewards):
        R = r + R * discount_factor
      for r in reversed(rewards):
        returns.insert(0, R)
        
    returns = torch.tensor(returns).to(device)
    
    if baselines:
        returns = (returns - returns.mean()) 
        
    return returns

## **Task 5: Computing Loss** (1 Points)

Here we create a "pseudo loss" which is the weighted maximum likelihood, $\sum_{t=0}^T \log \pi_{\theta}(a_t|s_t)  (Q_t - b_t )$, using the stored `log_prob_actions` and `returns / Q `values computed in the previous section. The gradient of this loss function with respect to the neural network parameters ($\theta$) is the policy gradient.

After computing the loss, execute all the cells given below. In the end it will plot a reward curve with the given hyperparameters and save the figures to your google drive. You will need to submit the plot (for this task named `Deep_PG_Reward.png`) together with the ipython notebook in a compressed file.

In [None]:
def update_policy(returns, log_prob_actions, optimizer):
    """
    This function calculates the loss and backpropagate the errors using pytorch optmizers.
    :param returns: all the rewards recieved from the previous trajectory: torch.Tensor 
                 [T], where T is the horizon length
    :param log_prob_actions: all the log probabilities of the policy from the previous 
                             trajectory torch.Tensor [T],where T is the horizon length.
    :param optimizer: the torch optimizer object https://pytorch.org/docs/stable/optim.html
    :return: updated loss value: torch.Tensor [1], of float type 
    """
    
    returns = returns.detach()
    
    ##### TODO: implement here the loss described in the Task
    loss = .....
    
    ## Backpropagate using pytorch autograd. We will use the Adam Optimizer to do this, though any optimizer
    ## would work in practice.
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()
    
    return loss.item()

### **Training the network by sampling discrete actions**

Here we provide the code for training and evaluation loop. 

In [None]:
def train(env, policy, optimizer, discount_factor, baselines, pg_theorem, device):
    
    policy.train()
    
    log_prob_actions = []
    rewards = []
    done = False
    episode_reward = 0

    state = env.reset()

    while not done:

        state = torch.FloatTensor(state).unsqueeze(0).to(device)

        action_pred = policy(state)
        
        action_prob = F.softmax(action_pred, dim = -1)
        
        ## Here we use torch.distributions to sample discrete actions from a categorical
        ## distribution. https://pytorch.org/docs/stable/distributions.html
        dist = distributions.Categorical(action_prob)

        action = dist.sample()
        
        ## Here we use torch.distributions to calculate the log probability.
        ## https://pytorch.org/docs/stable/distributions.html
        log_prob_action = dist.log_prob(action)
        
        state, reward, done, _ = env.step(action.item())

        log_prob_actions.append(log_prob_action)
        rewards.append(reward)

        episode_reward += reward

    log_prob_actions = torch.cat(log_prob_actions)
        
    returns = calculate_returns(rewards, discount_factor, baselines, pg_theorem, device)

    loss = update_policy(returns, log_prob_actions, optimizer)

    return loss, episode_reward

def evaluate(env, policy, device):
    
    policy.eval()
    
    done = False
    episode_reward = 0

    state = env.reset()

    while not done:
        
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        
        with torch.no_grad():
        
            action_pred = policy(state)
            
            action_prob = F.softmax(action_pred, dim = -1)
                            
        ## Here we select the action with the highest probability.
        action = torch.argmax(action_prob, dim = -1)

        state, reward, done, _ = env.step(action.item())

        episode_reward += reward
        
    return episode_reward

### **Tuning Hyperparameters and plotting reward curve.**

Here we run the algorithms for 5 times and report the reward curve with mean value and error bounds.

You can play around with the hyperparameters and see how each of these 4 affect the algorithms performance. However, please submit the saved figures with the default parameters given here.

In [None]:
##Hyperparameters
hidden_units = [30,15]
baselines = True
pg_theorem = True
learning_rate = 5e-3


## Run the experiments 5 times
max_episodes = 300
discount_factor = 0.99
input_dim = train_env.observation_space.shape[0]
output_dim = train_env.action_space.n

device = torch.device('cuda')

n_runs = 5
train_rewards = torch.zeros(n_runs, max_episodes)
test_rewards = torch.zeros(n_runs, max_episodes)

for run in range(n_runs):
    
    policy = MLP(input_dim, output_dim, hidden_units)
    policy = policy.to(device)
    optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    
    for episode in tqdm.tqdm(range(max_episodes), desc=f'Run: {run}'):
        
        loss, train_reward = train(train_env, policy, optimizer, discount_factor, baselines, pg_theorem, 
                                   device)
        
        test_reward = evaluate(test_env, policy, device)
        
        train_rewards[run][episode] = train_reward
        test_rewards[run][episode] = test_reward
        
        
## Plot the Reward Curves        
idxs = range(max_episodes)
fig, ax = plt.subplots(1, figsize=(10,6))
ax.plot(idxs, test_rewards.mean(0))
ax.fill_between(idxs, test_rewards.min(0).values, test_rewards.max(0).values, alpha=0.1)
ax.set_xlabel('Steps')
ax.set_ylabel('Rewards');
save_figure('Deep_PG_Reward')