# Policy gradient algorithm
This notebook implements the policy gradient algorithm and applies it to the trading environment. The code partly originates
from [spinning-up](https://github.com/openai/spinningup/blob/master/spinup/examples/pytorch/pg_math/1_simple_pg.py) and was adapted to a continuous action space.

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.distributions import Normal
from torch.optim import Adam
import numpy as np

Define the neural network for the policy. It is a simple linear model pre-initialized with the linear regression weights.
We know from the data analysis that at least 6 interest rate lags are needed.

In [None]:
class Agent(nn.Module):
    """
    Linear Gaussian policy for a continuous action space.

    The mean of the action distribution is produced by two stacked bias-free linear layers. The standard
    deviation is a learnable parameter that does not depend on the observation.
    """

    # Coefficients of the previously calculated linear regression, used to pre-initialize the first layer.
    # If the observation holds more values than there are coefficients, the additional leading inputs
    # start with a weight of zero.
    _REGRESSION_COEFFICIENTS = (
        0.01703097,
        0.00321324,
        0.0409251,
        0.09984709,
        0.07125695,
        0.56419391,
    )

    def __init__(self, n_obs, n_acts):
        """
        Parameters
        ----------
        n_obs: number of input values per observation (after flattening)
        n_acts: number of action dimensions

        Raises
        ------
        ValueError: if n_obs is smaller than the number of regression coefficients
        """
        super().__init__()

        n_coefficients = len(self._REGRESSION_COEFFICIENTS)
        if n_obs < n_coefficients:
            raise ValueError(
                "n_obs must be at least %d to hold the regression coefficients, got %d"
                % (n_coefficients, n_obs)
            )

        # define one linear layer where each weight is used for one interest rate
        self.linear_interest_rates_layer = nn.Linear(n_obs, 1, bias=False)

        # define 1 output for the mean of the action distribution
        self.mean_out = nn.Linear(1, n_acts * 1, bias=False)

        # init the weights of the first layer with the regression coefficients; they are right-aligned so
        # that extra inputs at the beginning of a longer window are initialized with zero
        initial_weights = torch.zeros(1, n_obs)
        initial_weights[0, n_obs - n_coefficients:] = torch.as_tensor(self._REGRESSION_COEFFICIENTS)

        with torch.no_grad():
            self.linear_interest_rates_layer.weight.copy_(initial_weights)
            # set the weight of the output layer to 1.0 so the initial mean equals the regression output
            self.mean_out.weight.fill_(1.0)

        # define a parameter for the standard deviation of the action distribution
        # This could be changed to a separate output value of the network.
        log_std = -0.5 * np.ones(n_acts, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))

    def forward(self, x):
        """
        Compute the parameters of the action distribution.

        Parameters
        ----------
        x: tensor with at least two dimensions; everything after the first dimension is flattened

        Returns
        -------
        (mean, std) of the action distribution
        """
        # reduce the dimension of the input to a simple array of [interest rate_1, interest rate_2, ... interest rate_n]
        interest_rate_input = torch.flatten(x, 1)

        # send the input through the first layer
        interest_rate_output = self.linear_interest_rates_layer(interest_rate_input)

        # get the mean of the action
        mean = self.mean_out(interest_rate_output)

        # get the standard deviation of the action; the exponential keeps it positive
        std = torch.exp(self.log_std)

        # return the mean and the standard deviation of the action distribution
        return mean, std

    def get_policy(self, obs):
        """Return the Normal action distribution for the given observation(s)."""
        mean, std = self.forward(torch.as_tensor(obs, dtype=torch.float32))
        # define a normal distribution with the output of the policy
        return Normal(mean, std)

    def get_action(self, obs):
        """Sample a continuous action from the current policy and clamp it to [-1, 1]."""
        # sample an action from the current action distribution
        action = self.get_policy(obs).sample()
        return action.clamp(-1, 1)

## Normal Distribution
Our Agent needs to produce continuous actions. The actions will be sampled from a Normal distribution. The parameters of
the normal distribution depend on the outcome of the network.


Define the training loop for optimizing the agent.

In [None]:
def train(train_env,
          eval_env,
          agent,
          lr=0.01,
          epochs=50,
          batch_size=6000,
          evaluation_interval=100
          ):
    """
    Trains a Policy Gradient based agent in a training environment and evaluates on the evaluation environment.

    Parameters
    ----------
    train_env: Training environment. Must provide ``reset()`` and ``step(action)``; ``step`` returns
        ``(obs, reward, done, info)`` where ``info`` contains the keys ``'interest_rate'`` and ``'value'``
    eval_env: Evaluation environment with the same interface
    agent: agent based on neural network (provides ``get_policy``, ``get_action`` and ``parameters``)
    lr: learning rate of the Adam optimizer
    epochs: number of epochs for training; one gradient update happens per epoch
    batch_size: number of observation/action pairs after which a gradient update should happen. Collection
        only stops at the end of an episode, once more than ``batch_size`` pairs were gathered
    evaluation_interval: number of epochs after which the agent is evaluated and plotted on both the
        training and the evaluation environment

    Returns
    -------
    (training_returns, test_returns): two lists of dicts with the keys ``"epoch"`` and ``"mean_return"``.
        ``training_returns`` has one entry per epoch, ``test_returns`` one entry per evaluation.
    """

    # make loss function whose gradient, for the right data, is policy gradient
    def compute_loss(obs, act, weights):
        policy = agent.get_policy(obs)
        # Bring the actions into the batch shape of the distribution before evaluating them. Without this,
        # per-step action tensors stacked as (N, 1, 1) or (N,) broadcast against the (N, n_acts) mean and
        # every action would be scored against every observation.
        logp = policy.log_prob(act.reshape(policy.mean.shape))
        # one weight per observation; flatten so the product stays element-wise
        return -(logp.sum(axis=-1) * weights.reshape(-1)).mean()

    # make optimizer
    optimizer = Adam(agent.parameters(), lr=lr)

    def evaluate(_env):
        """Run one full episode in ``_env`` with the current policy and return the list of step infos."""

        infos = list()
        obs = _env.reset()

        print("Weights of neural network")
        print(agent.linear_interest_rates_layer.weight.data)
        print(agent.mean_out.weight.data)

        while True:

            # act in the environment
            act = agent.get_action(torch.as_tensor(obs, dtype=torch.float32))
            act_processed = act.numpy()[0]
            obs, rew, done, info = _env.step(act_processed)

            infos.append(info)

            if done:
                # if episode is over, stop the rollout
                break

        return infos

    def plot_results(_infos, title):
        """Plot portfolio value, position and compounded interest rate of one evaluation episode."""
        # imported lazily so that training without evaluation does not need the plotting dependency
        import cufflinks as cf

        cf.go_offline()

        # Create a dataframe for further processing and plotting
        info_df = pd.DataFrame({"info": _infos})
        info_df = info_df["info"].apply(pd.Series).set_index("timestamp")
        info_df['value'] = info_df['value'].apply(lambda x: x.squeeze())
        info_df['position'] = info_df['position'].apply(lambda x: x.squeeze())
        info_df["cum_prod_interest_rate"] = (info_df["interest_rate"] + 1).cumprod()

        info_df[["value", "position", "cum_prod_interest_rate"]].iplot(secondary_y="position",
                                                                       title=title)

    # for training policy
    def train_one_epoch():
        """Collect one batch of whole episodes and take a single policy gradient step."""

        # make some empty lists for logging.
        batch_obs = []  # for observations
        batch_acts = []  # for actions
        batch_weights = []  # for R(tau) weighting in policy gradient
        batch_rets = []  # for measuring episode returns
        batch_lens = []  # for measuring episode lengths

        # reset episode-specific variables
        obs = train_env.reset()  # first obs comes from starting distribution
        ep_rews = []  # list for rewards accrued throughout episode
        ep_interest_rates = []  # interest rates seen in the current episode

        # collect experience by acting in the environment with current policy
        while True:

            # save obs
            batch_obs.append(obs.copy()[0, :])

            # act in the environment
            act = agent.get_action(torch.as_tensor(obs, dtype=torch.float32))
            act_processed = act.numpy()[0]

            # collect new observation, reward, done and additional info
            obs, rew, done, info = train_env.step(act_processed)

            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew.copy())
            ep_interest_rates.append(info['interest_rate'])

            # Rollout is finished
            if done:

                # Calculate the maximum possible return of the episode.
                # This will be reference that the agent could reach (assuming no transaction costs)
                max_possible_return = (np.abs(np.array(ep_interest_rates)) + 1).cumprod()[-1]

                # if episode is over, record info about episode
                episode_return, episode_length = info['value'] - max_possible_return, len(ep_rews)

                batch_rets.append(info['value'] - 1)
                batch_lens.append(episode_length)

                # the weight for each logprob(a|s) is R(tau)
                batch_weights += [episode_return] * episode_length

                # reset ALL episode-specific variables; the interest rates must be cleared as well,
                # otherwise the reference return of the next episode compounds the rates of this one
                obs, done, ep_rews, ep_interest_rates = train_env.reset(), False, [], []

                # end experience loop if we have enough of it
                if len(batch_obs) > batch_size:
                    break

        # take a single policy gradient update step
        optimizer.zero_grad()
        batch_loss = compute_loss(
            # build one array first; converting a list of arrays element by element is slow
            obs=torch.as_tensor(np.asarray(batch_obs), dtype=torch.float32),
            # the actions already are tensors, so stack them instead of re-parsing a Python list
            act=torch.stack(batch_acts).to(torch.float32),
            weights=torch.as_tensor(batch_weights, dtype=torch.float32),
        )

        # calculate gradients and optimize
        batch_loss.backward()
        optimizer.step()

        # return a plain float so the autograd graph of the batch is not kept alive
        return batch_loss.item(), batch_rets, batch_lens

    training_returns = list()
    test_returns = list()

    # training loop
    for i in range(epochs):
        batch_loss, batch_rets, batch_lens = train_one_epoch()
        mean_return = np.mean(batch_rets)
        print('Epoch: %3d \t Loss: %.3f \t Return: %.3f' %
              (i, batch_loss, mean_return))

        training_returns.append({"epoch": i, "mean_return": mean_return})

        if (i + 1) % evaluation_interval == 0:
            plot_results(evaluate(train_env), "Evaluation on Training data")
            test_infos = evaluate(eval_env)
            test_returns.append({"epoch": i, "mean_return": (test_infos[-1]['value'] - 1).squeeze()})
            plot_results(test_infos, "Evaluation on Test data")

    return training_returns, test_returns

Define one environment for training the agent and one environment for testing the agent on unseen data.

In [None]:
from interest_rate_environment_pytorch import InterestEnv

# number of values per observation; also passed to the agent as its input size
window_length = 6

# Both configurations read the same file and differ only in the timestamp bound:
# presumably the environment uses data before the bound for training ...
train_env_config = dict(
    product_path='../data/interest_rates_p1.csv',
    window_length=window_length,
    end_timestamp="2020-01-01 00:00:00",
)

# ... and data from the bound onwards for evaluation (verify against InterestEnv)
eval_env_config = dict(
    product_path='../data/interest_rates_p1.csv',
    window_length=window_length,
    start_timestamp="2020-01-01 00:00:00",
)

In [None]:
# build the two environments from their configurations
train_env = InterestEnv(train_env_config)
eval_env = InterestEnv(eval_env_config)

# one network input per value in the observation window, one action dimension
agent = Agent(n_obs=train_env_config['window_length'], n_acts=1)

# run the training; every 50 epochs the agent is evaluated on both environments
training_returns, test_returns = train(
    train_env=train_env,
    eval_env=eval_env,
    agent=agent,
    lr=0.01,
    epochs=250,
    batch_size=500,
    evaluation_interval=50,
)


Plot Learning Progress

In [None]:
# Draw the same kind of chart for the training and the test returns, training first.
# `.iplot` is the cufflinks plotting accessor, so cufflinks must already have been imported.
for _records, _title in (
    (training_returns, 'Mean Return over training epochs'),
    (test_returns, 'Mean Return over test epochs'),
):
    fig = (
        pd.DataFrame.from_records(_records)
        .set_index('epoch')
        .iplot(title=_title, xTitle='Epochs', yTitle='Return', asFigure=True)
    )
    # show the returns on the y axis as percentages
    fig.update_layout(yaxis=dict(tickformat=".2%"))
    fig.show()


## Tasks

1. Increase the window_length to 10. Add multiple zeros at the beginning of the weight init part. How does the agent perform now?
