# Policy gradient algorithm
This notebook implements the policy gradient algorithm and applies it to the trading environment. The code partly origins
from [spinning-up](https://github.com/openai/spinningup/blob/master/spinup/examples/pytorch/pg_math/1_simple_pg.py) adapted to a continuous action space.

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.distributions import Normal
from torch.optim import Adam
import numpy as np

Define neural network for the policy. It is a simple linear model pre-initialized with the linear regression weights.
We know from the data analysis that at least 6 interests lags are needed.

In [2]:
class Net(nn.Module):

    def __init__(self, n_obs, n_acts):
        super().__init__()
        # define one linear layer where each weight is used for one interest rate
        self.linear_interest_rates_layer = nn.Linear(n_obs, 1, bias=False)

        # define 1 output for the mean of the action distribution
        self.mean_out = nn.Linear(1, n_acts, bias=False)
        # set the weight to 1.0
        self.mean_out.weight.data.fill_(1.0)

        # define a parameter for the standard deviation of the action distribution
        # This could be changed to a separate output value of the network.
        log_std = -0.5 * np.ones(n_acts, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))

        # init the weights of the layer with the previously calculated coefficients of the linear
        # regression
        with torch.no_grad():
            self.linear_interest_rates_layer.weight.copy_(torch.as_tensor([
                [
                    0.0,
                    0.0,
                    0.0,
                    0.01703097,
                    0.0980103,
                    0.00187909,
                    0.0409251,
                    0.09913933,
                    0.07125695,
                    0.56226713
                ]
            ]))

    def forward(self, x):
        # reduce the dimension of the input to a simple array of [interest rate_1, interest rate_2, ... interest rate_n]
        interest_rate_input = torch.flatten(x, 1)

        # send the first layer
        interest_rate_output = self.linear_interest_rates_layer(interest_rate_input)

        # get the mean of the action
        mean = self.mean_out(interest_rate_output)

        # get the standard deviation of the action
        std = torch.exp(self.log_std)

        # return the mean and the standard deviation of the action distribution
        return mean, std


Define the training loop for optimizing the agent.

In [3]:
def train(train_env,
          eval_env,
          net,
          lr=0.01,
          epochs=50,
          batch_size=6000,
          evaluation_interval=100
          ):
    """
    Trains a Policy Gradient based agent in a training environment and evaluates on the evaluation environment.
    Parameters
    ----------
    train_env: Training environment
    eval_env: Evaluation environment
    net: neural network
    lr: learning rate
    epochs: number of epochs for training
    batch_size: number of observation/action pairs after which a gradient update should happen
    evaluation_interval: number of epochs after which the agent should be tested in the test environment

    Returns
    -------

    """

    # make function to compute action distribution
    def get_policy(obs):
        mean, std = net.forward(torch.as_tensor(obs, dtype=torch.float32))
        # define a normal distribution with the output of the policy
        normal_dist = Normal(mean, std)
        return normal_dist

    # make action selection function (outputs int actions, sampled from policy)
    def get_action(obs):
        # sample an action from the current action distribution
        action = get_policy(obs).sample()
        return action.clamp(-1, 1)

    # make loss function whose gradient, for the right data, is policy gradient
    def compute_loss(obs, act, weights):
        logp = get_policy(obs).log_prob(act)
        return -(logp.sum(axis=-1) * weights).mean()

    # make optimizer
    optimizer = Adam(net.parameters(), lr=lr)

    def evaluate(_env):

        infos = list()
        obs = _env.reset()

        print("Weights of neural network")
        print(net.linear_interest_rates_layer.weight.data)
        print(net.mean_out.weight.data)

        while True:

            # act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            act_processed = act.numpy()[0]
            obs, rew, done, info = _env.step(act_processed)

            infos.append(info)

            if done:
                # if episode is over, record info about episode
                break

        return infos

    # for training policy
    def train_one_epoch():

        # make some empty lists for logging.
        batch_obs = []  # for observations
        batch_acts = []  # for actions
        batch_weights = []  # for R(tau) weighting in policy gradient
        batch_rets = []  # for measuring episode returns
        batch_lens = []  # for measuring episode lengths

        # reset episode-specific variables
        obs = train_env.reset()  # first obs comes from starting distribution
        ep_rews = []  # list for rewards accrued throughout episode
        ep_interest_rates = []

        # collect experience by acting in the environment with current policy
        while True:

            # save obs
            batch_obs.append(obs.copy()[0, :])

            # act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            act_processed = act.numpy()[0]

            # collect new observation, reward, done and additional info
            obs, rew, done, info = train_env.step(act_processed)

            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew.copy())
            ep_interest_rates.append(info['interest_rate'])

            if done:

                # Calculate the maximum possible return of the episode.
                # This will be reference that the agent could reach (assuming no transaction costs)
                max_possible_return = (np.abs(np.array(ep_interest_rates)) + 1).cumprod()[-1]

                # if episode is over, record info about episode
                episode_return, episode_length = info['value'] - max_possible_return, len(ep_rews)

                batch_rets.append(info['value'] - 1)
                batch_lens.append(episode_length)

                # the weight for each logprob(a|s) is R(tau)
                batch_weights += [episode_return] * episode_length

                # reset episode-specific variables
                obs, done, ep_rews = train_env.reset(), False, []

                # end experience loop if we have enough of it
                if len(batch_obs) > batch_size:
                    break

        # take a single policy gradient update step
        optimizer.zero_grad()
        batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
                                  act=torch.as_tensor(batch_acts, dtype=torch.float32),
                                  weights=torch.as_tensor(batch_weights, dtype=torch.float32))

        # calculate gradients and optimize
        batch_loss.backward()
        optimizer.step()

        return batch_loss, batch_rets, batch_lens

    training_returns = list()
    test_returns = list()

    # training loop
    for i in range(epochs):
        batch_loss, batch_rets, batch_lens = train_one_epoch()
        print('Epoch: %3d \t Loss: %.3f \t Return: %.3f' %
              (i, batch_loss, np.mean(batch_rets)))

        training_returns.append({"epoch": i, "mean_return": np.mean(batch_rets)})

        if (i + 1) % evaluation_interval == 0:
            def plot_results(_infos, title):
                import cufflinks as cf

                cf.go_offline()

                # Create a dataframe for further processing and plotting
                info_df = pd.DataFrame({"info": _infos})
                info_df = info_df["info"].apply(pd.Series).set_index("timestamp")
                info_df['value'] = info_df['value'].apply(lambda x: x.squeeze())
                info_df['position'] = info_df['position'].apply(lambda x: x.squeeze())
                info_df["cum_prod_interest_rate"] = (info_df["interest_rate"] + 1).cumprod()

                info_df[["value", "position", "cum_prod_interest_rate"]].iplot(secondary_y="position",
                                                                               title=title)

            plot_results(evaluate(train_env), "Evaluation on Training data")
            test_infos = evaluate(eval_env)
            test_returns.append({"epoch": i, "mean_return": (test_infos[-1]['value'] - 1).squeeze()})
            plot_results(test_infos, "Evaluation on Test data")

    return training_returns, test_returns

Define one Environment for training the agent and one environment for testing the agent on unseen data.

In [4]:
from interest_rate_environment_pytorch import InterestEnv

window_length = 10

train_env_config = {
    "product_path": '../data/interest_rates_p1.csv',
    "window_length": window_length,
    "end_timestamp": "2020-01-01 00:00:00"
}

eval_env_config = {
    "product_path": '../data/interest_rates_p1.csv',
    "window_length": window_length,
    "start_timestamp": "2020-01-01 00:00:00"
}

In [5]:
train_env = InterestEnv(train_env_config)
eval_env = InterestEnv(eval_env_config)
net = Net(train_env_config['window_length'], 1)
training_returns, test_returns = train(train_env, eval_env, net, lr=0.01, epochs=250, batch_size=500,
                                       evaluation_interval=50)


Calling init env with product path: ../data/interest_rates_p1.csv 
 window-length: 10 
 start_timestamp: None 
 end_timestamp: 2020-01-01 00:00:00
Creating observations for product: ../data/interest_rates_p1.csv
Calling init env with product path: ../data/interest_rates_p1.csv 
 window-length: 10 
 start_timestamp: 2020-01-01 00:00:00 
 end_timestamp: None
Creating observations for product: ../data/interest_rates_p1.csv


  batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),


Epoch:   0 	 Loss: -3399.479 	 Return: -0.693
Epoch:   1 	 Loss: -3549.672 	 Return: -0.715
Epoch:   2 	 Loss: -3553.941 	 Return: -0.713
Epoch:   3 	 Loss: -3698.935 	 Return: -0.713
Epoch:   4 	 Loss: -3732.990 	 Return: -0.704
Epoch:   5 	 Loss: -3781.413 	 Return: -0.722
Epoch:   6 	 Loss: -3765.238 	 Return: -0.708
Epoch:   7 	 Loss: -3827.292 	 Return: -0.711
Epoch:   8 	 Loss: -3939.595 	 Return: -0.712
Epoch:   9 	 Loss: -4026.982 	 Return: -0.709
Epoch:  10 	 Loss: -4086.199 	 Return: -0.707
Epoch:  11 	 Loss: -4110.419 	 Return: -0.698
Epoch:  12 	 Loss: -4102.012 	 Return: -0.693
Epoch:  13 	 Loss: -4295.488 	 Return: -0.713
Epoch:  14 	 Loss: -4303.643 	 Return: -0.701
Epoch:  15 	 Loss: -4315.085 	 Return: -0.693
Epoch:  16 	 Loss: -4403.646 	 Return: -0.689
Epoch:  17 	 Loss: -4415.336 	 Return: -0.672
Epoch:  18 	 Loss: -4427.312 	 Return: -0.661
Epoch:  19 	 Loss: -4652.992 	 Return: -0.680
Epoch:  20 	 Loss: -4745.878 	 Return: -0.680
Epoch:  21 	 Loss: -4708.154 	 Ret

Weights of neural network
tensor([[0.6189, 0.6189, 0.6187, 0.6353, 0.7159, 0.6197, 0.6583, 0.7158, 0.6871,
         1.1761]])
tensor([[1.6271]])


Epoch:  50 	 Loss: -16967.543 	 Return: -0.232
Epoch:  51 	 Loss: -18302.789 	 Return: -0.230
Epoch:  52 	 Loss: -19426.195 	 Return: -0.205
Epoch:  53 	 Loss: -20761.148 	 Return: -0.189
Epoch:  54 	 Loss: -21995.863 	 Return: -0.163
Epoch:  55 	 Loss: -23157.146 	 Return: -0.136
Epoch:  56 	 Loss: -24555.688 	 Return: -0.115
Epoch:  57 	 Loss: -27101.906 	 Return: -0.128
Epoch:  58 	 Loss: -28713.283 	 Return: -0.105
Epoch:  59 	 Loss: -29956.023 	 Return: -0.069
Epoch:  60 	 Loss: -32915.305 	 Return: -0.077
Epoch:  61 	 Loss: -34405.422 	 Return: -0.043
Epoch:  62 	 Loss: -37198.613 	 Return: -0.036
Epoch:  63 	 Loss: -38317.273 	 Return: 0.006
Epoch:  64 	 Loss: -42476.027 	 Return: -0.008
Epoch:  65 	 Loss: -44586.371 	 Return: 0.020
Epoch:  66 	 Loss: -47749.207 	 Return: 0.033
Epoch:  67 	 Loss: -50498.613 	 Return: 0.055
Epoch:  68 	 Loss: -54441.250 	 Return: 0.062
Epoch:  69 	 Loss: -59769.121 	 Return: 0.056
Epoch:  70 	 Loss: -62856.469 	 Return: 0.081
Epoch:  71 	 Loss: -

Weights of neural network
tensor([[1.5564, 1.5564, 1.5560, 1.5725, 1.6529, 1.5566, 1.5948, 1.6520, 1.6228,
         2.1107]])
tensor([[2.5813]])


Epoch: 100 	 Loss: -601934.188 	 Return: 0.278
Epoch: 101 	 Loss: -648480.375 	 Return: 0.284
Epoch: 102 	 Loss: -713194.000 	 Return: 0.280
Epoch: 103 	 Loss: -769333.688 	 Return: 0.285
Epoch: 104 	 Loss: -839813.062 	 Return: 0.285
Epoch: 105 	 Loss: -910397.938 	 Return: 0.288
Epoch: 106 	 Loss: -995898.625 	 Return: 0.287
Epoch: 107 	 Loss: -1084522.625 	 Return: 0.288
Epoch: 108 	 Loss: -1159750.000 	 Return: 0.297
Epoch: 109 	 Loss: -1272765.375 	 Return: 0.295
Epoch: 110 	 Loss: -1398132.750 	 Return: 0.291
Epoch: 111 	 Loss: -1496202.625 	 Return: 0.300
Epoch: 112 	 Loss: -1648547.875 	 Return: 0.296
Epoch: 113 	 Loss: -1813609.125 	 Return: 0.292
Epoch: 114 	 Loss: -1979167.500 	 Return: 0.292
Epoch: 115 	 Loss: -2147352.750 	 Return: 0.294
Epoch: 116 	 Loss: -2341710.500 	 Return: 0.295
Epoch: 117 	 Loss: -2545945.250 	 Return: 0.296
Epoch: 118 	 Loss: -2795449.250 	 Return: 0.293
Epoch: 119 	 Loss: -3013957.000 	 Return: 0.299
Epoch: 120 	 Loss: -3280158.750 	 Return: 0.300

Weights of neural network
tensor([[2.7821, 2.7820, 2.7817, 2.7981, 2.8784, 2.7822, 2.8204, 2.8774, 2.8481,
         3.3359]])
tensor([[3.8129]])


Epoch: 150 	 Loss: -43853096.000 	 Return: 0.307
Epoch: 151 	 Loss: -47832376.000 	 Return: 0.307
Epoch: 152 	 Loss: -52247668.000 	 Return: 0.306
Epoch: 153 	 Loss: -56929832.000 	 Return: 0.307
Epoch: 154 	 Loss: -62071696.000 	 Return: 0.307
Epoch: 155 	 Loss: -67811064.000 	 Return: 0.306
Epoch: 156 	 Loss: -73968632.000 	 Return: 0.306
Epoch: 157 	 Loss: -80645440.000 	 Return: 0.307
Epoch: 158 	 Loss: -87866344.000 	 Return: 0.307
Epoch: 159 	 Loss: -95548376.000 	 Return: 0.308
Epoch: 160 	 Loss: -104479536.000 	 Return: 0.307
Epoch: 161 	 Loss: -114095832.000 	 Return: 0.307
Epoch: 162 	 Loss: -124291264.000 	 Return: 0.307
Epoch: 163 	 Loss: -135631616.000 	 Return: 0.307
Epoch: 164 	 Loss: -147808768.000 	 Return: 0.308
Epoch: 165 	 Loss: -161340720.000 	 Return: 0.307
Epoch: 166 	 Loss: -176011216.000 	 Return: 0.307
Epoch: 167 	 Loss: -191528688.000 	 Return: 0.308
Epoch: 168 	 Loss: -209007248.000 	 Return: 0.308
Epoch: 169 	 Loss: -228193376.000 	 Return: 0.308
Epoch: 170

Weights of neural network
tensor([[4.2246, 4.2245, 4.2242, 4.2405, 4.3209, 4.2246, 4.2628, 4.3198, 4.2905,
         4.7782]])
tensor([[5.2578]])


Epoch: 200 	 Loss: -3389525504.000 	 Return: 0.311
Epoch: 201 	 Loss: -3696812800.000 	 Return: 0.312
Epoch: 202 	 Loss: -4035316992.000 	 Return: 0.312
Epoch: 203 	 Loss: -4403053568.000 	 Return: 0.312
Epoch: 204 	 Loss: -4805983232.000 	 Return: 0.312
Epoch: 205 	 Loss: -5246347264.000 	 Return: 0.312
Epoch: 206 	 Loss: -5725941760.000 	 Return: 0.312
Epoch: 207 	 Loss: -6251663872.000 	 Return: 0.312
Epoch: 208 	 Loss: -6819704832.000 	 Return: 0.313
Epoch: 209 	 Loss: -7449577472.000 	 Return: 0.313
Epoch: 210 	 Loss: -8128929792.000 	 Return: 0.313
Epoch: 211 	 Loss: -8880410624.000 	 Return: 0.313
Epoch: 212 	 Loss: -9700888576.000 	 Return: 0.313
Epoch: 213 	 Loss: -10585585664.000 	 Return: 0.313
Epoch: 214 	 Loss: -11558784000.000 	 Return: 0.313
Epoch: 215 	 Loss: -12619778048.000 	 Return: 0.313
Epoch: 216 	 Loss: -13784804352.000 	 Return: 0.313
Epoch: 217 	 Loss: -15053429760.000 	 Return: 0.313
Epoch: 218 	 Loss: -16441106432.000 	 Return: 0.313
Epoch: 219 	 Loss: -17968

Weights of neural network
tensor([[5.8448, 5.8447, 5.8444, 5.8608, 5.9411, 5.8448, 5.8830, 5.9400, 5.9107,
         6.3983]])
tensor([[6.8793]])


Plot Learning Progress

In [6]:
fig = pd.DataFrame.from_records(training_returns).set_index('epoch').iplot(title='Mean Return over training epochs',
                                                                           xTitle='Epochs',
                                                                           yTitle='Return',
                                                                           asFigure=True)
fig.update_layout(yaxis=dict(tickformat=".2%"))
fig.show()
fig = pd.DataFrame.from_records(test_returns).set_index('epoch').iplot(title='Mean Return over test epochs',
                                                                       xTitle='Epochs',
                                                                       yTitle='Return',
                                                                       asFigure=True)
fig.update_layout(yaxis=dict(tickformat=".2%"))
fig.show()
