In [None]:
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

In [None]:
def g_func(eps, A):
    """Return the clipping bound used by the PPO-Clip objective.

    Args:
        eps: Clipping parameter.
        A: Advantage value.

    Returns:
        ``(1 + eps) * A`` when ``A`` is non-negative, otherwise
        ``(1 - eps) * A``.
    """
    if A >= 0:
        scale = 1 + eps
    else:
        scale = 1 - eps
    return scale * A

In [None]:
def PPO_CLIP_objective(trajectories, last_trajectory, advantages, eps):
    """Return the PPO-Clip surrogate objective averaged over a batch.

    For every (trajectory, timestep) pair the term
    ``min(ratio * A, g_func(eps, A))`` is accumulated, where
    ``ratio = trajectories[i][t] / last_trajectory[...]`` and
    ``A = advantages[i][t]``. The sum is divided by the number of pairs.

    Args:
        trajectories: 2-D array-like (must expose ``.shape``) indexed as
            ``[trajectory][timestep]``. Presumably the action probabilities
            under the current policy -- TODO confirm with the caller.
        last_trajectory: The matching values to divide by (presumably from
            the previous policy). Either 1-D, indexed as ``[timestep]`` and
            shared by all trajectories, or 2-D, indexed like
            ``trajectories``.
        advantages: Advantage estimates indexed as ``[trajectory][timestep]``.
        eps: Clipping parameter forwarded to ``g_func``.

    Returns:
        The mean clipped objective over all trajectories and timesteps.
    """
    n_trajectories = trajectories.shape[0]
    n_timesteps = trajectories.shape[1]

    # A 1-D `last_trajectory` is shared across trajectories; a 2-D one is
    # indexed per trajectory.
    per_trajectory = getattr(last_trajectory, "ndim", 1) > 1

    total = 0
    for i in range(n_trajectories):
        old_row = last_trajectory[i] if per_trajectory else last_trajectory
        for t in range(n_timesteps):
            # Index with the loop counter, not the whole `trajectories` array.
            advantage = advantages[i][t]
            ratio = trajectories[i][t] / old_row[t]
            # The unclipped term is the ratio weighted by the advantage;
            # g_func supplies the clipped bound.
            total += min(ratio * advantage, g_func(eps, advantage))

    # Normalise by the same two axes that were iterated over.
    return total / (n_trajectories * n_timesteps)

In [None]:
class Actor(nn.Module):
    """Two-layer fully connected actor network.

    Applies ``Linear(in_size, hidden_size) -> ReLU ->
    Linear(hidden_size, out_size) -> ReLU``.

    Args:
        in_size: Number of input features.
        hidden_size: Width of the hidden layer.
        out_size: Number of output features.
    """

    def __init__(self, in_size, hidden_size, out_size):
        # nn.Module must be initialised before any submodule is assigned;
        # without this call, setting self.l1 raises AttributeError.
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.l1 = nn.Linear(in_size, hidden_size)
        self.ac1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, out_size)
        # NOTE(review): a ReLU on the output keeps values non-negative but
        # unnormalised; if these are meant to be action probabilities a
        # softmax is presumably needed -- confirm.
        self.ac2 = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both linear layers, applying ReLU after each."""
        x = self.l1(x)
        x = self.ac1(x)
        x = self.l2(x)
        x = self.ac2(x)
        return x

In [None]:
class Critic(nn.Module):
    """Two-layer fully connected critic network.

    Applies ``Linear(in_size, hidden_size) -> ReLU ->
    Linear(hidden_size, out_size) -> ReLU``.

    Args:
        in_size: Number of input features.
        hidden_size: Width of the hidden layer.
        out_size: Number of output features.
    """

    def __init__(self, in_size, hidden_size, out_size):
        # nn.Module must be initialised before any submodule is assigned;
        # without this call, setting self.l1 raises AttributeError.
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.l1 = nn.Linear(in_size, hidden_size)
        self.ac1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, out_size)
        # NOTE(review): the output ReLU clamps estimates to be non-negative;
        # confirm this is intended if negative values must be representable.
        self.ac2 = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both linear layers, applying ReLU after each."""
        x = self.l1(x)
        x = self.ac1(x)
        x = self.l2(x)
        x = self.ac2(x)
        return x

In [None]:
class GAE:
    """Generalized Advantage Estimation for a batch of fixed-length rollouts.

    Args:
        n_workers: Number of rows in the advantage array (one per worker).
        worker_steps: Number of timesteps per worker.
        gamma: Discount factor.
        lambda_: GAE smoothing factor.
    """

    def __init__(self, n_workers, worker_steps, gamma, lambda_):
        self.lambda_ = lambda_
        self.gamma = gamma
        self.worker_steps = worker_steps
        self.n_workers = n_workers

    def __call__(self, done, rewards, values):
        """Compute advantages by iterating backwards over the timesteps.

        Args:
            done: 2-D array indexed ``[:, t]``; a value of 1 at step ``t``
                zeroes the bootstrapped value and the carried advantage.
            rewards: 2-D array indexed ``[:, t]`` with the reward per step.
            values: 2-D array of value estimates. ``values[:, -1]`` is used
                as the bootstrap value after the final step, so this
                presumably has ``worker_steps + 1`` columns -- TODO confirm
                with the caller.

        Returns:
            ``float32`` array of shape ``(n_workers, worker_steps)``.
        """
        advantages = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        last_advantage = 0
        last_value = values[:, -1]
        for t in reversed(range(self.worker_steps)):
            # Nothing carries over from the step after an episode ended.
            mask = 1.0 - done[:, t]
            last_value = last_value * mask
            last_advantage = last_advantage * mask
            # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t).
            delta = rewards[:, t] + self.gamma * last_value - values[:, t]
            # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}.
            last_advantage = delta + self.gamma * self.lambda_ * last_advantage
            advantages[:, t] = last_advantage
            last_value = values[:, t]
        # Return only after every timestep has been processed.
        return advantages

Outline of how the algorithm works:
   - Initialize networks (and GAE if using it)
  
While running:
   - Collect a batch of trajectories (over the next k timesteps) from the Actor
   - Compute the reward-to-go for each trajectory at each step
   - Compute the estimated advantage for each trajectory (from either the Critic or GAE; we need to choose one)
   - Update the Actor's parameters by maximizing PPO_CLIP_objective over the generated trajectories (gradient ascent via Adam)
   - Update the Critic's parameters by minimizing the MSE loss between its estimated advantage and the actual advantage (environment reward)

Hyperparams:
   - Hidden Size Actor Net
   - Hidden Size Critic Net
   - Number of epochs to train for
   - lambda, gamma, worker_steps, n_workers if using GAE
   - eps for g_func (determines clipping)
   - learning_rate
   - activation functions, optimizer used
   - types of layers used in nets (will probably end up adding convolutional layers, but using linear layers to start)



In [None]:
# Mean-squared-error loss; presumably used for the Critic update.
loss_func1 = nn.MSELoss()
# NOTE: `some_vars` is a placeholder that is not defined yet. Both constructors
# take (in_size, hidden_size, out_size).
model1 = Actor(some_vars)
model2 = Critic(some_vars)
# nn.Module exposes its trainable tensors through .parameters(); it has no
# .params() method.
opt1 = optim.Adam(model1.parameters())
opt2 = optim.Adam(model2.parameters())
#Some vars will depend on which subset of variables we decide to use

In [None]:
'''Need to implement above training loop here'''