In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
from torch import nn 
import torch.optim as optim
from torch.distributions import Normal
from environment.WalkerEnv import WalkerEnv
from WalkerPolicy import WalkerPolicy
from solution import ppo_loss, value_loss
import torch
from utils.plotting import plot_training
import numpy as np
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
# Notebook-level flag. NOTE(review): no cell visible in this notebook reads it — confirm it is still needed.
VISUALIZE = True
# Presumably the number of parallel environments. NOTE(review): the training-setup cell further down reassigns N to 256.
N=5

In [None]:
def sample_trajectories(env, pi, T):
    """Roll out the stochastic policy ``pi`` in the vectorized environment ``env`` for ``T`` steps.

    The number of parallel environments is read from the first axis of the state
    batch returned by ``env.vector_reset()`` rather than from a notebook-level
    global, so the buffers always match the environment that is actually passed in.

    Args:
        env: vectorized environment exposing ``num_states``, ``num_actions``,
            ``vector_reset()`` (returns the initial state batch) and
            ``vector_step(actions)`` (returns ``(next_states, rewards)``).
        pi: policy exposing ``sample_actions(states)``; it is called with a float
            torch tensor holding the current state batch.
        T: number of environment steps to simulate.

    Returns:
        Tuple ``(tensor_s, tensor_a, tensor_r)`` of float torch tensors with shapes
        ``(T + 1, n_envs, env.num_states)``, ``(T, n_envs, env.num_actions)`` and
        ``(T, n_envs)``. Mind the extra trailing state s(T) in ``tensor_s``.
    """
    # atleast_2d keeps a single un-batched state usable as a batch of one environment
    s0 = np.atleast_2d(np.asarray(env.vector_reset(), dtype=float))
    n_envs = s0.shape[0]

    states = np.zeros((T + 1, n_envs, env.num_states), dtype=float)  # s(0) .. s(T)
    actions = np.zeros((T, n_envs, env.num_actions), dtype=float)  # a(0) .. a(T-1)
    rewards = np.zeros((T, n_envs), dtype=float)  # r(0) .. r(T-1)

    states[0] = s0
    with torch.no_grad():  # rollouts only collect data; no autograd graph is needed here
        for t in range(T):
            a = pi.sample_actions(torch.tensor(states[t]).float())  # policy needs a float torch tensor
            a = np.array(a)  # env needs a numpy array
            s_next, r = env.vector_step(a)
            states[t + 1], actions[t], rewards[t] = s_next, a, r

    tensor_s = torch.tensor(states).float()  # (T+1, n_envs, state_dim) - one more entry than actions/rewards
    tensor_a = torch.tensor(actions).float()  # (T, n_envs, action_dim)
    tensor_r = torch.tensor(rewards).float()  # (T, n_envs)

    return tensor_s, tensor_a, tensor_r

In [None]:
from solution import discount_cum_sum

def compute_advantage_estimates(tensor_r, values, gamma, bootstrap=False):
    """Build value targets from discounted rewards and subtract the value baseline.

    Args:
        tensor_r: reward tensor, documented by the caller as (T, N).
        values: value estimates for every visited state, (T+1, N).
        gamma: scalar discount factor forwarded to ``discount_cum_sum``.
        bootstrap: when True, the value estimate of the final state is appended
            to the rewards as one extra row before discounting, and the
            resulting extra row is dropped again afterwards.

    Returns:
        ``(value_targets, advantages)`` where
        ``advantages = value_targets - values[:-1]``.
    """
    baseline = values[:-1]  # value estimates of the states an action was taken from

    if not bootstrap:
        returns = discount_cum_sum(tensor_r, gamma)
        return returns, returns - baseline

    # Treat the last state's value estimate as a final pseudo-reward row (1, N).
    tail_value = values[-1].unsqueeze(0)
    rewards_with_tail = torch.cat((tensor_r, tail_value), dim=0)
    returns = discount_cum_sum(rewards_with_tail, gamma)[:-1]
    return returns, returns - baseline


def compute_gae(tensor_r, values, gamma, lambda_):
    """Generalized advantage estimation (GAE).

    Forms the one-step TD residuals ``r + gamma * V(s') - V(s)`` and passes them
    through ``discount_cum_sum`` with factor ``gamma * lambda_``.

    Args:
        tensor_r: reward tensor; must line up with ``values[:-1]``.
        values: value estimates including one extra trailing entry for the final state.
        gamma: scalar discount factor.
        lambda_: scalar GAE mixing coefficient.

    Returns:
        ``(value_targets, advantages)`` where
        ``value_targets = advantages + values[:-1]``.
    """
    v_now, v_next = values[:-1], values[1:]
    td_residuals = tensor_r + gamma * v_next - v_now
    gae = discount_cum_sum(td_residuals, gamma * lambda_)
    return gae + v_now, gae

In [None]:
def test_policy(pi, config, T=128, deterministic=True):
    """Run ``pi`` in a freshly created ``WalkerEnv`` for ``T`` steps and report the mean reward.

    The environment is closed in a ``finally`` clause so that it is released even
    when the policy or the simulator raises part-way through the rollout.

    Args:
        pi: policy exposing ``determine_actions(states)`` and ``sample_actions(states)``.
        config: configuration dict passed to ``WalkerEnv``; must contain the key
            ``'N'``, which is used to normalize the reward.
        T: number of environment steps to simulate.
        deterministic: if True use ``pi.determine_actions``, otherwise
            ``pi.sample_actions``.

    Returns:
        Sum of all collected rewards divided by ``T * config['N']``.

    Side effects:
        Prints the largest value of ``s[0, 0]`` seen during the rollout
        (presumably the x position of the first walker - TODO confirm).
    """
    test_env = WalkerEnv(config)
    mean_reward = 0

    try:
        s = test_env.vector_reset()
        x = s[0, 0]
        for _ in range(T):
            with torch.no_grad():  # evaluation only, no gradients needed
                s_tensor = torch.tensor(s).float()
                if deterministic:
                    actions = pi.determine_actions(s_tensor)
                else:
                    actions = pi.sample_actions(s_tensor)
            s, r = test_env.vector_step(actions.numpy())
            x = max(x, s[0, 0])
            mean_reward += sum(r) / (T * config['N'])

        print(f"Max x: {x}")
    finally:
        # Always release the environment, even if the rollout failed.
        test_env.close()
    return mean_reward

In [None]:
def walker_reward(state, action):
    """Reward equal to 1.5 times the first generalized-velocity component of ``state``.

    Per the layout noted by the original author, the first 15 entries of the
    state vector are generalized coordinates [xyz, quat, joint_angles] and the
    remaining entries are generalized velocities
    [xyz_vel, omega, joint_velocities], so the reward is the scaled x velocity.

    Args:
        state: indexable state vector.
        action: unused; accepted so the signature has the ``(state, action)`` form.

    Returns:
        ``1.5 *`` the x velocity.
    """
    num_coordinates = 15  # velocities start right after the generalized coordinates
    x_velocity = state[num_coordinates:][0]
    return x_velocity*1.5

In [None]:
# Training setup: environment configuration, RNG seed, hyperparameters, policy and optimizer.
N = 256  # forwarded to WalkerEnv as "N"; presumably the number of parallel walkers - TODO confirm
base_config = {
    "N": N,
    "vis": False,
    "track": 0,
    # "reward_fcn": walker_reward
    # NOTE(review): with the line above commented out the environment presumably falls back
    # to its built-in reward - confirm this is intended, since the evaluation cell does pass walker_reward.
}
torch.manual_seed(42)  # seeds torch only; runs before the policy is constructed below

# training parameters
epochs = 500  # number of collect-then-update iterations of the training loop
gamma = 0.95  # discount factor handed to the advantage estimator
epsilon = 0.2  # passed to ppo_loss(epsilon=...); presumably the PPO clipping range - TODO confirm
sgd_iters = 5  # gradient steps taken on each batch of collected trajectories
T = 512  # environment steps collected per epoch
# policy, environment and optimizer
pi = WalkerPolicy(state_dim=29, action_dim=8)
train_env = WalkerEnv(base_config)
lr = 0.001  # Adam learning rate
optimizer = optim.Adam(pi.parameters(), lr=lr)
mean_rewards, p_losses, v_losses = np.zeros(epochs), np.zeros(epochs), np.zeros(epochs)  # per-epoch logs: mean reward, policy loss, value loss

In [None]:
# PPO training loop: each epoch collects fresh trajectories with the current policy,
# then takes `sgd_iters` gradient steps on the combined value + policy loss.
for epoch in range(epochs):
    tensor_s, tensor_a, tensor_r = sample_trajectories(train_env, pi, T)  # collect trajectories using current policy

    with torch.no_grad():  # log-probabilities under the data-collecting policy, kept fixed during the updates
        logp_old = pi.log_prob(tensor_a, tensor_s[:T]).squeeze(2)  # log(pi_old(a_t | s_t))

    for i in range(sgd_iters):  # several gradient steps on the same batch
        values = pi.value_estimates(tensor_s)  # value estimates for all T+1 states
        logp = pi.log_prob(tensor_a, tensor_s[:T]).squeeze(2)  # log(pi(a_t | s_t)) under the current parameters

        with torch.no_grad():  # targets and advantages are treated as constants
            # Targets are recomputed from the current value estimates on every SGD iteration.
            value_targets, advantage_estimates = compute_advantage_estimates(tensor_r, values, gamma, bootstrap=True)
            # Normalize advantages; the small constant keeps the division finite
            # when all advantages are (almost) identical and std() is ~0.
            advantage_estimates = (advantage_estimates - advantage_estimates.mean()) / (advantage_estimates.std() + 1e-8)

        L_v = value_loss(values[:T], value_targets)  # value-function regression loss

        p_ratios = torch.exp(logp - logp_old)  # probability ratios pi(a_t | s_t) / pi_old(a_t | s_t)
        L_ppo = ppo_loss(p_ratios, advantage_estimates, epsilon=epsilon)  # policy loss
        total_loss = L_v + L_ppo

        optimizer.zero_grad()
        total_loss.backward()  # backprop and gradient step
        optimizer.step()

    # Log the statistics of the last SGD iteration of this epoch.
    epoch_mean_reward = tensor_r.mean().item()
    if epoch % 10 == 0:
        print('Epoch %d, mean reward: %.3f, value loss: %.3f' % (epoch, epoch_mean_reward, L_v.item()))
    mean_rewards[epoch] = epoch_mean_reward
    v_losses[epoch] = L_v.item()
    p_losses[epoch] = L_ppo.item()

plot_training(mean_rewards, p_losses, v_losses)

In [None]:
# Evaluation run: 4 environments with visualization switched on, using the
# custom walker_reward, rolled out for 1024 steps with the trained policy.
config = dict(N=4, vis=True, track=0, reward_fcn=walker_reward)
test_policy(pi, config, T=1024)

In [None]:
# Persist the trained policy; called without arguments, so the destination is whatever WalkerPolicy.save_weights defaults to.
pi.save_weights()

In [None]:
# NOTE(review): this cell repeats the pi.save_weights() call of the previous cell; one of the two is presumably redundant.
pi.save_weights()