In [1]:
import gym
import numpy as np
from gym import wrappers
from policy_v2 import Policy
from value import NNValueFunction
import scipy.signal
from utils import Logger, Scaler
from datetime import datetime
import os
import argparse
import signal

In [2]:
# Run single episode: it returns a 4-tuple of NumPy arrays holding the
# scaled observations, actions, rewards and unscaled observations
def run_episode(env, policy, scaler, animate=False):
    """Roll out one episode with `policy` and return the recorded trajectory.

    A time-step feature (0.0 at the first step, growing by 1e-3 per step) is
    appended to every observation before it is scaled and passed to the policy.

    Args:
        env: environment exposing reset(), step(action) and render().
        policy: object whose sample(obs) returns the action for one observation
            of shape (1, obs_dim).
        scaler: object whose get() returns a (scale, offset) pair of
            per-feature arrays.
        animate: if True, env.render() is called before every step.

    Returns:
        Tuple (observes, actions, rewards, unscaled_observes):
            observes: scaled observations, shape (episode len, obs_dim).
            actions: concatenation of the sampled actions; shape
                (episode len, act_dim) assuming policy.sample returns
                (1, act_dim) arrays.
            rewards: float32 array, shape (episode len,).
            unscaled_observes: observations before scaling (used to train the
                scaler), shape (episode len, obs_dim).
    """
    obs = env.reset()
    observes, actions, rewards, unscaled_observes = [], [], [], []
    done = False
    step = 0.0
    scale, offset = scaler.get()
    # Work on private copies: writing into the arrays returned by scaler.get()
    # could overwrite the scaler's own running statistics if it hands out
    # references to its internal state.
    scale = np.array(scale)
    offset = np.array(offset)
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while not done:
        if animate:
            env.render()
        obs = np.concatenate([obs, [step]])  # add time step feature
        obs = obs.astype(np.float32).reshape((1, -1))
        unscaled_observes.append(obs)
        obs = np.float32((obs - offset) * scale)  # center and scale observations
        observes.append(obs)
        action = policy.sample(obs)
        actions.append(action)
        obs, reward, done, _ = env.step(action.flatten())
        rewards.append(reward)
        step += 1e-3  # increment time step feature

    return (np.concatenate(observes),
            np.concatenate(actions),
            np.array(rewards, dtype=np.float32),
            np.concatenate(unscaled_observes))

In [3]:
# Run policy: collect a fixed number of episodes and return their trajectories
def run_policy(env, policy, scaler, logger, episodes):
    """Run `episodes` rollouts, update the scaler, and optionally log a summary.

    Args:
        env: environment forwarded to run_episode().
        policy: policy forwarded to run_episode().
        scaler: forwarded to run_episode(); its update() is then called once
            with the unscaled observations of all collected episodes.
        logger: if not None, its log() receives the mean undiscounted episode
            return ('_MeanReward') and the total step count ('Steps').
        episodes: number of episodes to run.

    Returns:
        List of trajectory dictionaries with keys 'observes', 'actions',
        'rewards' and 'unscaled_observes'.
    """
    field_names = ('observes', 'actions', 'rewards', 'unscaled_observes')
    trajectories = []
    for _ in range(episodes):
        # run_episode returns its arrays in exactly the field_names order
        trajectories.append(dict(zip(field_names, run_episode(env, policy, scaler))))

    total_steps = sum(traj['observes'].shape[0] for traj in trajectories)

    # update running statistics for scaling observations
    all_unscaled = np.concatenate([traj['unscaled_observes'] for traj in trajectories])
    scaler.update(all_unscaled)

    if logger is not None:
        episode_returns = [traj['rewards'].sum() for traj in trajectories]
        logger.log({'_MeanReward': np.mean(episode_returns), 'Steps': total_steps})

    return trajectories

In [4]:
# Attach the value function's estimate to every time step of every trajectory
def add_value(trajectories, value_func):
    """Store predicted state values under the 'values' key of each trajectory.

    Args:
        trajectories: list of dictionaries, each with an 'observes' entry.
        value_func: object whose predict(observes) returns the predicted
            state values for those observations.

    Each trajectory is modified in place; nothing is returned.
    """
    for traj in trajectories:
        predicted = value_func.predict(traj['observes'])
        traj['values'] = predicted.flatten()  # stored as a 1-D array

In [6]:
# Discounted forward sum of a sequence, evaluated at every position
def discount(x, gamma):
    """Return y with y[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...

    The sum is computed by running the recursive filter
    out[n] = in[n] + gamma * out[n-1] over the reversed sequence and then
    reversing the result back.
    """
    numerator = [1.0]
    denominator = [1.0, -gamma]
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards)
    return accumulated[::-1]

# Compute the discounted sum of rewards and the advantages of all trajectories
def add_discount_sum_reward_and_advantage(trajectories, gamma):
    """Add 'disc_sum_rew' and 'advantages' entries to every trajectory.

    Args:
        trajectories: list of dictionaries, each with 'rewards' and 'values'
            arrays.
        gamma: discount factor. When gamma < 0.999 the rewards are first
            multiplied by (1 - gamma); for gamma ~= 1 they are used as is.

    Each trajectory is modified in place; nothing is returned.
    """
    rescale_rewards = gamma < 0.999  # don't scale for gamma ~= 1
    for traj in trajectories:
        rewards = traj['rewards']
        if rescale_rewards:
            rewards = rewards * (1 - gamma)
        traj['disc_sum_rew'] = discount(rewards, gamma)

        values = traj['values']
        # discounted value of the following state; 0 after the last step
        discounted_next_values = np.append(values[1:] * gamma, 0)
        # temporal difference error to estimate advantage
        traj['advantages'] = rewards - values + discounted_next_values

In [7]:
# Log various batch statistics
def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode):
    """Send summary statistics of one training batch to `logger`.

    For observations, actions, advantages and discounted reward sums the mean,
    minimum, maximum and standard deviation are logged. For the 2-D inputs
    (observes, actions) the '_std_*' entry is the per-column standard
    deviation averaged over columns.

    The '_std_*' entries previously held variances (np.var) despite their
    names; they now hold standard deviations (np.std).

    Args:
        observes: batch observations; reduced along axis 0 for '_std_obs'.
        actions: batch actions; reduced along axis 0 for '_std_act'.
        advantages: batch advantages.
        disc_sum_rew: batch discounted sums of rewards.
        logger: object whose log() accepts a dictionary of name -> value.
        episode: episode counter stored under '_Episode'.
    """
    logger.log({'_mean_obs': np.mean(observes), '_min_obs': np.min(observes),
                '_max_obs': np.max(observes), '_std_obs': np.mean(np.std(observes, axis=0)),
                '_mean_act': np.mean(actions), '_min_act': np.min(actions),
                '_max_act': np.max(actions), '_std_act': np.mean(np.std(actions, axis=0)),
                '_mean_adv': np.mean(advantages), '_min_adv': np.min(advantages),
                '_max_adv': np.max(advantages), '_std_adv': np.std(advantages),
                '_mean_discrew': np.mean(disc_sum_rew), '_min_discrew': np.min(disc_sum_rew),
                '_max_discrew': np.max(disc_sum_rew), '_std_discrew': np.std(disc_sum_rew), '_Episode': episode})

In [8]:
# Hyperparameters
num_episodes = 3000  # training stops once at least this many episodes were collected
gamma = 0.995  # discount factor used for the discounted reward sums and advantages
batch_size = 20  # episodes collected per policy / value-function update
hid1_size = 10  # passed to NNValueFunction and Policy (presumably a hidden-layer size multiplier -- confirm)
kl_targ = 0.01  # passed to Policy (presumably the target KL divergence per update -- confirm)
init_logvar = -1.0  # passed to Policy (presumably the initial policy log-variance -- confirm)

# Initialize gym environment
env_name = "HalfCheetah-v2"
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

# Initialize some global variables
now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories for logs
logger = Logger(logname=env_name, now=now) # Training log
nameDirLogWeights = f"log-weights/TRPO-HalfCheetah-v2-{now}" # Weights log (not used further in this cell)
aigym_path = os.path.join('/tmp', env_name, now)
env = wrappers.Monitor(env, aigym_path, force=True)
scaler = Scaler(obs_dim)
value_function = NNValueFunction(obs_dim, hid1_size)
policy = Policy(obs_dim, act_dim, kl_targ, hid1_size, init_logvar)

# The try/finally guarantees the logger is closed even when training raises or
# the cell is interrupted part-way through.
try:
    # run a few episodes of untrained policy to initialize scaler
    # (these warm-up episodes are not counted in `episode`):
    run_policy(env, policy, scaler, logger, episodes=5)

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, value_function)  # add estimated values to episodes
        add_discount_sum_reward_and_advantage(trajectories, gamma) # calculate discounted sum of Rs and the advantage
        # concatenate all episodes into single NumPy arrays
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        value_function.fit(observes, disc_sum_rew, logger, episode, env_name)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
finally:
    logger.close()

Value Params -- h1: 180, h2: 30, h3: 5, lr: 0.00183
Policy Params -- h1: 180, h2: 103, h3: 60, lr: 8.87e-05, logvar_speed: 12
***** Episode 20, Mean R = -305.9 *****
Beta: 0.667
ExplainedVarNew: -0.239
ExplainedVarOld: -0.72
KL: 0.000906
PolicyEntropy: 5.49
PolicyLoss: 0.00046
Steps: 2e+04
ValFuncLoss: 0.0283


***** Episode 40, Mean R = -296.1 *****
Beta: 0.444
ExplainedVarNew: -0.313
ExplainedVarOld: -0.328
KL: 0.000329
PolicyEntropy: 5.49
PolicyLoss: -0.000354
Steps: 2e+04
ValFuncLoss: 0.0294


***** Episode 60, Mean R = -290.6 *****
Beta: 0.296
ExplainedVarNew: -0.286
ExplainedVarOld: -0.268
KL: 0.000367
PolicyEntropy: 5.49
PolicyLoss: -0.000149
Steps: 2e+04
ValFuncLoss: 0.0327


***** Episode 80, Mean R = -322.9 *****
Beta: 0.198
ExplainedVarNew: -0.313
ExplainedVarOld: -0.233
KL: 0.000523
PolicyEntropy: 5.48
PolicyLoss: 7.03e-05
Steps: 2e+04
ValFuncLoss: 0.0419


***** Episode 100, Mean R = -283.4 *****
Beta: 0.132
ExplainedVarNew: -0.288
ExplainedVarOld: -0.395
KL: 0.000979
Poli

***** Episode 880, Mean R = 1139.5 *****
Beta: 0.0964
ExplainedVarNew: -0.742
ExplainedVarOld: -0.682
KL: 0.0273
PolicyEntropy: 3.52
PolicyLoss: -0.00315
Steps: 2e+04
ValFuncLoss: 0.271


***** Episode 900, Mean R = 1224.7 *****
Beta: 0.0964
ExplainedVarNew: -0.742
ExplainedVarOld: -0.61
KL: 0.0094
PolicyEntropy: 3.46
PolicyLoss: -0.002
Steps: 2e+04
ValFuncLoss: 0.193


***** Episode 920, Mean R = 1249.6 *****
Beta: 0.0964
ExplainedVarNew: -0.753
ExplainedVarOld: -0.749
KL: 0.00847
PolicyEntropy: 3.42
PolicyLoss: -0.00197
Steps: 2e+04
ValFuncLoss: 0.312


***** Episode 940, Mean R = 1334.1 *****
Beta: 0.0964
ExplainedVarNew: -0.755
ExplainedVarOld: -0.67
KL: 0.00891
PolicyEntropy: 3.33
PolicyLoss: -0.00181
Steps: 2e+04
ValFuncLoss: 0.213


***** Episode 960, Mean R = 1426.4 *****
Beta: 0.0964
ExplainedVarNew: -0.716
ExplainedVarOld: -0.781
KL: 0.00765
PolicyEntropy: 3.27
PolicyLoss: -0.00222
Steps: 2e+04
ValFuncLoss: 0.193


***** Episode 980, Mean R = 1453.0 *****
Beta: 0.0964
Explain

***** Episode 1760, Mean R = 3025.2 *****
Beta: 0.0643
ExplainedVarNew: -0.923
ExplainedVarOld: -0.852
KL: 0.00942
PolicyEntropy: 0.976
PolicyLoss: -0.00128
Steps: 2e+04
ValFuncLoss: 0.83


***** Episode 1780, Mean R = 3072.0 *****
Beta: 0.0643
ExplainedVarNew: -0.964
ExplainedVarOld: -0.923
KL: 0.00933
PolicyEntropy: 0.893
PolicyLoss: -0.00128
Steps: 2e+04
ValFuncLoss: 0.836


***** Episode 1800, Mean R = 3093.5 *****
Beta: 0.0643
ExplainedVarNew: -0.967
ExplainedVarOld: -0.977
KL: 0.00886
PolicyEntropy: 0.819
PolicyLoss: -0.00092
Steps: 2e+04
ValFuncLoss: 0.834


***** Episode 1820, Mean R = 3152.3 *****
Beta: 0.0643
ExplainedVarNew: -0.942
ExplainedVarOld: -0.934
KL: 0.00684
PolicyEntropy: 0.739
PolicyLoss: -0.00111
Steps: 2e+04
ValFuncLoss: 0.848


***** Episode 1840, Mean R = 3206.0 *****
Beta: 0.0643
ExplainedVarNew: -0.917
ExplainedVarOld: -0.896
KL: 0.00705
PolicyEntropy: 0.679
PolicyLoss: -0.00122
Steps: 2e+04
ValFuncLoss: 0.881


***** Episode 1860, Mean R = 3228.1 *****
Beta

***** Episode 2640, Mean R = 1568.6 *****
Beta: 0.0286
ExplainedVarNew: -0.809
ExplainedVarOld: -0.897
KL: 0.000241
PolicyEntropy: 0.295
PolicyLoss: -5.33e-05
Steps: 2e+04
ValFuncLoss: 0.263


***** Episode 2660, Mean R = 1593.3 *****
Beta: 0.0286
ExplainedVarNew: -0.992
ExplainedVarOld: -0.838
KL: 0.000588
PolicyEntropy: 0.293
PolicyLoss: -0.00055
Steps: 2e+04
ValFuncLoss: 0.276


***** Episode 2680, Mean R = 1581.9 *****
Beta: 0.0286
ExplainedVarNew: -0.924
ExplainedVarOld: -0.953
KL: 0.00149
PolicyEntropy: 0.285
PolicyLoss: -0.000247
Steps: 2e+04
ValFuncLoss: 0.282


***** Episode 2700, Mean R = 1569.5 *****
Beta: 0.0286
ExplainedVarNew: -0.905
ExplainedVarOld: -0.797
KL: 0.00144
PolicyEntropy: 0.278
PolicyLoss: -0.000109
Steps: 2e+04
ValFuncLoss: 0.344


***** Episode 2720, Mean R = 1633.9 *****
Beta: 0.0286
ExplainedVarNew: -0.887
ExplainedVarOld: -0.835
KL: 0.00229
PolicyEntropy: 0.274
PolicyLoss: -0.000585
Steps: 2e+04
ValFuncLoss: 0.294


***** Episode 2740, Mean R = 1697.0 ***

In [None]:
# run_episode(env, policy, scaler, animate=True)