In [1]:
import gym
import tensorflow as tf
import numpy as np
from gym import wrappers
from policy import Policy
from value import NNValueFunction
import scipy.signal
from utils import Logger, Scaler
from datetime import datetime
import os
import argparse
import signal

In [2]:
def run_episode(env, policy, scaler, animate=False):
    """Run a single episode with the current policy and record the trajectory.

    Args:
        env: environment exposing reset(), step(action) and render(); the
            result of step() is unpacked as (obs, reward, done, info).
        policy: object whose sample(obs, last_action, env) returns a pair
            (action, action_prob).
        scaler: object whose get() returns (scale, offset), applied to each
            observation as (obs - offset) * scale.
        animate: if True, env.render() is called before every step.

    Returns:
        A 5-tuple (observes, actions, rewards, unscaled_observes, entropy):
            observes: scaled observations, shape = (episode len, obs_dim)
            actions: list of the sampled actions
            rewards: float32 array, shape = (episode len,)
            unscaled_observes: raw observations (useful for training the
                scaler), shape = (episode len, obs_dim)
            entropy: sum over all steps of -sum(p * log(p)) of the action
                probabilities, accumulated with TensorFlow ops
        obs_dim here includes the appended time step feature.
    """
    obs = env.reset()
    observes, actions, rewards, unscaled_observes = [], [], [], []
    done = False
    step = 0.0
    scale, offset = scaler.get()
    # Work on private copies: the in-place writes below must not leak into
    # whatever arrays the scaler handed out (they may be its running stats).
    scale = np.array(scale, copy=True)
    offset = np.array(offset, copy=True)
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    # NOTE(review): last_action is never updated, so policy.sample() always
    # receives None for it - confirm this is intended.
    last_action = None
    entropy = 0
    while not done:
        if animate:
            env.render()
        obs = np.concatenate([obs, [step]])  # add time step feature
        obs = obs.astype(np.float32).reshape((1, -1))
        unscaled_observes.append(obs)
        obs = np.float32((obs - offset) * scale)  # center and scale observations
        observes.append(obs)
        # draw a sample from the policy
        action, action_prob = policy.sample(obs, last_action, env)
        entropy += -tf.reduce_sum(action_prob * tf.math.log(action_prob))
        actions.append(action)
        # perform a step in the environment; get the next observation and reward
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        step += 1e-3  # increment time step feature
    return (np.concatenate(observes),
            actions,
            np.array(rewards, dtype=np.float32),
            np.concatenate(unscaled_observes),
            entropy)

In [3]:
def run_policy(env, policy, scaler, logger, episodes):
    """Run a fixed number of episodes and collect their trajectories.

    Args:
        env: environment passed through to run_episode().
        policy: policy passed through to run_episode().
        scaler: observation scaler; its update() is called once, after all
            episodes, with the concatenated unscaled observations.
        logger: object with a log(dict) method, or None to skip logging.
        episodes: number of episodes to run; must be at least 1.

    Returns:
        A pair (trajectories, entropy):
            trajectories: list of dicts with keys 'observes', 'actions',
                'rewards' and 'unscaled_observes', one per episode
            entropy: mean over the batch of the per-episode entropy returned
                by run_episode()

    Raises:
        ValueError: if episodes is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")
    total_steps = 0
    trajectories = []
    entropy_sum = 0
    for _ in range(episodes):
        observes, actions, rewards, unscaled_observes, entropy = run_episode(env, policy, scaler)
        total_steps += observes.shape[0]
        trajectories.append({'observes': observes, 'actions': actions, 'rewards': rewards,
                             'unscaled_observes': unscaled_observes})
        # Accumulate every episode's entropy; keeping only the loop variable
        # would report the last episode alone for the whole batch.
        entropy_sum = entropy_sum + entropy
    unscaled = np.concatenate([t['unscaled_observes'] for t in trajectories])
    scaler.update(unscaled)  # update running statistics for scaling observations
    if logger is not None:
        logger.log({'_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]), 'Steps': total_steps})

    mean_entropy = entropy_sum / len(trajectories)
    return trajectories, mean_entropy

In [4]:
def add_value(trajectories, value_func):
    """Attach estimated state values to every trajectory, in place.

    Args:
        trajectories: iterable of dicts, each holding an 'observes' entry.
        value_func: object whose predict(observes) returns the predicted
            state values for those observations.

    Each trajectory gains a 'values' entry holding the flattened
    predictions. Nothing is returned.
    """
    for traj in trajectories:
        predicted = value_func.predict(traj['observes'])
        traj['values'] = predicted.flatten()

In [5]:
def discount(x, gamma):
    """Return the discounted forward sum of sequence x at each position.

    Element i of the result is x[i] + gamma * x[i+1] + gamma**2 * x[i+2] + ...
    It is computed by running a first-order recursive filter over the
    reversed sequence and reversing the output back.
    """
    numerator = [1.0]
    denominator = [1.0, -gamma]
    backwards = x[::-1]
    filtered = scipy.signal.lfilter(numerator, denominator, backwards)
    return filtered[::-1]

def add_discount_sum_reward_and_advantage(trajectories, gamma):
    """Add discounted reward sums and advantages to every trajectory, in place.

    Args:
        trajectories: iterable of dicts holding 'rewards' and 'values'.
        gamma: discount factor.

    Each trajectory gains two entries:
        'disc_sum_rew': discount(rewards, gamma)
        'advantages': one-step temporal difference error,
            rewards - values + gamma * next values, with 0 standing in for
            the value after the final step
    When gamma < 0.999 the rewards are first multiplied by (1 - gamma);
    for gamma ~= 1 they are used unscaled.
    """
    for traj in trajectories:
        # don't scale for gamma ~= 1
        rewards = traj['rewards'] * (1 - gamma) if gamma < 0.999 else traj['rewards']
        traj['disc_sum_rew'] = discount(rewards, gamma)

        values = traj['values']
        # discounted value of the following state; 0 after the last step
        next_values = np.append(values[1:] * gamma, 0)
        # temporal difference error to estimate advantage
        traj['advantages'] = rewards - values + next_values

In [6]:
def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode):
    """Send summary statistics of one training batch to the logger.

    For observations, actions, advantages and discounted reward sums the
    mean, minimum, maximum and a spread figure are computed and handed to
    logger.log() as a single dict, together with the episode counter.

    NOTE(review): the '_std_*' entries hold variances (np.var), not standard
    deviations; for observations and actions it is the mean of the
    per-column variances.
    """
    stats = {}

    stats['_mean_obs'] = np.mean(observes)
    stats['_min_obs'] = np.min(observes)
    stats['_max_obs'] = np.max(observes)
    stats['_std_obs'] = np.mean(np.var(observes, axis=0))

    stats['_mean_act'] = np.mean(actions)
    stats['_min_act'] = np.min(actions)
    stats['_max_act'] = np.max(actions)
    stats['_std_act'] = np.mean(np.var(actions, axis=0))

    stats['_mean_adv'] = np.mean(advantages)
    stats['_min_adv'] = np.min(advantages)
    stats['_max_adv'] = np.max(advantages)
    stats['_std_adv'] = np.var(advantages)

    stats['_mean_discrew'] = np.mean(disc_sum_rew)
    stats['_min_discrew'] = np.min(disc_sum_rew)
    stats['_max_discrew'] = np.max(disc_sum_rew)
    stats['_std_discrew'] = np.var(disc_sum_rew)

    stats['_Episode'] = episode
    logger.log(stats)

In [7]:
# Training configuration
num_episodes = 3000  # total number of episodes to train for
gamma = 0.995  # discount factor
batch_size = 20  # episodes collected per policy / value-function update
# presumably a hidden-layer size multiplier (the log reports h1: 144) -
# confirm in policy.py / value.py
hid1_size = 8

# Initialize gym environment
env_name = "HalfCheetah-v2"
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

# Initialize some global variables
now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories for logs
logger = Logger(logname=env_name, now=now)  # Training log
nameDirLogWeights = f"log-weights/TRPO-HalfCheetah-v2-{now}"  # Weights log
aigym_path = os.path.join('/tmp', env_name, now)
env = wrappers.Monitor(env, aigym_path, force=True)
scaler = Scaler(obs_dim)
value_function = NNValueFunction(obs_dim, hid1_size)
policy = Policy(obs_dim, act_dim, hid1_size)

# The try/finally guarantees the logger is closed even when training is
# interrupted (e.g. KeyboardInterrupt from stopping the cell) or fails.
# The environment is left open on purpose so later cells can still use it.
try:
    # Run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)

    episode = 0
    while episode < num_episodes:
        # run batch_size episodes and collect trajectories
        trajectories, entropy = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, value_function)  # compute and add estimated values to episodes
        # compute and add discount sum rewards and advantages to trajectories
        add_discount_sum_reward_and_advantage(trajectories, gamma)
        # concatenate all episodes into single Numpy arrays
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # normalize advantages (the small epsilon avoids division by zero)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
        # add various stats to training log
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger, entropy, env)  # update policy
        value_function.fit(observes, disc_sum_rew, logger, episode, nameDirLogWeights)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
finally:
    logger.close()

Value Params -- h1: 144, h2: 26, h3: 5, lr: 0.00196
Policy Params -- h1: 144, h2: 58
Linesearch worked at  0
***** Episode 20, Mean R = -707.4 *****
ExplainedVarNew: -0.886
ExplainedVarOld: -0.177
KL: -6e-08
PolicyEntropy: 1.79e+03
PolicyLoss: 1.79
Steps: 2e+04
ValFuncLoss: 0.0504


Linesearch worked at  0
***** Episode 40, Mean R = -783.5 *****
ExplainedVarNew: -0.886
ExplainedVarOld: -0.708
KL: -6e-08
PolicyEntropy: 1.78e+03
PolicyLoss: 1.78
Steps: 2e+04
ValFuncLoss: 0.0598


Linesearch worked at  0
***** Episode 60, Mean R = -789.1 *****
ExplainedVarNew: -1.05
ExplainedVarOld: -1.02
KL: -6e-08
PolicyEntropy: 1.79e+03
PolicyLoss: 1.79
Steps: 2e+04
ValFuncLoss: 0.0585


Linesearch worked at  0
***** Episode 80, Mean R = -641.1 *****
ExplainedVarNew: -1.08
ExplainedVarOld: -1.07
KL: -6e-08
PolicyEntropy: 1.78e+03
PolicyLoss: 1.78
Steps: 2e+04
ValFuncLoss: 0.0603


Linesearch worked at  0
***** Episode 100, Mean R = -556.8 *****
ExplainedVarNew: -1.03
ExplainedVarOld: -1.63
KL: -6e-08
P

Linesearch worked at  0
***** Episode 840, Mean R = 469.1 *****
ExplainedVarNew: -0.894
ExplainedVarOld: -0.784
KL: -2.48e-08
PolicyEntropy: 270
PolicyLoss: 0.353
Steps: 2e+04
ValFuncLoss: 0.029


Linesearch worked at  0
***** Episode 860, Mean R = 471.7 *****
ExplainedVarNew: -0.872
ExplainedVarOld: -0.83
KL: -2.3e-08
PolicyEntropy: 261
PolicyLoss: 0.33
Steps: 2e+04
ValFuncLoss: 0.0311


Linesearch worked at  0
***** Episode 880, Mean R = 491.8 *****
ExplainedVarNew: -0.893
ExplainedVarOld: -0.904
KL: -2.14e-08
PolicyEntropy: 266
PolicyLoss: 0.338
Steps: 2e+04
ValFuncLoss: 0.0299


Linesearch worked at  0
***** Episode 900, Mean R = 515.3 *****
ExplainedVarNew: -0.912
ExplainedVarOld: -0.838
KL: -1.97e-08
PolicyEntropy: 248
PolicyLoss: 0.318
Steps: 2e+04
ValFuncLoss: 0.0323


Linesearch worked at  0
***** Episode 920, Mean R = 522.8 *****
ExplainedVarNew: -0.953
ExplainedVarOld: -0.887
KL: -1.88e-08
PolicyEntropy: 247
PolicyLoss: 0.319
Steps: 2e+04
ValFuncLoss: 0.0338


Linesearch wor

KeyboardInterrupt: 

In [8]:
#run_episode(env, policy, scaler, animate=True)

Creating window glfw


(array([[ 6.11046553e-01, -5.06131113e-01, -1.37420461e-01, ...,
         -4.54428606e-02, -4.02506962e-02,  0.00000000e+00],
        [ 5.46202421e-01, -6.13432646e-01,  6.17069378e-02, ...,
          1.77567685e+00,  1.20282304e+00,  1.00000005e-03],
        [ 3.95448714e-01, -7.19012439e-01,  3.75297546e-01, ...,
          1.23307860e+00,  1.04113460e+00,  2.00000009e-03],
        ...,
        [ 2.00358592e-02, -1.14863664e-01,  2.78079838e-01, ...,
          1.66731268e-01, -3.92918177e-02,  9.96999979e-01],
        [ 1.29601276e-02, -9.64251384e-02,  2.69828349e-01, ...,
         -5.16755246e-02,  1.05855510e-01,  9.98000026e-01],
        [ 1.06102169e-01,  3.75017673e-02, -1.37826040e-01, ...,
         -1.95069090e-01, -8.95883888e-02,  9.99000013e-01]], dtype=float32),
 [2,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,