In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import numpy as np
import pandas as pd
import cvxpy as cp
import gym
import matplotlib.pyplot as plt
from numpy.linalg import LinAlgError
import gurobipy

import sys
sys.path.append('./asebo/')
sys.path.append('./rl/')
from optimizers import Adam
from worker import worker, get_policy
from es import ES

from methods import Gradient_LP, Hessian_LP, Hessian_LP_structured, \
                    get_PTinverse, run_HessianES, aggregate_rollouts_hessianES,\
                    run_asebo

## Training

In [33]:
########### Setting up params ##########
params = {
# 'env_name': 'Swimmer-v2',
'env_name': 'HalfCheetah-v2',
# 'env_name': 'InvertedPendulum-v2',
# 'env_name': 'Reacher-v2',
# 'env_name': 'Hopper-v2',
'steps':1000,
'h_dim':16,
'start':0,
'max_iter':200,
'seed':0,
'k':140, # ASEBO only?
'num_sensings':125,
'log':0,
'threshold':0.995,
'decay':0.99,
'learning_rate':1,#0.05
'filename':'',
'policy':'Linear', # Linear or Toeplitz
'shift':0,
'min':10,
'sigma':1e-4,
'backtracking':True,
'alpha': 1e-6,
'beta': 0.5
}

gradient_estimator = gradient_antithetic_estimator
invhessian_estimator = invHessian_LP_structured_PTinv_estimator

In [34]:
########### Auto Param Setups ##########
params['dir'] = params['env_name'] + params['policy'] + '_h' + str(params['h_dim']) + '_lr' + str(params['learning_rate']) + '_num_sensings' + str(params['num_sensings']) +'_' + params['filename']
env = gym.make(params['env_name'])
params['ob_dim'] = env.observation_space.shape[0]
params['ac_dim'] = env.action_space.shape[0]
params['k'] += -1
# params['alpha'] = 1 # ASEBO only
params['zeros'] = False
if params['log']:
    params['num_sensings'] = 4 + int(3 * np.log(master.N))

## Train

In [35]:
sigmas = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]
params_set = [dict(params) for _ in range(len(sigmas))]
for idx, params in enumerate(params_set):
    params['sigma'] = sigmas[idx]

In [36]:
single_params = params_set[1]
single_params['learning_rate'] = 0.05
master = get_policy(single_params)
ts2, rewards2, master2 = run_asebo(single_params, master)

Iteration: 1, Rollouts: 250, Reward: -633.292750866657, Alpha: 1, Samples: 125
Iteration: 2, Rollouts: 500, Reward: -556.9645111738272, Alpha: 1, Samples: 125
Iteration: 3, Rollouts: 750, Reward: -348.4882223768085, Alpha: 1, Samples: 125
Iteration: 4, Rollouts: 1000, Reward: -486.0734762902246, Alpha: 1, Samples: 125
Iteration: 5, Rollouts: 1250, Reward: -372.39899645135324, Alpha: 1, Samples: 125
Iteration: 6, Rollouts: 1500, Reward: -104.53064364818921, Alpha: 1, Samples: 125
Iteration: 7, Rollouts: 1750, Reward: 127.06389460600107, Alpha: 1, Samples: 125
Iteration: 8, Rollouts: 2000, Reward: 171.81763463425838, Alpha: 1, Samples: 125
Iteration: 9, Rollouts: 2250, Reward: 121.8836890897548, Alpha: 1, Samples: 125
Iteration: 10, Rollouts: 2500, Reward: 184.8695350937522, Alpha: 1, Samples: 125
Iteration: 11, Rollouts: 2750, Reward: 257.81306034754914, Alpha: 1, Samples: 125
Iteration: 12, Rollouts: 3000, Reward: 393.37387860828517, Alpha: 1, Samples: 125
Iteration: 13, Rollouts: 3250

KeyboardInterrupt: 

## Save and Plot

In [None]:
np.save("./data/{}/asebo_params.npy".format(params['dir']), master2.params)
np.save("./data/{}/asebo_ts.npy".format(params['dir']), ts2)
np.save("./data/{}/asebo_rs.npy".format(params['dir']), rewards2)




In [None]:
# asebo_ts = np.load("./data/InvertedPendulum-v2Toeplitz_h32_lr0.05_k140__asebo_ts.npy")
# asebo_rewards = np.load("./data/InvertedPendulum-v2Toeplitz_h32_lr0.05_k140__asebo_rs.npy")

hessian_ts = np.load("./data/HalfCheetah-v2Linear_h16_lr1_num_sensings125__hessian_ts.npy")
hessian_rewards = np.load("./data/HalfCheetah-v2Linear_h16_lr1_num_sensings125__hessian_rs.npy")

In [30]:
# plt.plot(asebo_ts, asebo_rewards, label="ASEBO")
# plt.plot(lpgrad_ts, lpgrad_rewards, label="LP gradient")
plt.plot(ts, rewards, label="LP Hessian")
plt.legend()


NameError: name 'ts' is not defined

# Testing

In [None]:
master = get_policy(params)
master.params=np.load("./data/{}_hessian.npy".format(params['dir']))
test_policy = worker(params, master, np.zeros([1, master.N]), 0)


In [None]:
from gym.wrappers import Monitor
env = Monitor(gym.make(params['env_name']), './video', force=True)
env._max_episode_steps = params['steps']

In [None]:
def play(env, worker):
    state = env.reset()
    while 1:
        action = worker.policy.evaluate(state)
        action = np.clip(action, worker.env.action_space.low[0], worker.env.action_space.high[0])
        action = action.reshape(len(action), )
        state, reward, done, info = env.step(action)
        env.render()
        if done: 
            break

In [None]:
play(env, test_policy)

In [None]:
np.save("./data/{}_hessian_ts.npy".format(params['dir']), ts)
np.save("./data/{}_hessian_rs.npy".format(params['dir']), rewards)

In [None]:
!open .
