In [1]:
import numpy as np
import pandas as pd
import cvxpy as cp
import gym
import matplotlib.pyplot as plt
from numpy.linalg import LinAlgError


import sys
sys.path.append('./asebo/')
from optimizers import Adam
from worker import worker, get_policy
from es import ES

In [16]:
def F(theta):
    """Toy concave objective: negative squared distance of `theta` from 0.5.

    A 1-D `theta` is treated as a single point (a batch of one). For batched
    input the squared deviations are summed over every axis except the first,
    so the result holds one value per leading index.
    """
    if theta.ndim == 1:
        # Promote a single point to a batch of size one
        theta = theta[np.newaxis, :]
    trailing_axes = tuple(range(1, theta.ndim))
    deviation = theta - 0.5
    return -np.sum(deviation ** 2, axis=trailing_axes)


def Gradient_LP(y, epsilons):
    """
    Estimate the gradient by least-absolute-deviation regression, solved as an LP.

    Solves  min_g  sum_i | y_i - <epsilons_i, g> |  through the epigraph
    reformulation with one slack variable per sample.

    y = (F(theta + sigma*epsilons) - F(theta)) / sigma, length n
    epsilons: the perturbations with UNIT VARIANCE, shape (n, d)

    Returns the length-d gradient estimate, or None when the solver does not
    report an optimal solution.
    """
    n, d = epsilons.shape

    var_z = cp.Variable(n)  # var_z[i] upper-bounds the i-th absolute residual
    var_g = cp.Variable(d)  # gradient estimate
    residual = y - epsilons @ var_g

    # cp.sum is a single vectorised atom; the builtin sum() would index the
    # variable element by element and chain n scalar additions.
    objective = cp.Minimize(cp.sum(var_z))
    constraints = [var_z >= residual,
                   var_z >= -residual]
    prob = cp.Problem(objective, constraints)
    prob.solve(solver=cp.GLPK, eps=1e-6, glpk={'msg_lev': 'GLP_MSG_OFF'})
    if prob.status == cp.OPTIMAL:
        return var_g.value
    return None

def Hessian_LP(y, epsilons):
    """
    Estimate a dense symmetric Hessian by least-absolute-deviation regression (LP).

    Fits  y_i ~ epsilons_i^T H epsilons_i  by minimising the sum of absolute
    residuals over the d*(d+1)/2 free entries of a symmetric H.

    y = (F(theta + sigma*epsilons) + F(theta - sigma*epsilons) - 2*F(theta)) / (sigma**2)
    epsilons: the perturbations with UNIT VARIANCE, shape (n, d)

    Returns the (d, d) symmetric estimate, or None when the solver does not
    report an optimal solution.
    """
    n, d = epsilons.shape

    # Upper-triangular index pairs in row-major order:
    # (0,0), (0,1), ..., (0,d-1), (1,1), ...  -- one per free entry of H.
    rows, cols = np.triu_indices(d)
    off_diag = rows != cols

    # eps^T H eps = sum_j H_jj eps_j^2 + 2 * sum_{j<k} H_jk eps_j eps_k,
    # so off-diagonal features carry a factor of two.
    X = (epsilons[:, rows] * epsilons[:, cols]).astype(float)
    X[:, off_diag] *= 2

    var_z = cp.Variable(n)              # slack: upper bound on |residual_i|
    var_H = cp.Variable(d*(d+1)//2)     # packed upper triangle of H
    fitted = X @ var_H

    # Two vector constraints instead of 2n scalar ones built in a Python loop
    # (same formulation as Gradient_LP).
    constraints = [var_z >= y - fitted,
                   var_z >= -y + fitted]
    prob = cp.Problem(cp.Minimize(cp.sum(var_z)), constraints)
    prob.solve(solver=cp.GLPK, eps=1e-6, glpk={'msg_lev': 'GLP_MSG_OFF'})

    if prob.status == cp.OPTIMAL:
        packed = var_H.value
        # Unpack the triangle and mirror it to obtain the symmetric matrix
        H = np.zeros((d, d))
        H[rows, cols] = packed
        H[cols, rows] = packed
        return H
    return None


def get_dct_mtx(d):
    """
    Return the d x d orthonormal DCT-I matrix.

    Entry (j, k) is  sqrt(2/(d-1)) * c_j * c_k * cos(pi*j*k/(d-1))  with
    c_0 = c_{d-1} = 1/sqrt(2) and c_j = 1 otherwise. The matrix is real,
    symmetric and orthogonal (U @ U.T == I), so U @ diag(h) @ U.T is a
    symmetric matrix whose eigenvalues are h.

    NOTE: the earlier construction (cos(pi*j*k/d) * 2/sqrt(d) with the four
    corner entries overwritten) was not orthogonal -- for d = 2 it produced
    the all-ones matrix -- which contradicted its stated purpose as an
    orthonormal eigenbasis.

    Raises ValueError if d < 1.
    """
    if d < 1:
        raise ValueError("d must be a positive integer, got {}".format(d))
    if d == 1:
        # Degenerate case: the only 1x1 orthogonal choice with positive sign
        return np.ones((1, 1))

    k = np.arange(d)
    dct_mtx = np.cos(np.pi * np.outer(k, k) / (d - 1))

    # First and last basis vectors are down-weighted by 1/sqrt(2) on both the
    # row and the column side; this is what makes DCT-I orthonormal.
    edge_scale = np.ones(d)
    edge_scale[[0, -1]] = 1.0 / np.sqrt(2.0)
    dct_mtx *= np.outer(edge_scale, edge_scale) * np.sqrt(2.0 / (d - 1))
    return dct_mtx
def Hessian_LP_structured(y, epsilons):
    """
    Estimate a structured Hessian H = U diag(h) U^T by an LP, with h <= 0.

    U is the matrix returned by get_dct_mtx(d); only the d coefficients h are
    optimised, by minimising the sum of absolute residuals of
    y_i ~ epsilons_i^T H epsilons_i. The sign constraint h <= 0 makes the
    returned H negative semidefinite.

    Minimises over the space of matrices of the form shown in examples 7 & 8
    of the reference
    [MATRICES DIAGONALIZED BY THE DISCRETE COSINE AND DISCRETE SINE TRANSFORMS].

    y = (F(theta + sigma * epsilons) + F(theta - sigma * epsilons) - 2 * F(theta)) / (sigma ** 2)
    epsilons: perturbations, shape (n, d)

    Returns the (d, d) matrix, or None when the solver does not report an
    optimal solution.
    """
    n, d = epsilons.shape

    dct_mtx = get_dct_mtx(d)
    # eps^T (U diag(h) U^T) eps = sum_k h_k * (eps^T U)_k^2, so the model is
    # linear in h with these squared projections as features.
    proj_sq = (epsilons @ dct_mtx) ** 2

    var_z = cp.Variable(n)        # slack: upper bound on |residual_i|
    var_H_diag = cp.Variable(d)   # coefficients h on the diagonal
    fitted = proj_sq @ var_H_diag

    # Three vector constraints replace the 2n + d scalar constraints that
    # were previously appended one by one in Python loops.
    constraints = [var_z >= y - fitted,
                   var_z >= -y + fitted,
                   var_H_diag <= 0]

    prob = cp.Problem(cp.Minimize(cp.sum(var_z)), constraints)
    prob.solve(solver=cp.GLPK, eps=1e-6, glpk={'msg_lev': 'GLP_MSG_OFF'})

    if prob.status == cp.OPTIMAL:
        return dct_mtx @ np.diag(var_H_diag.value) @ np.transpose(dct_mtx)

    return None


In [36]:
def aggregate_rollouts_hessianES(master, A, params, n_samples):
    """
    Evaluate every row of `A` with a worker and build finite-difference targets.

    `A` must have n_samples + 1 rows; the last row is the reference
    evaluation. Each worker contributes two rollout values, which the
    formulas below treat as F(theta + sigma*eps) (column 0) and
    F(theta - sigma*eps) (column 1). All rollout values are standardised
    jointly (global mean and std) before differencing.

    Returns (gradient_y, hessian_y, timesteps):
      gradient_y = (F(theta + sigma*eps) - F(theta)) / sigma
      hessian_y  = (F(theta + sigma*eps) + F(theta - sigma*eps) - 2*F(theta)) / sigma**2
      timesteps  = sum of `timesteps` reported by all workers
    """
    n_rows = n_samples + 1
    assert A.shape[0] == n_rows

    all_rollouts = np.zeros([n_rows, 2])
    timesteps = 0
    for row in range(n_rows):
        w = worker(params, master, A, row)
        all_rollouts[row] = np.reshape(w.do_rollouts(), 2)
        timesteps += w.timesteps

    # Joint standardisation over every rollout value
    centered = all_rollouts - np.mean(all_rollouts)
    all_rollouts = centered / (np.std(all_rollouts) + 1e-8)

    sigma = params["sigma"]
    perturbed = all_rollouts[:-1]
    # The reference row holds two evaluations; their sum stands in for
    # 2*F(theta), and half of it for F(theta).
    reference_sum = sum(all_rollouts[-1])

    gradient_y = (perturbed[:, 0] - reference_sum / 2) / sigma
    hessian_y = (perturbed[:, 0] + perturbed[:, 1] - reference_sum) / (sigma ** 2)
    return gradient_y, hessian_y, timesteps

def HessianES(params, master):
    """
    Compute one Newton-style search direction from LP-fitted gradient and Hessian.

    Draws params['num_sensings'] Gaussian perturbations with standard
    deviation params['sigma'], evaluates them (plus one unperturbed reference
    row) through aggregate_rollouts_hessianES, fits the gradient g with
    Gradient_LP and the Hessian H with Hessian_LP_structured, and returns
    -(H - 0.1*I)^{-1} g.

    Returns (update_direction, n_samples, timesteps, Hinv). Hinv is True when
    the Newton direction was used and False when the function fell back to
    the plain gradient, either because no Hessian estimate was available or
    because the linear system was singular.

    Raises RuntimeError if the gradient LP yields no solution, since no
    direction can be formed without it.
    """
    n_samples = params['num_sensings']
    sigma = params["sigma"]

    cov = np.identity(master.N) * (sigma ** 2)
    mu = np.repeat(0, master.N)
    A = np.random.multivariate_normal(mu, cov, n_samples)
    A = np.vstack([A, mu])  # Adding a reference evaluation

    gradient_y, hessian_y, timesteps = aggregate_rollouts_hessianES(master, A, params, n_samples)

    # The LPs expect unit-variance perturbations, so undo the sigma scaling
    unit_eps = A[:-1, :] / sigma

    g = Gradient_LP(gradient_y, unit_eps)
    if g is None:
        raise RuntimeError("Gradient_LP did not return an optimal solution")

    H = Hessian_LP_structured(hessian_y, unit_eps)
    if H is None:
        # No Hessian estimate: fall back to the first-order direction
        return (g, n_samples, timesteps, False)

    # Shift the spectrum by -0.1 so a (near-)zero Hessian estimate stays invertible
    H = H - 0.1 * np.identity(master.N)
    try:
        # Solve H x = g directly rather than forming the explicit inverse
        update_direction = -np.linalg.solve(H, g)
        Hinv = True
    except LinAlgError:
        Hinv = False
        update_direction = g

    return (update_direction, n_samples, timesteps, Hinv)

## Manual Tests

In [48]:
# Experiment configuration shared by every cell below.
params = dict(
    # Environment and rollout settings
    # (alternative environment: 'InvertedPendulum-v2')
    env_name='InvertedDoublePendulum-v2',
    steps=1000,
    h_dim=8,
    start=0,
    max_iter=1000,
    seed=0,

    # Search / optimisation settings
    k=140,
    num_sensings=100,
    log=0,
    threshold=0.995,
    decay=0.99,
    learning_rate=0.05,
    filename='',
    policy='Linear',  # Linear or Toeplitz

    shift=0,
    min=10,
    sigma=1,
)

# Run identifier used as a prefix for the saved result files
params['dir'] = '{env}{policy}_h{h_dim}_lr{lr}_num_sensings{sensings}_{suffix}'.format(
    env=params['env_name'],
    policy=params['policy'],
    h_dim=params['h_dim'],
    lr=params['learning_rate'],
    sensings=params['num_sensings'],
    suffix=params['filename'],
)

# if not(os.path.exists('data/'+params['dir'])):
#     os.makedirs('data/'+params['dir'])
# os.chdir('data/'+params['dir'])

In [49]:
# Manual test setup: build the environment and policy, then collect one batch
# of rollouts so the LP estimators can be inspected in the cells below.
env = gym.make(params['env_name'])
params['ob_dim'] = env.observation_space.shape[0]
params['ac_dim'] = env.action_space.shape[0]

# Presumably Adam moment accumulators (see the commented-out Adam call in the
# training cell); not used in this cell.
m = 0
v = 0

# NOTE(review): in-place mutation -- re-running this cell (or running the
# training cell, which repeats this line) decrements 'k' again.
params['k'] += -1
params['alpha'] = 1

params['zeros'] = False
master = get_policy(params)

# Optional sample count that grows logarithmically with the parameter count
if params['log']:
    params['num_sensings'] = 4 + int(3 * np.log(master.N))

# Cap 'k' at the number of policy parameters
if params['k'] > master.N:
    params['k'] = master.N

# Bookkeeping containers, mirroring the training cell; not filled in here
n_eps = 0
n_iter = 1
ts_cumulative = 0
ts = []
rollouts = []
rewards = []
samples = []
alphas = []
G = [] # Don't need this for HessianES
Hinv_success = 0

# Use 200 more perturbations than a regular training iteration
n_samples = params['num_sensings']+200
# np.random.seed(None)
cov = np.identity(master.N)*(params["sigma"]**2)
mu = np.repeat(0, master.N)
A = np.random.multivariate_normal(mu, cov, n_samples)
# NOTE(review): rows are rescaled to unit norm here, whereas HessianES keeps
# this step commented out and the LP docstrings ask for unit-variance
# perturbations -- confirm which scaling is intended.
A /= np.linalg.norm(A, axis =-1)[:, np.newaxis]
A = np.vstack([A, mu]) # Adding a reference evaluation

gradient_y, hessian_y, timesteps = aggregate_rollouts_hessianES(master, A, params, n_samples)

In [50]:
A.shape

(301, 11)

In [51]:
%%time
g = Gradient_LP(gradient_y, A[:-1, :]/params["sigma"])

CPU times: user 159 ms, sys: 2.58 ms, total: 162 ms
Wall time: 161 ms


In [52]:
%%time
Hstruc = Hessian_LP_structured(hessian_y, A[:-1, :]/params["sigma"])

CPU times: user 786 ms, sys: 4.71 ms, total: 791 ms
Wall time: 790 ms


In [53]:
Hstruc

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [54]:
np.linalg.eig(Hstruc)

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]]))

In [55]:
%%time
H = Hessian_LP(hessian_y, A[:-1, :]/params["sigma"])

CPU times: user 1.02 s, sys: 13.2 ms, total: 1.04 s
Wall time: 1.04 s


In [56]:
H.shape

(11, 11)

In [57]:
np.linalg.eig(H)

(array([ 5.54529529, -2.88300251,  1.02019975, -0.85271169, -0.64320833,
         0.52638407,  0.48230831,  0.31038016, -0.11536853,  0.02823345,
         0.08519172]),
 array([[ 2.23390725e-02,  3.63937228e-02,  9.62646048e-03,
          2.36568066e-02, -2.75866216e-02, -1.80773563e-01,
         -3.76259224e-01, -5.90301367e-01,  4.67709135e-01,
          4.54464097e-01,  2.20834527e-01],
        [-2.48164043e-02, -1.15013590e-01, -1.22580185e-01,
          1.07831726e-01,  5.69345083e-01,  2.24963910e-01,
          4.88878309e-01, -3.17794748e-01, -1.54833121e-01,
          5.60885177e-02,  4.66503090e-01],
        [ 3.85236403e-02,  5.86577713e-02,  8.63316832e-02,
          5.30649221e-01, -1.98509448e-01,  7.59138984e-02,
          2.51541529e-01,  5.10160441e-01,  2.79569183e-01,
          4.46099782e-01,  2.45283021e-01],
        [ 1.27808863e-01, -6.76439958e-01,  3.65455488e-01,
          7.66422619e-02, -1.25271218e-01, -5.12402042e-01,
          2.22139257e-02, -1.02794600e-

## Training

In [58]:
# def run_hessianES(params):

# Training run: repeat HessianES steps on a freshly built policy, evaluate the
# updated policy after every step and checkpoint the metrics to CSV.
env = gym.make(params['env_name'])
params['ob_dim'] = env.observation_space.shape[0]
params['ac_dim'] = env.action_space.shape[0]

# Adam state; unused while the Adam update below stays commented out
m = 0
v = 0

# NOTE(review): in-place mutation -- every execution of this cell (and of the
# manual-test setup cell, which has the same line) decrements 'k' again.
params['k'] += -1
params['alpha'] = 1

params['zeros'] = False
master = get_policy(params)

if params['log']:
    params['num_sensings'] = 4 + int(3 * np.log(master.N))

if params['k'] > master.N:
    params['k'] = master.N

n_eps = 0            # rollouts counted so far
n_iter = 1
ts_cumulative = 0    # environment timesteps used so far
ts = []
rollouts = []
rewards = []
samples = []
alphas = []
G = [] # Don't need this for HessianES
Hinv_success = 0     # number of iterations that used the Newton direction
while n_iter < params['max_iter']:

    params['n_iter'] = n_iter
    gradient, n_samples, timesteps, Hinv = HessianES(params, master)
    # Hinv is a bool, so this counts successful Hessian solves
    Hinv_success += Hinv
#     print(Hinv)
#     gradient, n_samples, timesteps = ES(params, master, G)
    
    ts_cumulative += timesteps
    ts.append(ts_cumulative)
    alphas.append(params['alpha'])

    # Decayed history of past update directions; grows by one row per iteration
    if n_iter == 1:
        G = np.array(gradient)
    else:
        G *= params['decay']
        G = np.vstack([G, gradient])
    # Two rollouts per perturbation; the reference row is not counted here
    n_eps += 2 * n_samples
    rollouts.append(n_eps)
#     gradient /= (np.linalg.norm(gradient) / master.N + 1e-8)

#     update, m, v = Adam(gradient, m, v, params['learning_rate'], n_iter)
    # Plain scaled step in place of Adam
    update = params['learning_rate']*gradient

    master.update(update)
    # Evaluate the updated policy with an all-zero perturbation row
    test_policy = worker(params, master, np.zeros([1, master.N]), 0)
    reward = test_policy.rollout(train=False)
    rewards.append(reward)
    samples.append(n_samples)

    print('Iteration: %s, Rollouts: %s, Reward: %s, Alpha: %s, Samples: %s' %(n_iter, n_eps, reward, params['alpha'], n_samples))
    n_iter += 1

    # The full metrics table is rebuilt and the CSV rewritten on every
    # iteration, so an interrupted run still leaves results on disk.
    out = pd.DataFrame({'Rollouts': rollouts, 'Reward': rewards, 'Samples': samples, 'Timesteps': ts, 'Alpha': alphas})
    out.to_csv('HessianES_Seed%s.csv' %(params['seed']), index=False)     

Iteration: 1, Rollouts: 200, Reward: 54.30934128157487, Alpha: 1, Samples: 100
Iteration: 2, Rollouts: 400, Reward: 101.30230492081712, Alpha: 1, Samples: 100
Iteration: 3, Rollouts: 600, Reward: 91.7750724063481, Alpha: 1, Samples: 100
Iteration: 4, Rollouts: 800, Reward: 128.96629614882036, Alpha: 1, Samples: 100
Iteration: 5, Rollouts: 1000, Reward: 101.51074985723784, Alpha: 1, Samples: 100
Iteration: 6, Rollouts: 1200, Reward: 101.34323810119672, Alpha: 1, Samples: 100
Iteration: 7, Rollouts: 1400, Reward: 101.06793244325972, Alpha: 1, Samples: 100
Iteration: 8, Rollouts: 1600, Reward: 101.1731966117439, Alpha: 1, Samples: 100
Iteration: 9, Rollouts: 1800, Reward: 101.26107964750899, Alpha: 1, Samples: 100
Iteration: 10, Rollouts: 2000, Reward: 101.0694330741772, Alpha: 1, Samples: 100
Iteration: 11, Rollouts: 2200, Reward: 101.0230435733484, Alpha: 1, Samples: 100
Iteration: 12, Rollouts: 2400, Reward: 101.31394919551505, Alpha: 1, Samples: 100
Iteration: 13, Rollouts: 2600, Rewa

Iteration: 101, Rollouts: 20200, Reward: 221.56803263411192, Alpha: 1, Samples: 100
Iteration: 102, Rollouts: 20400, Reward: 203.47345005635503, Alpha: 1, Samples: 100
Iteration: 103, Rollouts: 20600, Reward: 165.72430509602086, Alpha: 1, Samples: 100
Iteration: 104, Rollouts: 20800, Reward: 119.96948892379527, Alpha: 1, Samples: 100
Iteration: 105, Rollouts: 21000, Reward: 120.05134239761276, Alpha: 1, Samples: 100
Iteration: 106, Rollouts: 21200, Reward: 129.3057430476964, Alpha: 1, Samples: 100
Iteration: 107, Rollouts: 21400, Reward: 101.39073693255773, Alpha: 1, Samples: 100
Iteration: 108, Rollouts: 21600, Reward: 101.18961777543147, Alpha: 1, Samples: 100
Iteration: 109, Rollouts: 21800, Reward: 92.03082122988293, Alpha: 1, Samples: 100
Iteration: 110, Rollouts: 22000, Reward: 257.7333726821944, Alpha: 1, Samples: 100
Iteration: 111, Rollouts: 22200, Reward: 101.48411471873861, Alpha: 1, Samples: 100
Iteration: 112, Rollouts: 22400, Reward: 129.00897592794925, Alpha: 1, Samples:

Iteration: 199, Rollouts: 39800, Reward: 138.29144282649102, Alpha: 1, Samples: 100
Iteration: 200, Rollouts: 40000, Reward: 166.23092596725434, Alpha: 1, Samples: 100
Iteration: 201, Rollouts: 40200, Reward: 155.7413063864278, Alpha: 1, Samples: 100
Iteration: 202, Rollouts: 40400, Reward: 212.32930515095774, Alpha: 1, Samples: 100
Iteration: 203, Rollouts: 40600, Reward: 201.4172977000889, Alpha: 1, Samples: 100
Iteration: 204, Rollouts: 40800, Reward: 184.0476688101044, Alpha: 1, Samples: 100
Iteration: 205, Rollouts: 41000, Reward: 183.99281765008746, Alpha: 1, Samples: 100
Iteration: 206, Rollouts: 41200, Reward: 193.26142017146327, Alpha: 1, Samples: 100
Iteration: 207, Rollouts: 41400, Reward: 146.9803807293929, Alpha: 1, Samples: 100
Iteration: 208, Rollouts: 41600, Reward: 183.30733978707124, Alpha: 1, Samples: 100
Iteration: 209, Rollouts: 41800, Reward: 156.1161128252999, Alpha: 1, Samples: 100
Iteration: 210, Rollouts: 42000, Reward: 257.40956783022614, Alpha: 1, Samples: 1

KeyboardInterrupt: 

In [42]:
# Persist the trained policy parameters and the learning curve (cumulative
# timesteps and rewards) under ./data, keyed by the run identifier.
# NOTE(review): np.save does not create directories -- ./data must already exist.
np.save("./data/{}_hessian.npy".format(params['dir']), master.params)
np.save("./data/{}_hessian_ts.npy".format(params['dir']), ts)
np.save("./data/{}_hessian_rs.npy".format(params['dir']), rewards)




In [None]:
# Load previously saved baseline learning curves for comparison.
# NOTE(review): the file names are hard-coded to InvertedPendulum-v2 / h32
# runs, which differ from the env_name and h_dim in `params` -- confirm these
# are the intended baselines.
asebo_ts = np.load("./data/InvertedPendulum-v2Toeplitz_h32_lr0.05_k140__asebo_ts.npy")
asebo_rewards = np.load("./data/InvertedPendulum-v2Toeplitz_h32_lr0.05_k140__asebo_rs.npy")

lpgrad_ts = np.load("./data/InvertedPendulum-v2Linear_h32_lr0.05_num_sensings100__LP_ts.npy")
lpgrad_rewards = np.load("./data/InvertedPendulum-v2Linear_h32_lr0.05_num_sensings100__LP_rs.npy")



In [None]:
# Learning curve of the Hessian run: evaluation reward against the cumulative
# number of environment timesteps.
fig, ax = plt.subplots()
# ax.plot(asebo_ts, asebo_rewards, label="ASEBO")
# ax.plot(lpgrad_ts, lpgrad_rewards, label="LP gradient")
ax.plot(ts, rewards, label="LP Hessian")
ax.set_xlabel("Cumulative timesteps")
ax.set_ylabel("Reward")
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output


# Testing

In [43]:
# Optionally restore a saved policy instead of using the one still in memory:
# master = get_policy(params)
# master.params=np.load("./data/{}_hessian.npy".format(params['dir']))
# Worker built with a single all-zero perturbation row, as in the training
# loop's evaluation step (presumably this evaluates `master` unperturbed).
test_policy = worker(params, master, np.zeros([1, master.N]), 0)


In [44]:
from gym.wrappers import Monitor
# Wrap a fresh environment so the episode is recorded under ./video
# (force=True presumably clears earlier recordings there -- confirm against the gym docs).
env = Monitor(gym.make(params['env_name']), './video', force=True)
# NOTE(review): this sets a private attribute on the wrapper; whether it
# actually changes the episode length depends on the gym version -- verify.
env._max_episode_steps = params['steps']

In [45]:
def play(env, worker):
    """
    Run one rendered episode in `env`, choosing actions with `worker.policy`.

    Each action is clipped to the first-component bounds of
    `worker.env.action_space` and flattened to shape (len(action),) before
    being passed to `env.step`. The loop ends after the first step that
    reports `done`; nothing is returned.
    """
    state = env.reset()
    finished = False
    while not finished:
        raw_action = worker.policy.evaluate(state)
        bounds = worker.env.action_space
        clipped = np.clip(raw_action, bounds.low[0], bounds.high[0])
        action = clipped.reshape(len(clipped), )
        state, _reward, finished, _info = env.step(action)
        env.render()

In [46]:
play(env, test_policy)

Creating offscreen glfw
Creating window glfw


In [None]:
len(ts)

In [None]:
params['dir']

In [None]:
# Re-save the learning curve (cumulative timesteps and rewards) under ./data;
# same targets as the earlier save cell, minus the policy parameters.
np.save("./data/{}_hessian_ts.npy".format(params['dir']), ts)
np.save("./data/{}_hessian_rs.npy".format(params['dir']), rewards)

In [None]:
# Sanity check of the LP estimators on the toy objective F, whose exact
# gradient is -2*(theta - 0.5) and whose exact Hessian is -2*I.
sigma = 0.05
# np.random.seed(0)
# theta = np.random.uniform(-5,5,5)
theta = 0.0 * np.ones(5)
n = 100
d = len(theta)

print('theta:')
print(theta)
epsilons = np.random.multivariate_normal(mean = np.zeros(d), cov = np.identity(d), size = n)

# Gradient_LP and Hessian_LP take (y, epsilons): the finite-difference targets
# have to be computed from F first, not passed as (theta, sigma, epsilons).
F_center = F(theta)
F_plus = F(theta + sigma * epsilons)
F_minus = F(theta - sigma * epsilons)
toy_gradient_y = (F_plus - F_center) / sigma
toy_hessian_y = (F_plus + F_minus - 2 * F_center) / (sigma ** 2)

g = Gradient_LP(toy_gradient_y, epsilons)
print('gradient:')
print(g)

H = Hessian_LP(toy_hessian_y, epsilons)
print('Hessian:')
print(H)

In [None]:
!open .
