In [1]:
import numpy as np
import pickle as pickle

In [2]:
# ---- hyperparameters ----

# network size
H = 10 # neurons in each model's hidden layer

# optimisation
learning_rate, decay_rate = 1e-4, 0.99 # step size; decay of RMSProp's leaky sum of grad^2
batch_size = 10 # number of episodes between parameter updates
gamma = 0.99 # reward discount factor

# run-time switches
resume, render = False, False # resume: start from the pickled checkpoints instead of fresh weights


In [3]:
# actor model initialization
R = 3 # rows of the actor's input matrix (W1 is H x R and is applied as W1 @ x)
C = 4 # columns of the input matrix
D = R * C # input dimensionality: delays, costs and user preferences for 4 tunnels
if resume:
  # NOTE: unpickling can execute arbitrary code -- only load checkpoints you wrote yourself.
  # The context manager closes the file handle even if loading fails.
  with open('actor_saved.p', 'rb') as checkpoint:
    actor = pickle.load(checkpoint)
else:
  # Entries are evaluated top to bottom, so W1 is drawn before W2.
  actor = {
    'W1': np.random.randn(H,R) / np.sqrt(R), # "Xavier" initialization
    'W2': np.random.randn(H) / np.sqrt(H),
  }

# One zero array per weight array, with matching shape and dtype.
actor_grad_buffer = { k : np.zeros_like(v) for k,v in actor.items() } # update buffers that add up gradients over a batch
actor_rmsprop_cache = { k : np.zeros_like(v) for k,v in actor.items() } # rmsprop memory

In [4]:
# critic model initialization
R = 4 # rows of the critic's input matrix (W1 is H x R and is applied as W1 @ x)
C = 4 # columns of the input matrix; also the length of the read-out vector W3
D = R * C # input dimensionality: delays, costs and user preferences for 4 tunnels
if resume:
  # NOTE: unpickling can execute arbitrary code -- only load checkpoints you wrote yourself.
  # The context manager closes the file handle even if loading fails.
  with open('critic_saved.p', 'rb') as checkpoint:
    critic = pickle.load(checkpoint)
else:
  # Entries are evaluated top to bottom, so the draw order is W1, W2, W3.
  critic = {
    'W1': np.random.randn(H,R) / np.sqrt(R), # "Xavier" initialization
    'W2': np.random.randn(H) / np.sqrt(H),
    'W3': np.random.randn(C) / np.sqrt(C),
  }

# One zero array per weight array, with matching shape and dtype.
critic_grad_buffer = { k : np.zeros_like(v) for k,v in critic.items() } # update buffers that add up gradients over a batch
critic_rmsprop_cache = { k : np.zeros_like(v) for k,v in critic.items() } # rmsprop memory

In [5]:
def sigmoid(layer):
  """Element-wise logistic function 1 / (1 + exp(-z)), squashing values into [0, 1].

  Args:
    layer: scalar or array-like of pre-activations, any shape.

  Returns:
    A new float ndarray with the same shape as `layer`. The argument itself is
    not modified, and integer inputs are computed in floating point rather
    than truncated back to integers.
  """
  z = np.asarray(layer, dtype=float)
  # exp(-|z|) cannot overflow; choose the algebraically equivalent form per sign.
  e = np.exp(-np.abs(z))
  return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

In [6]:
def act(x):
  """Actor forward pass: linear layer + ReLU, then linear layer + sigmoid.

  Reads the module-level `actor` weights ('W1', 'W2').

  Returns:
    (w, h): the sigmoid outputs and the post-ReLU hidden activations.
  """
  hidden = actor['W1'] @ x
  np.putmask(hidden, hidden < 0, 0) # ReLU: zero out the negative entries in place
  link_weights = sigmoid(actor['W2'] @ hidden)
  return link_weights, hidden

def actor_update(epx, eph, epdlogp):
  """Backward pass through the actor for one forward pass of `act`.

  Arguments use the same layout that `act` consumes and produces, so the
  returned gradients have the same shapes as the weights they belong to.

  Args:
    epx: the input given to `act`, shape (R, C).
    eph: the post-ReLU hidden activations returned by `act`, shape (H, C).
    epdlogp: upstream gradient with respect to the pre-sigmoid outputs.
      Either a scalar (applied to every column) or C values.

  Returns:
    {'W1': array of shape (H, R), 'W2': array of shape (H,)}

  Raises:
    ValueError: if `epdlogp` holds neither 1 nor C values.
  """
  epx = np.asarray(epx, dtype=float)
  eph = np.asarray(eph, dtype=float)
  n_cols = eph.shape[1]
  # Flatten, then stretch a single value across all columns if needed.
  upstream = np.broadcast_to(np.asarray(epdlogp, dtype=float).reshape(-1), (n_cols,))
  dW2 = eph @ upstream # (H, C) @ (C,) -> (H,)
  dh = np.outer(actor['W2'], upstream) # (H, C): gradient reaching the hidden layer
  dh[eph <= 0] = 0 # backprop relu
  dW1 = dh @ epx.T # (H, C) @ (C, R) -> (H, R)
  return {'W1':dW1, 'W2':dW2}

def critique(x):
  """Critic forward pass: two ReLU layers followed by a linear read-out.

  Reads the module-level `critic` weights ('W1', 'W2', 'W3').

  Returns:
    (V, h, h2): the read-out value and the post-ReLU activations of the
    first and second hidden layers.
  """
  first_hidden = critic['W1'] @ x
  np.putmask(first_hidden, first_hidden < 0, 0) # ReLU on layer 1, in place
  second_hidden = critic['W2'] @ first_hidden
  np.putmask(second_hidden, second_hidden < 0, 0) # ReLU on layer 2, in place
  value = critic['W3'] @ second_hidden
  return value, first_hidden, second_hidden

def critic_update(epx, eph, eph2, epdV):
  """Backward pass through the critic for one forward pass of `critique`.

  Args:
    epx: the input given to `critique`, shape (R, C).
    eph: first-layer post-ReLU activations returned by `critique`, shape (H, C).
    eph2: second-layer post-ReLU activations returned by `critique`, shape (C,).
    epdV: scalar upstream gradient on the critic output.

  Returns:
    {'W1': (H, R), 'W2': (H,), 'W3': (C,)} -- one gradient per critic weight.

  The number of columns C is taken from the size of critic['W3'] rather than
  being fixed at 4.
  """
  n_cols = np.size(critic['W3'])
  #output layer gradients
  dW3 = np.dot(eph2.T, epdV).ravel() # (C,)
  #second hidden layer
  dh2 = np.dot(epdV, critic['W3']) # (C,): a fresh array, safe to mask in place
  dh2[eph2.reshape(-1) <= 0] = 0 # backprop relu
  dW2 = np.dot(eph, dh2.T).ravel() # (H, C) @ (C,) -> (H,)
  #first hidden layer
  dh = np.outer(critic['W2'], dh2.reshape(-1)).reshape(-1, n_cols) # (H, C)
  dh[eph <= 0] = 0 # backprop relu
  dW1 = np.dot(epx, dh.T) # (R, C) @ (C, H) -> (R, H); transposed on return
  return {'W1':dW1.T, 'W2':dW2, 'W3':dW3}

def calc_reward(w, x):
  """Score the action `w` against the state `x`.

  The result is the sum of three terms:
    * an ordering bonus: 1 if w[0] is strictly larger than w[1], w[2] and w[3],
      plus 0.75 if w[1] is strictly larger than w[2] and w[3],
      plus 0.5 if w[2] is strictly larger than w[3];
    * a cost term: minus the dot product of w with row 1 of x;
    * a performance term: the dot product of w with row 0 of x.
  """
  first_leads = w[0] > w[1] and w[0] > w[2] and w[0] > w[3]
  second_leads = w[1] > w[2] and w[1] > w[3]
  third_leads = w[2] > w[3]
  ordering_bonus = first_leads + 0.75 * second_leads + 0.5 * third_leads
  cost = -np.dot(w, x[1])
  net_perf = np.dot(w, x[0])
  return ordering_bonus + cost + net_perf

In [7]:
# bookkeeping shared with the training loop
prev_x = None # used in computing the difference frame
# actor-side records: inputs, hidden states, action gradients, rewards (each its own list)
axs, ahs, das, rs = ([] for _ in range(4))
# critic-side records: inputs, first/second hidden states, reward-minus-value errors
cxs, chs, ch2s, dqs = ([] for _ in range(4))
running_reward, reward_sum, episode_number = None, 0, 0

In [None]:
# Online actor-critic loop: every iteration samples a state, runs the actor and the
# critic forward, then applies one RMSProp step to each network.
# The two fixed rows below are stacked underneath a freshly sampled row on every iteration.
fixed_x = np.array([[100,40,50,79],
     [2,1,3,4]]) # first row for costs, second is preference
iters = 0
# NOTE(review): there is no exit condition; the loop runs until the kernel is interrupted.
# NOTE(review): axs/ahs/rs/cxs/chs/ch2s/dqs gain one entry per iteration and are only read
# by the commented-out episode code below, so memory use grows without bound.
while True:
     raw_weights = np.random.rand(1,4) # 1x4 row of uniform [0, 1) samples, redrawn every iteration
     x = np.vstack((raw_weights, fixed_x)) # 3x4 state: sampled row on top of the two fixed rows

     # forward actor network
     a, h = act(x) #action, and hidden state

     # record intermediates
     axs.append(x) # observation
     ahs.append(h) # hidden state

     # calculate reward from action and state
     reward = calc_reward(a, x)

     rs.append(reward) # needed?

     # stack state and action for input into critic network
     critic_input = np.vstack((x, a)) # the action becomes an extra row underneath the state
     Q, ch, ch2 = critique(critic_input) # critic output plus both hidden states

     cxs.append(critic_input)
     chs.append(ch)
     ch2s.append(ch2)
     dqs.append(reward - Q) # error between the observed reward and the critic's estimate


     iters += 1
     # Disabled episode batching, kept for reference:
     # if iters == 10: # episode ends
     #      iters = 0
     #      # stack together all inputs, hidden states, action gradients, and rewards for this episode
     #      epax = np.vstack(axs)
     #      epah = np.vstack(ahs)
     #      #epda = np.vstack(das)
     #      epr = np.vstack(rs)
     #      epcx = np.vstack(cxs)
     #      epch = np.vstack(chs)
     #      epch2 = np.vstack(ch2s)
     #      epdq = np.vstack(dqs)

     #      #reset memory
     #      axs,ahs,das,rs = [],[],[],[]
     #      cxs,chs,ch2s,dqs = [],[],[],[]

          # get the gradient from the critic
          #critic_grad = critic_update(epcx, epch, epch2, epdq)
     # Actor step: the negated critic output is passed as the upstream signal.
     # NOTE(review): x is 3x4, h comes straight from act() and Q is the critic's output --
     # confirm actor_update expects this layout and returns gradients shaped like the actor weights.
     actor_grad = actor_update(x, h, -Q)
     for k in actor: actor_grad_buffer[k] += actor_grad[k]
     for k,v in actor.items():
          g = actor_grad_buffer[k]
          # RMSProp: leaky average of squared gradients, then a step scaled by its square root
          actor_rmsprop_cache[k] = decay_rate * actor_rmsprop_cache[k] + (1 - decay_rate) * g**2
          actor[k] += learning_rate * g / (np.sqrt(actor_rmsprop_cache[k]) + 1e-5)
          actor_grad_buffer[k] = np.zeros_like(v) # buffer is cleared after every step, so batch_size is not used here

     # Critic step: same RMSProp update, driven by the reward-minus-estimate error.
     critic_grad = critic_update(critic_input, ch, ch2, reward - Q) # i think i need to adjust this error to the TD error
     for k in critic: critic_grad_buffer[k] += critic_grad[k]
     for k,v in critic.items():
          g = critic_grad_buffer[k]
          critic_rmsprop_cache[k] = decay_rate * critic_rmsprop_cache[k] + (1 - decay_rate) * g**2
          critic[k] += learning_rate * g / (np.sqrt(critic_rmsprop_cache[k]) + 1e-5)
          critic_grad_buffer[k] = np.zeros_like(v) # cleared after every step, as for the actor
     #how to define error for backprop: reward - expected reward (critic output, i.e. Q value)
     #forward state + action through critic network. more efficient way than manual backprop?
     # Progress output: dump all critic weights every 100 iterations.
     if iters%100==0: print(critic['W1'], critic['W2'], critic['W3'])
    