In [1]:
import sys
import torch  
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

# hyperparameters
hidden_size = 256
learning_rate = 3e-4

# Constants
GAMMA = 0.99
num_steps = 300
max_episodes = 3000

In [2]:
# don't like it as one, better to keep them separate and update weights separately for clearer understanidng
class Actor (nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(Actor, self).__init__()
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        hidden = F.relu(self.actor_linear1(state))
        action = F.softmax(self.actor_linear2(hidden), dim=0)
        return action

class Critic (nn.Module):
    def __init__(self, num_inputs, hidden_size, learning_rate=3e-4):
        super(Critic, self).__init__()
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)
    
    def forward(self, state, action):
        input = torch.cat((state, action), 0)
        hidden = F.relu(self.critic_linear1(input))
        value = self.critic_linear2(hidden)
        return value

# class ActorCritic(nn.Module):
#     def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
#         super(ActorCritic, self).__init__()

#         self.num_actions = num_actions
#         self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
#         self.critic_linear2 = nn.Linear(hidden_size, 1)

#         self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
#         self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
#     def forward(self, state):
#         state = Variable(torch.from_numpy(state).float().unsqueeze(0))
#         value = F.relu(self.critic_linear1(state))
#         value = self.critic_linear2(value)
        
#         policy_dist = F.relu(self.actor_linear1(state))
#         policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)

#         return value, policy_dist

def calc_reward(w, x):
  user_pref = (w[0]>w[1] and w[0]>w[2] and w[0]>w[3]) + 0.75*(w[1]>w[2] and w[1]>w[3]) + 0.5*(w[2]>w[3]) 
  cost = - torch.dot(w, x[4:8])
  net_perf = torch.dot(w, x[0:4]) 
  return user_pref + cost + net_perf


In [23]:
actor_input_dims = 3 * 4 # input dimensionality: delays, costs and user preferences for 4 tunnels
actor_output_dims = 4 # 4 weights

critic_input_dims = 4 * 4 #state + action
critic_output_dims = 1 # value


actor = Actor(actor_input_dims, actor_output_dims, hidden_size)
critic = Critic(critic_input_dims, hidden_size)
actor_optimizer = optim.Adam(actor.parameters(), lr=learning_rate)
critic_optimizer = optim.Adam(critic.parameters(), lr=learning_rate)

fixed_x = torch.FloatTensor([[100,40,50,79],
     [2,1,3,4]]) # first row for costs, second is preference

all_lengths = []
average_lengths = []
all_rewards = []

#main loop
for episode in range(max_episodes):
     actions = []
     values = []
     rewards = []
     raw_weights = torch.rand(1,4)
     state = torch.cat((raw_weights, fixed_x)).flatten()
     state.requires_grad = True
     for steps in range(num_steps):
          #action and value both automatically have requires_grad = True
          action = actor.forward(state)
          actions.append(action)

          value = critic.forward(state, action)

          raw_weights = torch.rand(1,4)
          state = torch.cat((raw_weights, fixed_x)).flatten()
          reward = calc_reward(action, state)

          rewards.append(reward)
          values.append(value)

          if steps == num_steps-1:
               Qval = critic.forward(state, action)
               Qval = Qval.detach().numpy()
               all_rewards.append(sum(rewards))
               all_lengths.append(steps)
               average_lengths.append(np.mean(all_lengths[-10:]))
               if episode % 10 == 0:                    
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, sum(rewards), steps, average_lengths[-1]))
               break
     
     actor_optimizer.zero_grad()
     critic_optimizer.zero_grad()

     # following from bridgit use policy gradients to update critic network -- doesn't make sense
     for action, value in zip(actions, values):
          value.backward(torch.ones_like(value), retain_graph=True) 
          action_gradients = action.grad
          policy_gradient = torch.Tensor([1,1,1,1]) #wrong
          action.backward(policy_gradient, retain_graph=True)

     # compute Q values
     #this tensor instantiation for some reason makes it so that value no longer has requires_grad
     values = torch.tensor(values)
     Qvals = torch.zeros_like(values)
     for t in reversed(range(len(rewards))):
          #Qval is the TD Target rewards[t] is current reward + expected reward from future (thus why backward)
          #starts at 0
          Qval = rewards[t] + GAMMA * Qval.item()
          Qvals[t] = Qval

     # compute advantage (assuming Adv = TD Error = TD Target - V(S))
     advantage = Qvals - values
     # update critic network by backpropagating the mean squared error between Qvals and values (TD error)
     critic_loss = 0.5*(advantage.pow(2)).mean()
     critic_loss.backward()
     critic_optimizer.step()


episode: 0, reward: -22616.076171875, total length: 299, average length: 299.0 


  action_gradients = action.grad


episode: 10, reward: -22616.640625, total length: 299, average length: 299.0 
episode: 20, reward: -22599.59765625, total length: 299, average length: 299.0 
episode: 30, reward: -22597.1015625, total length: 299, average length: 299.0 
episode: 40, reward: -22605.619140625, total length: 299, average length: 299.0 
episode: 50, reward: -22593.634765625, total length: 299, average length: 299.0 
episode: 60, reward: -22596.865234375, total length: 299, average length: 299.0 
episode: 70, reward: -22576.75, total length: 299, average length: 299.0 
episode: 80, reward: -22595.990234375, total length: 299, average length: 299.0 
episode: 90, reward: -22590.775390625, total length: 299, average length: 299.0 
episode: 100, reward: -22587.810546875, total length: 299, average length: 299.0 
episode: 110, reward: -22592.90625, total length: 299, average length: 299.0 
episode: 120, reward: -22593.43359375, total length: 299, average length: 299.0 
episode: 130, reward: -22609.482421875, tot

KeyboardInterrupt: 