In [1]:
import numpy as np
from reacher_environment import ReacherEnvironment

In [2]:
# Identity function handed to ReacherEnvironment as its second argument.
# NOTE(review): how ReacherEnvironment uses convert_fn is not visible in this notebook — confirm.
convert_fn = lambda x: x
# The Unity build is located through a path relative to the current working directory.
env = ReacherEnvironment('./p2_continuous-control/multiagent/Reacher.app', convert_fn)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20
Number of actions: 4
There are 20 agents. Each observes a state with length: 33


### Defining Policy-Value Network

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import beta
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Policy_Network(nn.Module):
    """MLP policy that parameterises an independent Beta distribution per action.

    Architecture: ``input_size -> 64 -> 64 -> 2*action_size`` (ReLU after every
    layer), followed by two linear heads of width ``action_size`` that produce
    the two Beta concentration parameters.

    ``forward`` returns ``(beta_dist, features)`` where ``features`` is the
    ReLU-activated ``2*action_size`` wide output of the third layer.
    """

    def __init__(self, input_size, action_size):
        super().__init__()
        latent_size = 2 * action_size
        # Shared trunk.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, latent_size)
        # Heads producing the Beta likelihood parameters.
        self.fcalpha = nn.Linear(latent_size, action_size)
        self.fcbeta = nn.Linear(latent_size, action_size)
        self.softplus_act = nn.Softplus()

    def forward(self, x):
        features = x
        for layer in (self.fc1, self.fc2, self.fc3):
            features = F.relu(layer(features))
        # softplus(.) + 1 keeps both concentration parameters at or above 1.
        concentration_a = self.softplus_act(self.fcalpha(features)) + 1.
        concentration_b = self.softplus_act(self.fcbeta(features)) + 1.
        return beta.Beta(concentration_a, concentration_b), features

In [None]:
from reinforce import REINFORCE
import torch.optim as optim
# Hyper-parameters for the REINFORCE run.
episode = 3000
tmax = 1000
learning_rate = 1e-3
# 33 and 4 match the per-agent observation and action sizes reported by the Unity brain.
state_dim = 33
action_dim = 4

# Initializing policy network, optimizer and agent
policy_network = Policy_Network(state_dim,action_dim).to(device)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)
# NOTE(review): when and how often REINFORCE invokes the *_schedule callables is
# defined in the `reinforce` module, which is not visible here — confirm.
reinforce_agent = REINFORCE(policy_network,
                 optimizer, 
                 # each call multiplies the entropy coefficient by 0.995
                 entropy_reg_schedule = lambda old_value, rewards: 0.995*old_value,
                 entropy_reg_start=0.01,
                 # each call shrinks (1 - gamma) by a factor 0.998, so gamma moves towards 1
                 gamma_schedule = lambda old_value, rewards: 1. - 0.998*(1. - old_value),
                 gamma_start = 0.96,
                 target_score=30,
                 target_score_window=100,
                 # affine map x -> 2x - 1 (takes [0, 1] onto [-1, 1])
                 action_transform = lambda x: -1+2*x,
                 loss_type = "ratio",
                 verbosity = 50)
# NOTE(review): tmax is passed twice; the meaning of the 2nd and 3rd positional
# arguments of training() is not visible here — confirm.
reinforce_agent.training(env, tmax, tmax, episode)

In [4]:
from ppo import PPO
import torch.optim as optim
# Hyper-parameters for the PPO run.
learning_rate = 1e-3
episode = 5000
tmax = 1000
# 33 and 4 match the per-agent observation and action sizes reported by the Unity brain.
state_dim = 33
action_dim = 4

# Initializing policy network, optimizer and agent
policy_network = Policy_Network(state_dim,action_dim).to(device)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)
# NOTE(review): the recorded output of this cell ends in
# "TypeError: unsupported format string passed to function.__format__", and the
# line printed just before it contains a lambda object where a number would be
# expected. The cause lies inside the `ppo` module, which is not visible here.
ppo_agent = PPO(policy_network,
                 optimizer,
                 SGD_steps=4,
                 # each call multiplies the clipping epsilon by 0.9996
                 ppo_policy_epsilon_schedule=lambda old_value, rewards: 0.9996*old_value,
                 ppo_policy_epsilon_start=0.16,
                 # each call multiplies the entropy coefficient by 0.9998
                 entropy_reg_schedule = lambda old_value, rewards: 0.9998*old_value,
                 entropy_reg_start=0.01,
                 # each call shrinks (1 - gamma) by a factor 0.9998, so gamma moves towards 1
                 gamma_schedule = lambda old_value, rewards: 1.- 0.9998*(1. - old_value),
                 gamma_start = 0.98,
                 target_score=30,
                 target_score_window=100,
                 # affine map x -> 2x - 1 (takes [0, 1] onto [-1, 1])
                 action_transform = lambda x: -1+2*x,
                 verbosity = 100)
# NOTE(review): tmax is passed twice; the meaning of the 2nd and 3rd positional
# arguments of training() is not visible here — confirm.
ppo_agent.training(env, tmax, tmax, episode)

0.4054999909363687 0.15993600000000002 <function <lambda>.<locals>.<lambda> at 0x7fee69843d90> 0.009998


TypeError: unsupported format string passed to function.__format__

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import beta
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Policy_Value_Network(nn.Module):
    """Actor-critic wrapper: a ``Policy_Network`` plus a linear value head.

    The value head reads the ``2*action_size`` wide feature tensor that
    ``Policy_Network.forward`` returns next to its Beta distribution.
    ``forward`` returns ``(beta_dist, values)`` with one value per input row.
    """

    def __init__(self, input_size, action_size):
        super().__init__()
        # Policy trunk and Beta heads.
        self.policy_network = Policy_Network(input_size, action_size)
        # Scalar state-value head on top of the policy features.
        self.fcval = nn.Linear(2 * action_size, 1)

    def forward(self, x):
        action_dist, features = self.policy_network(x)
        return action_dist, self.fcval(features)

In [5]:
from a2c import A2C_PPO_LOSS
from itertools import chain
import torch.optim as optim
# Hyper-parameters for the A2C (PPO-loss) run.
learning_rate = 1e-3
episode = 10000
tmax = 1000
# 33 and 4 match the per-agent observation and action sizes reported by the Unity brain.
state_dim = 33
action_dim = 4

# Initializing policy network, optimizer and agent
policy_value_network = Policy_Value_Network(state_dim,action_dim).to(device)
optimizer = optim.Adam(policy_value_network.parameters(), lr=learning_rate)
# The three "floored" schedules below decay geometrically and use max(...) so the
# second argument acts as a lower bound. With min(...) the value would jump
# straight to the floor on the first call (every *_start is above its floor),
# which contradicts the gradual decay visible in the recorded training log.
a2c_agent = A2C_PPO_LOSS(policy_value_network,
                 optimizer,
                 value_loss_coef=1,
                 n_boot_strap=1,
                 normalize_advantage =True,
                 use_gae_advantage = True,
                 # GAE lambda: decays by 0.9999 per call, never below 0.3
                 lambda_bootstrap_schedule = lambda old_value, rewards: max(0.9999*old_value, 0.3),
                 lambda_bootstrap_start=0.8,
                 # value clipping epsilon: decays by 0.9996 per call, never below 0.1
                 ppo_value_epsilon_schedule = lambda old_value, rewards: max(0.9996*old_value, 0.1),
                 ppo_value_epsilon_start = 0.5,
                 SGD_steps=4,
                 # policy clipping epsilon: decays by 0.9996 per call, never below 0.03
                 ppo_policy_epsilon_schedule=lambda old_value, rewards: max(0.9996*old_value, 0.03),
                 ppo_policy_epsilon_start=0.15,
                 # each call multiplies the entropy coefficient by 0.9999
                 entropy_reg_schedule = lambda old_value, rewards: 0.9999*old_value,
                 entropy_reg_start=0.01,
                 # each call shrinks (1 - gamma) by a factor 0.9999, so gamma moves towards 1
                 gamma_schedule = lambda old_value, rewards: 1.- 0.9999*(1. - old_value),
                 gamma_start = 0.985,
                 target_score=30,
                 target_score_window=100,
                 # affine map x -> 2x - 1 (takes [0, 1] onto [-1, 1])
                 action_transform = lambda x: -1+2*x,
                 verbosity = 100)
# NOTE(review): tmax is passed twice; the meaning of the 2nd and 3rd positional
# arguments of training() is not visible here — confirm.
a2c_agent.training(env, tmax, tmax, episode)

 Episode : 100	 Average reward in last 100 episode : 0.90 epsilon_policy : 0.144 epsilon_value : 0.480 lambda_gae : 0.792 gamma : 0.985 entropy_reg : 0.010
 Episode : 200	 Average reward in last 100 episode : 3.12 epsilon_policy : 0.138 epsilon_value : 0.462 lambda_gae : 0.784 gamma : 0.985 entropy_reg : 0.010
 Episode : 300	 Average reward in last 100 episode : 4.75 epsilon_policy : 0.133 epsilon_value : 0.443 lambda_gae : 0.776 gamma : 0.985 entropy_reg : 0.010
 Episode : 400	 Average reward in last 100 episode : 6.00 epsilon_policy : 0.128 epsilon_value : 0.426 lambda_gae : 0.769 gamma : 0.986 entropy_reg : 0.010
 Episode : 500	 Average reward in last 100 episode : 8.31 epsilon_policy : 0.123 epsilon_value : 0.409 lambda_gae : 0.761 gamma : 0.986 entropy_reg : 0.010
 Episode : 600	 Average reward in last 100 episode : 10.45 epsilon_policy : 0.118 epsilon_value : 0.393 lambda_gae : 0.753 gamma : 0.986 entropy_reg : 0.009
 Episode : 700	 Average reward in last 100 episode : 11.89 epsi

[0.17049999618902803,
 0.23499999474734068,
 0.2269999949261546,
 0.18099999595433475,
 0.1719999961555004,
 0.10449999766424298,
 0.14799999669194222,
 0.2079999953508377,
 0.15349999656900765,
 0.13099999707192184,
 0.2404999946244061,
 0.2249999949708581,
 0.26049999417737124,
 0.3069999931380153,
 0.22949999487027525,
 0.4274999904446304,
 0.41849999064579607,
 0.3439999923110008,
 0.48799998909235,
 0.37499999161809683,
 0.6789999848231674,
 0.4779999893158674,
 0.4399999901652336,
 0.5199999883770943,
 0.5094999886117876,
 0.5124999885447323,
 0.6459999855607748,
 0.4604999897070229,
 0.5684999872930347,
 0.5899999868124723,
 0.6644999851472676,
 0.5954999866895377,
 0.6664999851025641,
 0.7574999830685556,
 0.6034999865107238,
 0.7619999829679728,
 0.6694999850355089,
 0.6724999849684536,
 0.8354999813251197,
 0.9324999791570008,
 0.8179999817162752,
 0.580999987013638,
 0.7599999830126762,
 0.9464999788440764,
 1.1764999737031758,
 0.9384999790228903,
 0.6034999865107238,
 1.00

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import beta
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Policy_Value_Network(nn.Module):
    """Shared-trunk actor-critic with a Beta policy head and a scalar value head.

    Architecture: ``input_size -> 64 -> 64`` (ReLU after both layers); the
    64-wide hidden vector feeds three linear heads: two of width
    ``action_size`` for the Beta concentration parameters and one of width 1
    for the state value.

    ``forward`` returns ``(beta_dist, value)``.
    """

    def __init__(self, input_size, action_size):
        super().__init__()
        hidden_size = 64
        # Shared trunk.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # Heads producing the Beta likelihood parameters and the state value.
        self.fcalpha = nn.Linear(hidden_size, action_size)
        self.fcbeta = nn.Linear(hidden_size, action_size)
        self.fcval = nn.Linear(hidden_size, 1)
        self.softplus_act = nn.Softplus()

    def forward(self, x):
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        # softplus(.) + 1 keeps both concentration parameters at or above 1.
        concentration_a = self.softplus_act(self.fcalpha(hidden)) + 1.
        concentration_b = self.softplus_act(self.fcbeta(hidden)) + 1.
        action_dist = beta.Beta(concentration_a, concentration_b)
        return action_dist, self.fcval(hidden)

### Defining function to collect trajectories

In [None]:
def collect_trajectories(policy_value, env, tmax):
    """Roll out one episode and gather the tensors needed for an update.

    Parameters
    ----------
    policy_value : callable
        Called with a float32 tensor of states; must return ``(dist, values)``
        where ``dist`` provides ``sample()`` and ``log_prob()``.
    env : object
        Must provide ``reset_env()``, ``take_action(actions)`` returning
        ``(next_states, rewards, dones)`` and an ``nagents`` attribute.
    tmax : int
        Largest step index. The loop runs for ``t = 0 .. tmax`` inclusive
        (i.e. up to ``tmax + 1`` environment steps) and stops earlier as soon
        as any entry of ``dones`` is truthy.

    Returns
    -------
    tuple
        ``(states, actions, rewards, log_probs, values, scores)``, in the
        order in which the notebook's call sites unpack the result:

        * ``states``    - every visited state tensor plus the final next
          state, concatenated along dim 0.
        * ``actions``   - the raw samples drawn from ``dist`` (before the
          ``-1 + 2*x`` rescaling sent to the environment), concatenated
          along dim 0.
        * ``rewards``   - one row per step, followed by one extra row holding
          the masked bootstrap values of the final state.
        * ``log_probs`` - per-step log-probabilities (summed over dim 1 of
          ``dist.log_prob``), concatenated along dim 0.
        * ``values``    - one row per step (the critic output transposed),
          followed by the masked bootstrap row.
        * ``scores``    - NumPy array with the undiscounted reward sum of each
          agent.
    """
    states_list, actions_list, rewards_list = [], [], []
    log_probs_list, values_list = [], []

    states = env.reset_env()                             # reset the environment
    scores = np.zeros(env.nagents)
    t = 0
    while t <= tmax:
        states_tensor = torch.tensor(states, dtype=torch.float32, device=device)
        states_list.append(states_tensor)                # storing states
        dists, values = policy_value(states_tensor)      # action distribution and state values

        # sample actions and store their log-probability under the current policy
        samples = dists.sample()
        log_probs_list.append(torch.sum(dists.log_prob(samples), 1))

        # the raw samples are stored; the environment receives -1 + 2 * sample
        actions_list.append(samples)
        actions = -1. + 2. * samples.detach().cpu().numpy()

        # generating rewards by interacting with environment
        next_states, rewards, dones = env.take_action(actions)
        rewards_list.append(
            torch.tensor(rewards, dtype=torch.float32, device=device)[None, :])

        # storing values for each state as a single row
        values_list.append(torch.transpose(values, 0, 1))
        scores += np.array(rewards)                      # update the score (for each agent)
        states = next_states                             # roll over states to next time step
        if np.any(dones):                                # exit loop if episode finished
            break
        t += 1

    # Bootstrap from the state reached after the last step; agents flagged as
    # done get a bootstrap value of zero.
    next_states_tensor = torch.tensor(next_states, dtype=torch.float32, device=device)
    states_list.append(next_states_tensor)
    dists, values = policy_value(next_states_tensor)
    dones_tensor = torch.tensor(dones, dtype=torch.float32, device=device)[None, :]
    values = torch.transpose(values, 0, 1)
    values = torch.where(dones_tensor == 0, values, torch.zeros_like(values))

    # The bootstrap row closes both the value and the reward sequence.
    values_list.append(values)
    rewards_list.append(values)

    # Order fixed to (..., rewards, log_probs, values, scores): the previous
    # order (..., rewards, values, log_probs, scores) silently swapped values
    # and log_probs at every call site.
    return (torch.cat(states_list, 0), torch.cat(actions_list, 0),
            torch.cat(rewards_list, 0), torch.cat(log_probs_list, 0),
            torch.cat(values_list, 0), scores)

In [None]:
# Short rollout (tmax=100) used to inspect what collect_trajectories returns.
# NOTE(review): confirm that this unpacking order matches the order of the
# tuple returned by collect_trajectories.
states, actions,rewards, log_prob, values, scores = collect_trajectories(policy_value_network, env, 100)

In [None]:
# Display the shape of every object unpacked from the rollout.
states.shape, actions.shape, rewards.shape, log_prob.shape, values.shape, scores.shape

In [None]:
def total_future_rewards(R, gamma):
    """Discounted reward-to-go for every time step.

    For each index ``t`` along axis 0 the result holds
    ``sum_{k >= t} gamma**(k - t) * R[k]``.

    The sum is accumulated backwards (``G[t] = R[t] + gamma * G[t + 1]``)
    instead of multiplying by ``gamma**i`` and dividing by it again. The
    divide-based form returns NaN when ``gamma == 0`` and when ``gamma**i``
    underflows to zero on long trajectories.

    Parameters
    ----------
    R : array-like
        Rewards with time along axis 0 (e.g. shape ``(T, n_agents)``).
    gamma : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``R``.
    """
    # Fresh float64 copy so the caller's array is never modified.
    returns = np.array(R, dtype=np.float64)
    for t in range(returns.shape[0] - 2, -1, -1):
        returns[t] += gamma * returns[t + 1]
    return returns
    
def kstep_boot_strapped_advantage(Rf, n_bootstrap, V, gamma):
    """n-step bootstrapped advantage computed from reward-to-go and values.

    For every step ``t`` that has a successor ``t + n_bootstrap``:

        A[t] = Rf[t] + gamma**n * (V[t + n] - Rf[t + n]) - V[t]

    ``Rf[t] - gamma**n * Rf[t + n]`` is the discounted sum of the next ``n``
    rewards, so this is the n-step return bootstrapped with ``V[t + n]``
    minus the baseline ``V[t]``. The result has ``n_bootstrap`` fewer rows
    than the inputs.
    """
    discount_n = gamma ** n_bootstrap
    tail_correction = V[n_bootstrap:] - Rf[n_bootstrap:]
    bootstrapped_return = Rf[:-n_bootstrap] + discount_n * tail_correction
    return bootstrapped_return - V[:-n_bootstrap]

In [None]:
# Scratch check: recompute the discounted reward-to-go with torch operations
# and compare it with the NumPy helper; the displayed sum of differences
# should be (close to) zero when both agree.
R = rewards
gamma = 0.99
discounts = torch.tensor([[gamma**i] for i in range(R.shape[0])], dtype=torch.float64, device=device)
dis_R = R*discounts
# suffix sum along dim 0 written as: x + total - prefix_sum(x)
tot_dis_f_R = dis_R + dis_R.sum(dim=0, keepdims=True) - dis_R.cumsum(dim=0)
tot_dis_f_R = tot_dis_f_R/discounts
# NOTE(review): .numpy() is called without .cpu(), so this presumably only
# works when `device` is the CPU — confirm.
A = tot_dis_f_R.detach().numpy()

B = total_future_rewards(R.detach().numpy(), gamma)
(A-B).sum()

In [None]:
# Shape of the first entry (index 0 along the leading dimension) of each rollout output.
states[0].shape, actions[0].shape, rewards[0].shape, log_prob[0].shape

In [None]:
# Shape of the done flags after adding a leading axis of size 1.
# NOTE(review): `dones` is not assigned at notebook scope in any visible cell
# (it is only a local inside collect_trajectories), so this cell presumably
# raises NameError on a fresh kernel — confirm or remove.
torch.tensor(dones, dtype=torch.float32, device=device)[None,:].shape

In [None]:
# Shape of the first entry of `values`.
values[0].shape

In [None]:
import numpy as np
from collections import deque
# training loop max iterations
episode = 1500
discount_rate = .99
epsilon = 0.1
beta_val = .01
tmax = 1000
SGD_epoch = 4
learning_rate = 1e-4
score_window_len = 100
# 33 and 4 match the per-agent observation and action sizes reported by the Unity brain.
state_dim = 33
action_dim = 4
max_score = 30
n_bootstrap = 5

# Initializing policy and value network, and optimizer
# (the class defined in this notebook is Policy_Value_Network; the previous
# name Policy_Value is not defined anywhere and raised NameError)
policy_value_network = Policy_Value_Network(state_dim,action_dim).to(device)

# One rollout with the freshly initialised network, then the mean score over agents.
states, actions, rewards, log_probs, values,  scores = \
            collect_trajectories(policy_value_network, env, tmax)
scores.mean()

In [None]:
# Compare the shapes of the rewards and values unpacked from the rollout.
print(rewards.shape, values.shape)

### Defining k-step bootstrap reward

In [None]:
def total_future_rewards(R, gamma):
    """Discounted reward-to-go for every time step.

    For each index ``t`` along axis 0 the result holds
    ``sum_{k >= t} gamma**(k - t) * R[k]``.

    The sum is accumulated backwards (``G[t] = R[t] + gamma * G[t + 1]``)
    instead of multiplying by ``gamma**i`` and dividing by it again. The
    divide-based form returns NaN when ``gamma == 0`` and when ``gamma**i``
    underflows to zero on long trajectories.

    Parameters
    ----------
    R : array-like
        Rewards with time along axis 0 (e.g. shape ``(T, n_agents)``).
    gamma : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``R``.
    """
    # Fresh float64 copy so the caller's array is never modified.
    returns = np.array(R, dtype=np.float64)
    for t in range(returns.shape[0] - 2, -1, -1):
        returns[t] += gamma * returns[t + 1]
    return returns
    
def kstep_boot_strapped_advantage(Rf, n_bootstrap, V, gamma):
    """n-step bootstrapped advantage computed from reward-to-go and values.

    For every step ``t`` that has a successor ``t + n_bootstrap``:

        A[t] = Rf[t] + gamma**n * (V[t + n] - Rf[t + n]) - V[t]

    ``Rf[t] - gamma**n * Rf[t + n]`` is the discounted sum of the next ``n``
    rewards, so this is the n-step return bootstrapped with ``V[t + n]``
    minus the baseline ``V[t]``. The result has ``n_bootstrap`` fewer rows
    than the inputs.
    """
    n = n_bootstrap
    head_returns, head_values = Rf[:-n], V[:-n]
    tail_returns, tail_values = Rf[n:], V[n:]
    advantage = head_returns + (gamma ** n) * (tail_values - tail_returns) - head_values
    return advantage

### Defining Advantage Actor-Critic update function

In [None]:
from datetime import datetime
def update_A2C(optimizer, 
               policy_value_network, 
               old_log_probs, 
               states, 
               actions, 
               rewards,
               values,
               gamma = 0.995, 
               epsilon=0.1, 
               beta_val=0.01, 
               n_bootstrap=5):
    """Run one optimisation step of the A2C objective with a PPO-clipped policy loss.

    The loss is ``policy_loss + value_loss`` where

    * ``policy_loss`` is the negated mean of the PPO clipped surrogate
      ``min(ratio * A, clip(ratio, 1 - epsilon, 1 + epsilon) * A)`` plus
      ``beta_val`` times the policy entropy, with ``A`` the n-step
      bootstrapped advantage;
    * ``value_loss`` is the mean squared error between the predicted values
      and the discounted reward-to-go.

    Parameters
    ----------
    optimizer : torch optimizer over ``policy_value_network``'s parameters.
    policy_value_network : module returning ``(dist, values)`` for a state batch.
    old_log_probs, states, actions, rewards, values :
        Rollout data indexed by time step along the first axis.
        NOTE(review): the code below slices these per step, calls
        ``np.concatenate`` / ``torch.cat`` on the slices and feeds ``rewards``
        and ``values`` to NumPy helpers, so it presumably expects per-step
        sequences / NumPy arrays rather than already-concatenated tensors —
        confirm against what the rollout function returns.
    gamma : float
        Discount factor.
    epsilon : float
        PPO clipping range.
    beta_val : float
        Entropy bonus coefficient.
    n_bootstrap : int
        Number of steps before bootstrapping with the value estimate (>= 1).

    Returns
    -------
    The same ``policy_value_network`` object, after ``optimizer.step()``.
    """
    # Only steps t with a successor t + n_bootstrap have an advantage. The
    # log-prob and action sequences are one step shorter than rewards/values
    # (which carry a bootstrap row), hence "len - n_bootstrap + 1". Computing
    # the stop index explicitly avoids the `[:-n_bootstrap+1]` pitfall, which
    # becomes `[:0]` (empty) when n_bootstrap == 1.
    n_log_prob_steps = max(len(old_log_probs) - n_bootstrap + 1, 0)
    n_action_steps = max(len(actions) - n_bootstrap + 1, 0)

    # old probs
    old_log_probs = torch.tensor(old_log_probs[:n_log_prob_steps].flatten(),
                                 dtype=torch.float32, device=device)

    # total future reward for value function update
    Rf = total_future_rewards(rewards, gamma)
    RF = torch.tensor(Rf[:-n_bootstrap].flatten(), dtype=torch.float32, device=device)

    # advantage calculation for policy update
    A = kstep_boot_strapped_advantage(Rf, n_bootstrap, values, gamma)
    A = torch.tensor(A.flatten(), dtype=torch.float32, device=device)

    # actions: map from [-1, 1] back to the [0, 1] support of the Beta policy
    actions = (np.concatenate(actions[:n_action_steps]) + 1.)/2.
    actions = torch.tensor(actions, dtype=torch.float32, device=device)

    # convert states to prob and logprob
    states = torch.cat(states[:-n_bootstrap], 0)
    dist, pred_values = policy_value_network(states)

    # PPO loss on policy. The advantage must multiply both terms *inside* the
    # min: A * min(ratio, clipped) picks the wrong branch when A is negative.
    new_log_probs = torch.sum(dist.log_prob(actions), 1)
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped_ratio = torch.clamp(ratio, 1.-epsilon, 1.+epsilon)
    surr_clipped_erew = torch.min(ratio*A, clipped_ratio*A)
    entropy = torch.sum(dist.entropy(), 1)
    policy_loss = -torch.mean(surr_clipped_erew + beta_val*entropy)

    # Value function loss
    value_loss = torch.mean(torch.pow(RF - pred_values[:, 0], 2))

    # total loss
    total_loss = policy_loss + value_loss

    # updating policy network
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    return policy_value_network

In [None]:
import numpy as np
from collections import deque
# training loop max iterations
episode = 1500
discount_rate = .99
epsilon = 0.1
beta_val = .01
tmax = 1000
SGD_epoch = 4
learning_rate = 1e-4
score_window_len = 100
# 33 and 4 match the per-agent observation and action sizes reported by the Unity brain.
state_dim = 33
action_dim = 4
max_score = 30
n_bootstrap = 5

# Initializing policy and value network, and optimizer
# (the class defined in this notebook is Policy_Value_Network; the previous
# name Policy_Value is not defined anywhere and raised NameError)
policy_value_network = Policy_Value_Network(state_dim,action_dim).to(device)
import torch.optim as optim
optimizer = optim.Adam(policy_value_network.parameters(), lr=learning_rate)

# keep track of progress
mean_rewards = []
mean_rewards_window = deque(maxlen=score_window_len)
for e in range(episode):

    # collect trajectories (no gradients are needed while sampling)
    policy_value_network.eval()
    with torch.no_grad():
        states, actions, rewards, log_probs, values,  scores = \
            collect_trajectories(policy_value_network, env, tmax)

    # several optimisation steps on the same batch of trajectories
    policy_value_network.train()
    with torch.enable_grad():
        for _ in range(SGD_epoch):
            policy_value_network = update_A2C(optimizer,
                   policy_value_network,
                   log_probs,
                   states,
                   actions,
                   rewards,
                   values,
                   gamma = discount_rate,
                   epsilon=epsilon,
                   beta_val=beta_val,
                   n_bootstrap=n_bootstrap)

    # the clipping parameter reduces as time goes on
    epsilon*=.995

    # the regulation term also reduces
    # this reduces exploration in later runs
    beta_val*=.995

    # get the average reward of the parallel environments
    mean_of_all_agents = np.mean(scores)
    mean_rewards.append(mean_of_all_agents)
    mean_rewards_window.append(mean_of_all_agents)
    avg_of_last_x_episodes = np.mean(mean_rewards_window)

    # overwrite the progress line every episode; keep a permanent line every 50 episodes
    print(f"\r Episode : {e+1}\t Average reward in last 100 episode : {avg_of_last_x_episodes:.2f}",end="")
    if (e+1)%50 ==0 :
        print(f"\r Episode : {e+1}\t Average reward in last 100 episode : {avg_of_last_x_episodes:.2f}")

    # stop as soon as the windowed average reaches the target score
    if avg_of_last_x_episodes >= max_score:
        print(f"Environment solved in episodes = {e+1}")
        break

In [None]:
# Close the environment object created at the top of the notebook.
env.close()