In [1]:
import random
from typing import List, Tuple

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from torch.distributions import Normal
from unityagents import UnityEnvironment
import numpy as np

In [2]:
def initialize_uniformly(layer: nn.Linear, init_w: float = 3e-3) -> None:
    """Re-initialize a linear layer in place with values drawn from U(-init_w, init_w).

    Args:
        layer: the linear layer whose parameters are overwritten.
        init_w: half-width of the uniform interval.

    The bias is skipped when the layer was built with ``bias=False`` (its
    ``bias`` attribute is then ``None``), instead of raising AttributeError.
    """
    # nn.init.uniform_ fills the tensor in place without recording autograd history.
    nn.init.uniform_(layer.weight, -init_w, init_w)
    if layer.bias is not None:
        nn.init.uniform_(layer.bias, -init_w, init_w)


class Actor(nn.Module):
    """Gaussian policy network: maps a state to a Normal distribution over actions.

    One hidden layer feeds two linear heads: one for the mean and one for the
    (log) standard deviation of an independent Normal per action dimension.
    """

    def __init__(self, in_dim: int, out_dim: int, hidden_dim: int = 128):
        """Build the network.

        Args:
            in_dim: size of the state vector.
            out_dim: size of the action vector.
            hidden_dim: width of the hidden layer; defaults to 128, the value
                that was previously hard-coded.
        """
        super(Actor, self).__init__()

        self.hidden1 = nn.Linear(in_dim, hidden_dim)
        self.mu_layer = nn.Linear(hidden_dim, out_dim)
        self.log_std_layer = nn.Linear(hidden_dim, out_dim)

        # Only the two output heads get the small uniform init; the hidden
        # layer keeps the default nn.Linear initialization.
        initialize_uniformly(self.mu_layer)
        initialize_uniformly(self.log_std_layer)

    def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, Normal]:
        """Sample an action for ``state``.

        Args:
            state: input tensor whose last dimension is ``in_dim``.

        Returns:
            ``(action, dist)`` where ``dist`` is the Normal policy distribution
            and ``action`` is a sample drawn from it with ``dist.sample()``
            (so no gradient flows through ``action`` itself).
        """
        x = F.relu(self.hidden1(state))

        # tanh keeps the mean of every action component inside [-1, 1].
        mu = torch.tanh(self.mu_layer(x))
        # softplus is non-negative, so std = exp(softplus(.)) is always >= 1.
        # NOTE(review): this puts a floor of 1 on the exploration noise —
        # confirm that is intended for this environment.
        log_std = F.softplus(self.log_std_layer(x))
        std = torch.exp(log_std)

        dist = Normal(mu, std)
        action = dist.sample()

        return action, dist
    
class Critic(nn.Module):
    """State-value network: one ReLU hidden layer followed by a scalar output head."""

    def __init__(self, in_dim: int):
        """Create the layers; ``in_dim`` is the size of the state vector."""
        super().__init__()

        self.hidden1 = nn.Linear(in_dim, 128)
        self.out = nn.Linear(128, 1)

        # Only the output head is re-initialized with small uniform values.
        initialize_uniformly(self.out)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the value estimate for ``state`` (last dimension has size 1)."""
        hidden = self.hidden1(state)
        return self.out(F.relu(hidden))

In [3]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [4]:
# Launch the Unity Reacher simulator from a local build. The path is relative to the
# notebook's working directory and points at a Windows executable.
env = UnityEnvironment(file_name='./Reacher_Windows_x86_One/Reacher_Windows_x86_64/Reacher.exe')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [5]:
# Run configuration shared by the cells below.
is_test = False          # False: act on the sampled action and record the transition
transition = list()      # filled with [state, log_prob, next_state, reward, done]
gamma = 0.9              # discount factor applied to the bootstrapped next-state value
entropy_weight = 0.01    # scale of the -log_prob term added to the policy loss

In [6]:
# Use the first brain exposed by the environment; brain_name is also the key
# used to index the results of env.reset below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [7]:
# Reset the environment in training mode and keep the info for the selected brain.
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents simulated in parallel.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Dimensionality of a single agent's action vector.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Observations arrive as a 2-D array: one row per agent, one column per state feature.
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [8]:
# Reset once more, this time with train_mode=False, and grab the starting observation.
# NOTE(review): is_test is False in this notebook, yet the reset here is in
# non-training mode — confirm which mode the update steps below should run in.
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # current observation for each agent
scores = np.zeros(num_agents)                          # per-agent score accumulator (not used in the cells shown)
states,num_agents

(array([[ 0.00000000e+00, -4.00000000e+00,  0.00000000e+00,
          1.00000000e+00, -0.00000000e+00, -0.00000000e+00,
         -4.37113883e-08,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00, -1.00000000e+01,
          0.00000000e+00,  1.00000000e+00, -0.00000000e+00,
         -0.00000000e+00, -4.37113883e-08,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00, -6.30408478e+00,
         -1.00000000e+00, -4.92529202e+00,  0.00000000e+00,
          1.00000000e+00,  0.00000000e+00, -5.33014059e-01]]),
 1)

In [9]:
# Display the initial observation array (one row per agent).
states

array([[ 0.00000000e+00, -4.00000000e+00,  0.00000000e+00,
         1.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -4.37113883e-08,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.00000000e+01,
         0.00000000e+00,  1.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -4.37113883e-08,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -6.30408478e+00,
        -1.00000000e+00, -4.92529202e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00, -5.33014059e-01]])

In [None]:
# Display the observation and action dimensionalities read from the environment.
state_size, action_size

In [None]:
# Network input/output sizes: the actor and critic consume one observation vector
# and the actor emits one action vector.
obs_dim = state_size
action_dim = action_size
obs_dim, action_dim

In [None]:
# Build the policy (actor) and value (critic) networks and move them to the selected device.
actor = Actor(obs_dim, action_dim).to(device)
critic = Critic(obs_dim).to(device)

In [None]:
# Separate Adam optimizers; the critic uses a 10x larger learning rate than the actor.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

In [None]:
# Convert the NumPy observation array to a float32 tensor on the compute device.
state = torch.from_numpy(states).float().to(device)
state

In [None]:
# Forward pass through the policy: a sampled action plus the Normal distribution it came from.
action, dist = actor(state)
action, dist

In [None]:
# In test mode act on the distribution mean; otherwise keep the sampled action.
if is_test:
    selected_action = dist.mean
else:
    selected_action = action
selected_action

In [None]:
# While training, start recording the transition with the state and the action's log-probability.
if not is_test:
    # Sum the per-dimension log-probabilities over the last (action) axis
    # to get one log-probability per row.
    log_prob = dist.log_prob(selected_action).sum(dim=-1)
    transition = [state, log_prob]
transition

In [None]:
# Bound the action before handing it to the simulator. The policy mean is
# tanh-bounded to [-1, 1], so the action is clamped to that same range (the
# previous [-2, 2] bound did not match the policy's output range).
# The result is detached and moved to the CPU as a NumPy array.
# NOTE: this rebinds selected_action from a tensor to an ndarray, so the cell
# cannot be re-run without first re-running the cell that selects the action.
selected_action = selected_action.clamp(-1.0, 1.0).cpu().detach().numpy()
selected_action

In [None]:
# unityagents' env.step returns a dict of BrainInfo objects keyed by brain name
# (the same structure env.reset returns above), not a gym-style 4-tuple, so the
# step result has to be indexed by brain_name and read field by field.
env_info = env.step(selected_action)[brain_name]
next_state = env_info.vector_observations  # NumPy array, one row per agent
# Rewards and done flags are per-agent sequences; turn them into (num_agents, 1)
# float tensors so that `1 - done` is defined and both broadcast against the
# critic's (num_agents, 1) output when the TD target is computed.
reward = torch.tensor(env_info.rewards, dtype=torch.float32, device=device).unsqueeze(-1)
done = torch.tensor(env_info.local_done, dtype=torch.float32, device=device).unsqueeze(-1)
next_state, reward, done

In [None]:
# While training, complete the transition record with the outcome of the step.
if not is_test:
    transition.extend([next_state, reward, done])  
transition

In [None]:
# Unpack the recorded transition into named variables for the update below.
# NOTE(review): this needs exactly five entries, which is only the case when the
# is_test == False branches above were taken.
state, log_prob, next_state, reward, done = transition
state, log_prob, next_state, reward, done

In [None]:
# Continuation mask: 1 while the episode goes on, 0 once done is set. It multiplies
# the bootstrapped next-state value in the TD target.
mask = 1 - done
mask

In [None]:
# Convert the next observation to a float tensor on the compute device so the critic can score it.
next_state = torch.FloatTensor(next_state).to(device)
next_state

In [None]:
# Critic's value estimate V(s_t) for the current state (keeps its autograd graph).
pred_value = critic(state)

In [None]:
# Display the value estimate computed in the previous cell.
pred_value

In [None]:
# One-step TD target: r + gamma * V(s_{t+1}), with the bootstrap term zeroed where mask is 0.
targ_value = reward + gamma * critic(next_state) * mask
targ_value

In [None]:
# Smooth-L1 loss between the prediction and the target. The target is detached, so
# gradients flow only through pred_value.
value_loss = F.smooth_l1_loss(pred_value, targ_value.detach())
value_loss

In [None]:
# Update the critic: clear stale gradients, backpropagate the value loss, take one Adam step.
critic_optimizer.zero_grad()
value_loss.backward()
critic_optimizer.step()

In [None]:
# advantage = Q_t - V(s_t), with the TD target standing in for Q_t.
# Both terms were computed before the critic step above; the result is detached so
# the policy loss does not backpropagate into the critic.
advantage = (targ_value - pred_value).detach()
advantage

In [None]:
# Policy-gradient loss: negative advantage-weighted log-probability of the taken action.
# NOTE(review): advantage comes from the critic head (last dim of size 1) while log_prob
# was summed over its last axis, so this product relies on broadcasting. With a single
# agent it has one element; with several agents it would presumably broadcast to a
# square matrix — confirm before running with more than one agent.
policy_loss = -advantage * log_prob
policy_loss

In [None]:
# Add the entropy_weight-scaled -log_prob term to the policy loss.
# NOTE: this is an in-place update, so re-running this cell adds the term again;
# re-run the cell that creates policy_loss first.
# NOTE(review): minimizing +entropy_weight * (-log_prob) pushes log_prob up —
# confirm this is the intended sign for an entropy regularizer.
policy_loss += entropy_weight * -log_prob
policy_loss

In [None]:
# Update the actor: clear stale gradients, backpropagate the policy loss, take one Adam step.
# backward() is called without arguments, so policy_loss must hold a single element.
actor_optimizer.zero_grad()
policy_loss.backward()
actor_optimizer.step()

In [None]:
# Report both losses as plain Python floats (each tensor must hold exactly one element).
policy_loss.item(), value_loss.item()