In [1]:
import random
from typing import List, Tuple

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from torch.distributions import Normal

In [2]:
def initialize_uniformly(layer: nn.Linear, init_w: float = 3e-3) -> None:
    """Re-initialize a linear layer uniformly in [-init_w, init_w].

    Args:
        layer: Layer whose ``weight`` (and ``bias``, when it has one) is
            overwritten in place.
        init_w: Half-width of the symmetric sampling interval.
    """
    nn.init.uniform_(layer.weight, -init_w, init_w)
    # A layer built with ``bias=False`` exposes ``bias`` as None; skip it
    # instead of failing on the missing tensor.
    if layer.bias is not None:
        nn.init.uniform_(layer.bias, -init_w, init_w)


class Actor(nn.Module):
    """Gaussian policy: one hidden layer feeding a mean head and a log-std head."""

    def __init__(self, in_dim: int, out_dim: int):
        """Build the layers.

        Args:
            in_dim: Number of input features per state.
            out_dim: Number of action dimensions.
        """
        super().__init__()

        self.hidden1 = nn.Linear(in_dim, 128)
        self.mu_layer = nn.Linear(128, out_dim)
        self.log_std_layer = nn.Linear(128, out_dim)

        # Both output heads start with small uniform weights and biases.
        initialize_uniformly(self.mu_layer)
        initialize_uniformly(self.log_std_layer)

    def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, Normal]:
        """Sample an action for ``state``.

        Args:
            state: Input tensor whose last dimension matches ``in_dim``.

        Returns:
            A pair ``(action, dist)``: ``action`` is drawn from ``dist`` with
            ``sample()`` and ``dist`` is the ``Normal`` distribution built
            from the network outputs (use it for log-probabilities).
        """
        x = F.relu(self.hidden1(state))

        # tanh bounds the head to [-1, 1]; the factor 2 stretches it to [-2, 2].
        mu = torch.tanh(self.mu_layer(x)) * 2
        # NOTE(review): softplus is non-negative, so log_std >= 0 and the
        # resulting std is never below 1. Confirm this floor is intended.
        log_std = F.softplus(self.log_std_layer(x))
        std = torch.exp(log_std)

        dist = Normal(mu, std)
        action = dist.sample()

        return action, dist
    
class Critic(nn.Module):
    """State-value network: a single hidden layer followed by a scalar head."""

    def __init__(self, in_dim: int):
        """Create the hidden layer and the value head for ``in_dim`` features."""
        super().__init__()

        self.hidden1 = nn.Linear(in_dim, 128)
        self.out = nn.Linear(128, 1)

        # The value head starts with small uniform weights and bias.
        initialize_uniformly(self.out)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the value estimate for ``state``."""
        hidden = torch.relu(self.hidden1(state))
        return self.out(hidden)

In [7]:
# Use the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Create the Gym environment that the rest of the notebook steps through.
env_id = "Pendulum-v0"
env = gym.make(env_id)

In [35]:
# Run configuration for the single actor-critic update walked through below.
seed = 777
is_test = False  # True: act with the distribution mean and record no transition
transition = []  # filled step by step: state, log_prob, next_state, reward, done
gamma = 0.9  # discount applied to the bootstrapped next-state value
entropy_weight = 1e-2  # scale of the -log_prob term added to the policy loss

# Seed every RNG this notebook imports so that network initialization and
# action sampling repeat after Restart & Run All.
# NOTE(review): the gym environment keeps its own RNG and is not seeded here.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [13]:
# Start an episode and keep what reset() returns as the current state.
state = env.reset()

In [11]:
# Inspect the shape of the observation space.
env.observation_space.shape

(3,)

In [12]:
# Inspect the shape of the action space.
env.action_space.shape

(1,)

In [16]:
# Network input/output sizes: the leading dimension of each space.
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
obs_dim, action_dim

(3, 1)

In [17]:
# Build the policy and value networks and move them to the selected device.
actor = Actor(obs_dim, action_dim).to(device)
critic = Critic(obs_dim).to(device)

In [18]:
# One Adam optimizer per network; the critic's learning rate is 10x the actor's.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

In [14]:
# Convert the observation to a float32 tensor on the training device.
# torch.as_tensor accepts a NumPy array or an existing tensor, so this cell
# can be re-run safely (torch.from_numpy rejects a tensor input).
state = torch.as_tensor(state, dtype=torch.float32, device=device)

In [22]:
# Run the policy: a sampled action plus the distribution it was drawn from.
action, dist = actor(state)
action, dist

(tensor([-0.8187], device='cuda:0'),
 Normal(loc: tensor([-0.0029], device='cuda:0', grad_fn=<MulBackward0>), scale: tensor([1.9988], device='cuda:0', grad_fn=<ExpBackward0>)))

In [23]:
# Test mode acts with the distribution mean; training keeps the sampled action.
if is_test:
    selected_action = dist.mean
else:
    selected_action = action
selected_action

tensor([-0.8187], device='cuda:0')

In [25]:
# During training, start the transition with what the update needs later:
# the state and the log-probability of the chosen action, summed over the
# last (action) dimension.
if not is_test:
    log_prob = dist.log_prob(selected_action).sum(dim=-1)
    transition = [state, log_prob]
transition

[tensor([ 0.8697, -0.4935,  0.1179], device='cuda:0'),
 tensor(-1.6948, device='cuda:0', grad_fn=<SumBackward1>)]

In [26]:
# Clip the action to [-2, 2] and hand the environment a NumPy array.
# torch.as_tensor passes a tensor through and wraps an array, so re-running
# this cell on the already-converted value is harmless (calling .clamp on
# the ndarray would fail).
selected_action = np.clip(
    torch.as_tensor(selected_action).detach().cpu().numpy(), -2.0, 2.0
)
selected_action

array([-0.81873894], dtype=float32)

In [28]:
# Apply the action; step() is unpacked as four values and the last is ignored.
next_state, reward, done, _ = env.step(selected_action)
next_state, reward, done

(array([ 0.8370765, -0.5470859, -0.8801508], dtype=float32),
 -0.3008234222532113,
 False)

In [29]:
if not is_test:
    # Rebuild from the first two entries (state, log_prob) instead of
    # extending in place, so re-running this cell cannot grow the list past
    # the five items the unpacking cell expects.
    transition = [*transition[:2], next_state, reward, done]
transition

[tensor([ 0.8697, -0.4935,  0.1179], device='cuda:0'),
 tensor(-1.6948, device='cuda:0', grad_fn=<SumBackward1>),
 array([ 0.8370765, -0.5470859, -0.8801508], dtype=float32),
 -0.3008234222532113,
 False]

In [30]:
# Unpack the stored transition for the update; next_state comes back exactly
# as it was stored and is converted to a tensor in a later cell.
state, log_prob, next_state, reward, done = transition
state, log_prob, next_state, reward, done

(tensor([ 0.8697, -0.4935,  0.1179], device='cuda:0'),
 tensor(-1.6948, device='cuda:0', grad_fn=<SumBackward1>),
 array([ 0.8370765, -0.5470859, -0.8801508], dtype=float32),
 -0.3008234222532113,
 False)

In [31]:
# 1 while the episode continues, 0 once done; multiplies the bootstrap term below.
mask = 1 - done
mask

1

In [32]:
# Convert the next observation to a float32 tensor on the training device.
# torch.as_tensor accepts an array or a tensor, so the cell can be re-run
# after next_state has already been converted.
next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
next_state

tensor([ 0.8371, -0.5471, -0.8802], device='cuda:0')

In [44]:
# Scratch check: element-wise minimum of the two state tensors. The result is
# only displayed; it is not assigned or used by the cells below.
torch.min(state, next_state)

tensor([ 0.8371, -0.5471, -0.8802], device='cuda:0')

In [33]:
# Critic's value estimate for the current state (kept attached to the graph).
pred_value = critic(state)

In [34]:
# Display the critic's estimate.
pred_value

tensor([0.0072], device='cuda:0', grad_fn=<AddBackward0>)

In [36]:
# One-step target: reward plus the discounted critic value of next_state;
# mask removes the bootstrap term when the episode has ended.
targ_value = reward + gamma * critic(next_state) * mask
targ_value

tensor([-0.3020], device='cuda:0', grad_fn=<AddBackward0>)

In [37]:
# Smooth L1 loss against the detached target, so gradients flow only through
# pred_value.
value_loss = F.smooth_l1_loss(pred_value, targ_value.detach())
value_loss

tensor(0.0478, device='cuda:0', grad_fn=<SmoothL1LossBackward0>)

In [38]:
# Update the critic: clear old gradients, backpropagate the value loss, then
# take one optimizer step.
critic_optimizer.zero_grad()
value_loss.backward()
critic_optimizer.step()

In [39]:
# advantage = Q_t - V(s_t), detached so it acts as a constant weight
advantage = torch.sub(targ_value, pred_value).detach()
advantage

tensor([-0.3092], device='cuda:0')

In [40]:
# Policy-gradient loss: log-probability weighted by the detached advantage,
# negated because the optimizer minimizes.
policy_loss = -advantage * log_prob
policy_loss

tensor([-0.5241], device='cuda:0', grad_fn=<MulBackward0>)

In [41]:
# Total policy loss = policy-gradient term + weighted -log_prob term.
# It is recomputed from its inputs rather than accumulated with +=, so
# re-running this cell does not add the second term again.
# NOTE(review): -log_prob is a one-sample entropy estimate; adding it to a
# loss that is minimized pushes that estimate down. Confirm the intended sign.
policy_loss = -advantage * log_prob + entropy_weight * -log_prob
policy_loss

tensor([-0.5071], device='cuda:0', grad_fn=<AddBackward0>)

In [42]:
# Update the actor: clear old gradients, backpropagate the policy loss, then
# take one optimizer step.
actor_optimizer.zero_grad()
policy_loss.backward()
actor_optimizer.step()

In [43]:
# Plain Python floats of both losses for inspection.
policy_loss.item(), value_loss.item()

(-0.5071378946304321, 0.0478142611682415)