In [1]:
!pip install pybullet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pybullet
  Downloading pybullet-3.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (91.7 MB)
[K     |████████████████████████████████| 91.7 MB 1.2 MB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.5


In [24]:
import pybullet_envs
from gym import make
from collections import deque
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
import random
import copy

# --- Environment and training budget ---
ENV_NAME = "AntBulletEnv-v0"    # id passed to gym's make()
TRANSITIONS = 1_000_000         # number of environment steps in a training run
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Optimisation ---
BATCH_SIZE = 256                # transitions sampled per gradient step
ACTOR_LR = 3e-4                 # Adam learning rate for the actor
CRITIC_LR = 3e-4                # learning rate intended for the critic optimizers

# --- Value-learning settings ---
GAMMA = 0.99                    # discount factor in the bootstrapped Q target
TAU = 0.005                     # mixing coefficient of the soft target update
NOISE = 0.2                     # std of the noise added to target actions
NOISE_CLIP = 0.5                # target-action noise is clamped to [-NOISE_CLIP, NOISE_CLIP]

In [25]:
DEVICE

'cuda'

In [26]:
def soft_update(target, source, tau=None):
    """Blend the parameters of `source` into `target` in place (Polyak averaging).

    Each target parameter becomes ``(1 - tau) * target + tau * source``.

    Args:
        target: module whose parameters are overwritten.
        source: module providing the new values; it is not modified.
        tau: mixing coefficient. When omitted, the module-level ``TAU`` is used.
    """
    if tau is None:
        tau = TAU
    # no_grad() lets us write into leaf parameters without going through the
    # discouraged `.data` attribute, which bypasses autograd's safety checks.
    with torch.no_grad():
        for tp, sp in zip(target.parameters(), source.parameters()):
            tp.copy_((1 - tau) * tp + tau * sp)

class Actor(nn.Module):
    """Policy network mapping a state to an action with components in [-1, 1].

    Two hidden layers of 256 units with ELU activations, followed by a
    Tanh-squashed output layer of size `action_dim`.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 256
        # Layers are created in forward order so parameter names and
        # initialisation order match the sequential definition.
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ELU(),
            nn.Linear(hidden, hidden),
            nn.ELU(),
            nn.Linear(hidden, action_dim),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state):
        """Return the action predicted for `state`."""
        action = self.model(state)
        return action
        

class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with a single scalar.

    The state and action are concatenated along the last dimension and passed
    through two 256-unit ELU hidden layers and a linear output unit.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 256
        layers = [
            nn.Linear(state_dim + action_dim, hidden),
            nn.ELU(),
            nn.Linear(hidden, hidden),
            nn.ELU(),
            nn.Linear(hidden, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return the Q-values for the given pairs, flattened to one dimension."""
        joint_input = torch.cat((state, action), dim=-1)
        q_values = self.model(joint_input)
        return q_values.view(-1)


class TD3:
    """TD3-style actor-critic agent with twin critics, target networks and a replay buffer.

    Each call to `update` stores one transition. Once the buffer holds more
    than ``BATCH_SIZE * 16`` transitions, every call also performs one critic
    update; the actor and all target networks are refreshed on every
    ``POLICY_DELAY``-th training step.
    """

    # Number of critic updates per actor / target-network update.
    POLICY_DELAY = 2

    def __init__(self, state_dim, action_dim):
        """Build the networks, their optimizers, the target copies and the buffer.

        Args:
            state_dim: size of the state vector fed to the actor and critics.
            action_dim: size of the action vector produced by the actor.
        """
        self.step = 0  # counts training steps (not environment steps)
        self.action_dim = action_dim
        self.actor = Actor(state_dim, action_dim).to(DEVICE)
        self.critic_1 = Critic(state_dim, action_dim).to(DEVICE)
        self.critic_2 = Critic(state_dim, action_dim).to(DEVICE)

        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)
        # The critics use CRITIC_LR; they were previously wired to ACTOR_LR,
        # which left CRITIC_LR without any effect.
        self.critic_1_optim = Adam(self.critic_1.parameters(), lr=CRITIC_LR)
        self.critic_2_optim = Adam(self.critic_2.parameters(), lr=CRITIC_LR)

        # Target networks start as exact copies and then track the online
        # networks through soft updates.
        self.target_actor = copy.deepcopy(self.actor)
        self.target_critic_1 = copy.deepcopy(self.critic_1)
        self.target_critic_2 = copy.deepcopy(self.critic_2)

        # Bounded FIFO buffer: the oldest transitions are dropped when full.
        self.replay_buffer = deque(maxlen=200000)

    def _sample_batch(self):
        """Sample BATCH_SIZE transitions uniformly with replacement.

        Returns:
            Tuple ``(state, action, next_state, reward, done)`` of float
            tensors on DEVICE, one entry per field of the stored transitions.
        """
        transitions = random.choices(self.replay_buffer, k=BATCH_SIZE)
        return tuple(
            torch.tensor(np.array(column), device=DEVICE, dtype=torch.float)
            for column in zip(*transitions)
        )

    def update(self, transition):
        """Store `transition` and, after warm-up, run one training step.

        Args:
            transition: tuple ``(state, action, next_state, reward, done)``.
        """
        self.replay_buffer.append(transition)
        # Warm-up: do not train until the buffer is reasonably filled.
        if len(self.replay_buffer) <= BATCH_SIZE * 16:
            return

        state, action, next_state, reward, done = self._sample_batch()

        # Build the critic target without tracking gradients.
        with torch.no_grad():
            # Clipped Gaussian noise on the target action.
            noise = (
                torch.randn_like(action) * NOISE
            ).clamp(-NOISE_CLIP, NOISE_CLIP)
            next_action = (self.target_actor(next_state) + noise).clamp(-1, 1)

            # Take the smaller of the two target critics' estimates.
            q_target_1 = self.target_critic_1(next_state, next_action)
            q_target_2 = self.target_critic_2(next_state, next_action)
            q_target = torch.min(q_target_1, q_target_2)
            # No bootstrapping from transitions flagged as done.
            q_target = reward + (1 - done) * GAMMA * q_target

        # Current Q estimates of both critics.
        q_curr_1 = self.critic_1(state, action)
        q_curr_2 = self.critic_2(state, action)

        # Both critics regress onto the same target.
        critic_loss = F.mse_loss(q_curr_1, q_target) + F.mse_loss(q_curr_2, q_target)
        self.critic_1_optim.zero_grad()
        self.critic_2_optim.zero_grad()
        critic_loss.backward()
        self.critic_1_optim.step()
        self.critic_2_optim.step()

        if self.step % self.POLICY_DELAY == 0:
            # Delayed actor update: maximise critic_1's value of the actor's action.
            actor_loss = -self.critic_1(state, self.actor(state)).mean()
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            # Targets are refreshed only together with the actor.
            soft_update(self.target_critic_1, self.critic_1)
            soft_update(self.target_critic_2, self.critic_2)
            soft_update(self.target_actor, self.actor)
        self.step += 1

    def act(self, state):
        """Return the actor's noise-free action for a single state as a NumPy array."""
        with torch.no_grad():
            # Wrap the state in a batch of one, then strip the batch dimension again.
            state = torch.tensor(np.array([state]), dtype=torch.float, device=DEVICE)
            return self.actor(state).cpu().numpy()[0]

    def save(self, path="agent.pt"):
        """Write the actor's state dict to `path` (default: ``agent.pt``)."""
        torch.save(self.actor.state_dict(), path)


def evaluate_policy(env, agent, episodes=5):
    """Roll out `agent` in `env` and return the list of undiscounted episode returns.

    Args:
        env: environment whose `reset()` returns a state and whose `step(action)`
            returns a 4-tuple ``(state, reward, done, info)``.
        agent: object exposing `act(state)`.
        episodes: number of episodes to run.

    Returns:
        A list with one total reward per episode.
    """
    def run_episode():
        # Play one episode until the environment reports done.
        observation = env.reset()
        episode_return = 0.
        finished = False
        while not finished:
            observation, step_reward, finished, _ = env.step(agent.act(observation))
            episode_return += step_reward
        return episode_return

    return [run_episode() for _ in range(episodes)]

In [27]:
if __name__ == "__main__":
    # Separate environments for data collection and for evaluation, so that
    # evaluation rollouts never interrupt the training episode in progress.
    env = make(ENV_NAME)
    test_env = make(ENV_NAME)
    td3 = TD3(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
    state = env.reset()

    # Exploration schedule: the std of the Gaussian action noise decays
    # linearly from `eps` to `eps_final` over the first `eps_decay` steps.
    eps = 0.3
    eps_final = 0.01
    eps_decay = 40000

    # Evaluate 100 times per run; max() guards against a zero modulus when
    # TRANSITIONS is smaller than 100.
    eval_interval = max(1, TRANSITIONS // 100)

    # Start at -inf so the first evaluated policy is always checkpointed,
    # even when its mean return is zero or negative.
    best_mean_reward = float("-inf")

    for i in range(TRANSITIONS):
        # Gaussian exploration noise with a linearly decayed scale.
        current_eps = max(eps_final, eps + (eps_final - eps) * i / eps_decay)
        action = td3.act(state)
        action = np.clip(action + current_eps * np.random.randn(*action.shape), -1, +1)

        next_state, reward, done, _ = env.step(action)
        td3.update((state, action, next_state, reward, done))

        # Start a new episode as soon as the current one ends.
        state = next_state if not done else env.reset()

        if (i + 1) % eval_interval == 0:
            rewards = evaluate_policy(test_env, td3, 5)
            curr_mean_reward = np.mean(rewards)
            print(f"Step: {i+1}, Reward mean: {curr_mean_reward}, Reward std: {np.std(rewards)}")
            # Keep only the best-performing actor on disk.
            if curr_mean_reward > best_mean_reward:
                best_mean_reward = curr_mean_reward
                td3.save()

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Future gym versions will require that `Env.reset` can be passed a `seed` instead of using `Env.seed` for resetting the environment random number generator."
  "Future gym versions will require that `Env.reset` can be passed `return_info` to return information from the environment resetting."
  "Future gym versions will require that `Env.reset` can be passed `options` to allow the environment initialisation to be passed additional information."
  "Core environment is written in old step API which returns one bool instead of two. "


Step: 10000, Reward mean: 823.6651511056167, Reward std: 68.9889077710921
Step: 20000, Reward mean: 417.0514512907721, Reward std: 91.44780347592332
Step: 30000, Reward mean: 635.1486008710741, Reward std: 39.16137794665174
Step: 40000, Reward mean: 503.00647130109064, Reward std: 20.498248828465474
Step: 50000, Reward mean: 635.951560430877, Reward std: 95.86026994028924
Step: 60000, Reward mean: 667.6050148721954, Reward std: 88.41046195158626
Step: 70000, Reward mean: 323.6541433178499, Reward std: 210.62134849551862
Step: 80000, Reward mean: 715.8696499501409, Reward std: 80.95972528703433
Step: 90000, Reward mean: 546.5870284045623, Reward std: 75.39818396894313
Step: 100000, Reward mean: 616.125995510998, Reward std: 37.70215388774769
Step: 110000, Reward mean: 626.1739509273614, Reward std: 102.92350758546945
Step: 120000, Reward mean: 415.73375220213836, Reward std: 90.42867582332966
Step: 130000, Reward mean: 371.8444087451471, Reward std: 121.47444377661728
Step: 140000, Rewa

In [30]:
best_mean_reward

1987.9889183346186

In [None]:
# Fine-tuning run: keeps training the agent from the previous training cell
# with a small, constant exploration noise. It relies on `td3`, `test_env`
# and `best_mean_reward` left in the kernel by that cell.
env = make(ENV_NAME)
# The environment is new, so it must be reset before stepping; reusing the
# last `state` of the previous environment would feed the agent observations
# that do not belong to this simulation.
state = env.reset()

eps = 0.03  # constant std of the Gaussian action noise

# max() guards against a zero modulus when TRANSITIONS is smaller than 100.
eval_interval = max(1, TRANSITIONS // 100)

# `best_mean_reward` is deliberately NOT reset here: it carries over from the
# previous run so a worse policy cannot overwrite the saved checkpoint.
for i in range(TRANSITIONS):
    action = td3.act(state)
    action = np.clip(action + eps * np.random.randn(*action.shape), -1, +1)

    next_state, reward, done, _ = env.step(action)
    td3.update((state, action, next_state, reward, done))

    # Start a new episode as soon as the current one ends.
    state = next_state if not done else env.reset()

    if (i + 1) % eval_interval == 0:
        rewards = evaluate_policy(test_env, td3, 5)
        curr_mean_reward = np.mean(rewards)
        print(f"Step: {i+1}, Reward mean: {curr_mean_reward}, Reward std: {np.std(rewards)}")
        if curr_mean_reward > best_mean_reward:
            best_mean_reward = curr_mean_reward
            td3.save()

Step: 10000, Reward mean: 1302.7541447668261, Reward std: 670.4789759354582
Step: 20000, Reward mean: 1724.5521297061223, Reward std: 387.96100960828664
Step: 30000, Reward mean: 2069.7823975149886, Reward std: 64.99273975767048
Step: 40000, Reward mean: 2171.1697182519547, Reward std: 19.185764719687477
Step: 50000, Reward mean: 1973.9572492438958, Reward std: 39.73270416976789
Step: 60000, Reward mean: 951.2848406605632, Reward std: 187.80481065936905
Step: 70000, Reward mean: 2084.256168206799, Reward std: 31.690838274740496
Step: 80000, Reward mean: 1894.1454974606488, Reward std: 138.98636907478698
Step: 90000, Reward mean: 1988.284556893693, Reward std: 70.69332672906621
Step: 100000, Reward mean: 2139.6838390305866, Reward std: 34.7676465878369
Step: 110000, Reward mean: 1875.7850006866743, Reward std: 425.8956761319998
Step: 120000, Reward mean: 1604.5491780821196, Reward std: 483.7878310103452
Step: 130000, Reward mean: 1558.7788996211577, Reward std: 235.28109449424312
Step: 