# Deep Deterministic Policy Gradients

## Imports

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local scripts imported below) are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../scripts/")

from ddpg import *
from utils import *

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools
import functools

from collections import OrderedDict
from tqdm import tqdm

## Concepts

* DDPG is designed specifically for continuous action spaces.
* It learns an approximator for $Q(s, a)$ as well as one for the optimal action $a^*(s) = \arg \max_a Q(s, a)$.
* Because actions are continuous, we assume that $Q(s, a)$ is differentiable w.r.t. $a$, so the maximizing action can be found by gradient ascent instead of enumerating actions.
* So, we learn a deterministic policy $\mu: \mathcal{S} \rightarrow \mathcal{A}$ such that $\max_a Q(s, a) \approx Q(s, \mu(s))$.

In [3]:
# Sanity check for the value model: push a small batch of three
# (state, action) pairs through a freshly constructed network.

val_model = ValueModel(3, 1)

# One row per sample: three 3-dimensional states ...
states = torch.tensor(
    [[0.0, 0.0, 0.0],
     [1.0, 1.0, 1.0],
     [2.0, 2.0, 2.0]],
    dtype=torch.float32,
)
# ... paired with three 1-dimensional actions.
actions = torch.tensor([[7.0], [8.0], [9.0]], dtype=torch.float32)

q_values = val_model(states, actions)
print(q_values)

tensor([[0.3019],
        [0.1899],
        [0.0780]], grad_fn=<AddmmBackward>)


In [4]:
# Sanity check for the policy model, reusing the `states` batch built for
# the value-model check.
pol_model = PolicyModel(3, 1)

policy_actions = pol_model(states)
print(policy_actions)

tensor([[0.0208],
        [0.1753],
        [0.3622]], grad_fn=<AddmmBackward>)


In [5]:
def train(val_model, pol_model, val_opt, pol_opt):
    """Placeholder for a reusable training routine; currently a no-op.

    All four arguments are accepted but ignored, and the function always
    returns ``None``.
    """
    return None

In [6]:
# --- Configuration ---------------------------------------------------------
env = gym.make("Pendulum-v0")
seed = 0
render_env = True  # rendering every step is slow; set to False for faster training

# Read dimensions and action bounds from the environment instead of hard-coding them.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
act_low = torch.from_numpy(env.action_space.low.astype(np.float32))
act_high = torch.from_numpy(env.action_space.high.astype(np.float32))

update_after = 1000      # no gradient updates before this many environment steps
update_steps = 500       # run a round of updates every `update_steps` environment steps
max_ep_len = 1000
start_steps = 5000       # act uniformly at random for the first `start_steps` steps
num_update_iter = 10     # gradient updates per round
batch_size = 256
act_noise = 0.1          # std of the Gaussian exploration noise added to policy actions
rho = 0.995              # polyak coefficient: tgt <- rho * tgt + (1 - rho) * opt
discount_rate = 0.99
replay_buf = ReplayBuffer(1000000)

# Seed the RNGs used in this cell so runs are repeatable.
# NOTE(review): env.action_space.sample() may draw from its own RNG depending on
# the gym version -- seed it as well if full determinism is required.
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# --- Models ----------------------------------------------------------------
# the models we're optimizing
opt_val_model = ValueModel(state_dim, action_dim)
opt_pol_model = PolicyModel(state_dim, action_dim)

# the target models
tgt_val_model = ValueModel(state_dim, action_dim)
tgt_pol_model = PolicyModel(state_dim, action_dim)

# Targets must start as exact copies of the optimized networks; otherwise the
# first Bellman targets come from unrelated random weights.
tgt_val_model.load_state_dict(opt_val_model.state_dict())
tgt_pol_model.load_state_dict(opt_pol_model.state_dict())

# freeze target weights: they only change through polyak averaging
for param in itertools.chain(tgt_val_model.parameters(), tgt_pol_model.parameters()):
    param.requires_grad = False

# define optimizers
val_optim = optim.Adam(opt_val_model.parameters(), lr=0.001)
pol_optim = optim.Adam(opt_pol_model.parameters(), lr=0.0001)
total_r_running_avg = RunningAverage(100)


def _polyak_update(tgt_model, src_model, rho):
    """Move `tgt_model` towards `src_model`: tgt <- rho * tgt + (1 - rho) * src.

    Every entry of the target's state dict is blended with the entry of the
    same name in the source's state dict. Loading is strict, so a key mismatch
    raises instead of being silently ignored.
    """
    with torch.no_grad():
        src_state = src_model.state_dict()  # fetched once, not once per layer
        blended = OrderedDict(
            (name, rho * tgt_val + (1 - rho) * src_state[name])
            for name, tgt_val in tgt_model.state_dict().items()
        )
        tgt_model.load_state_dict(blended)


# --- Training loop ---------------------------------------------------------
step_cnt = 0
try:
    for ep in range(2000):

        s = env.reset()
        total_r = 0.0

        for _ in range(max_ep_len):
            step_cnt += 1

            if render_env:
                env.render()

            # pick an action: uniform random during warm-up, noisy policy afterwards
            if step_cnt < start_steps:
                a = torch.from_numpy(env.action_space.sample().astype(np.float32))
            else:
                with torch.no_grad():
                    # The policy takes a batch; take row 0 so the action has the
                    # same (action_dim,) shape as the random branch above.
                    a = opt_pol_model(
                        torch.from_numpy(np.array([s], dtype=np.float32))
                    )[0]
                a = a + act_noise * torch.randn_like(a)
                # keep the stored/executed action inside the valid action range
                a = torch.max(torch.min(a, act_high), act_low)

            # move one step
            sp, r, done, info = env.step(a)

            # Only a real terminal state should stop bootstrapping. Some gym
            # versions flag time-limit cut-offs via info["TimeLimit.truncated"];
            # when the key is absent this falls back to using `done` directly.
            truncated = bool(info.get("TimeLimit.truncated", False))
            d = 1.0 if (done and not truncated) else 0.0

            # float() works whether r is a one-element tensor or a plain number
            total_r += float(r)
            replay_buf.add_one((s, a, r, sp, d))

            # time to update the optimized networks and their targets
            if step_cnt % update_steps == 0 and step_cnt > update_after:
                for update_iter in range(num_update_iter):
                    sarsd_arr = list(replay_buf.get_arrays(batch_size))

                    val_optim.zero_grad()
                    pol_optim.zero_grad()

                    # value (critic) step
                    bl = bellman_loss(
                        sarsd_arr,
                        tgt_val_model,
                        tgt_pol_model,
                        opt_val_model,
                        discount_rate
                        )
                    bl.backward()
                    val_optim.step()

                    # policy (actor) step
                    pl = policy_loss(sarsd_arr, opt_val_model, opt_pol_model)
                    pl.backward()
                    pol_optim.step()

                    # polyak updating of target networks
                    _polyak_update(tgt_val_model, opt_val_model, rho)
                    _polyak_update(tgt_pol_model, opt_pol_model, rho)

            # advance the current state; without this every transition would
            # be recorded (and acted on) from the episode's initial state
            s = sp

            if done:
                break

        total_r_running_avg.update(total_r)
        print(f"ep={ep}, return={total_r:0.4f}, return_avg={total_r_running_avg.avg():0.4f}")
finally:
    # always release the environment, even if training is interrupted
    env.close()



ep=0, return=-1326.1620, return_avg=-1326.1620
ep=1, return=-966.2772, return_avg=-1146.2196
ep=2, return=-1656.6655, return_avg=-1316.3682
ep=3, return=-886.0697, return_avg=-1208.7936
ep=4, return=-979.1414, return_avg=-1162.8632
ep=5, return=-983.3390, return_avg=-1132.9425
ep=6, return=-1304.5672, return_avg=-1157.4603
ep=7, return=-1280.0197, return_avg=-1172.7802
ep=8, return=-940.7424, return_avg=-1146.9982
ep=9, return=-1678.6912, return_avg=-1200.1675
ep=10, return=-1735.3694, return_avg=-1248.8222
ep=11, return=-827.6727, return_avg=-1213.7264
ep=12, return=-964.5256, return_avg=-1194.5571
ep=13, return=-1326.8698, return_avg=-1204.0081
ep=14, return=-1057.9766, return_avg=-1194.2726
ep=15, return=-1749.2966, return_avg=-1228.9616
ep=16, return=-1191.5632, return_avg=-1226.7617
ep=17, return=-1182.8795, return_avg=-1224.3238
ep=18, return=-1385.1956, return_avg=-1232.7908
ep=19, return=-1560.3901, return_avg=-1249.1707
ep=20, return=-1523.6518, return_avg=-1262.2412
ep=21, re

KeyboardInterrupt: 

In [7]:
# Release the environment's resources; a safeguard for when the training
# cell is interrupted (see the KeyboardInterrupt in its output).
env.close()

In [8]:
# Inspect the state-dict entry names of the target value network.
tgt_val_model.state_dict().keys()

odict_keys(['f1.weight', 'f1.bias', 'f2.weight', 'f2.bias'])