# Deep Deterministic Policy Gradients

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../scripts/")

from ddpg import *
from utils import *

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools
import functools

from collections import OrderedDict
from tqdm import tqdm

## Concepts

* DDPG is adapted specifically for continuous action spaces
* Learns an approximator for $Q$ as well as $a^*(s) = \arg \max_a Q(s, a)$
* Because actions are continuous, we assume that we can differentiate $Q$ w.r.t. $a$.
* So, we learn an approximator (the deterministic policy) $\mu: \mathcal{S} \rightarrow \mathcal{A}$ such that $\max_a Q(s, a) \approx Q(s, \mu(s))$.

In [3]:
# Sanity check of the value model: feed a tiny batch of three
# (state, action) pairs through Q(s, a) and print the raw output.

val_model = ValueModel(3, 1)

# Three 3-dimensional states, one per row.
states = torch.tensor(
    [[0, 0, 0],
     [1, 1, 1],
     [2, 2, 2]],
    dtype=torch.float32,
)

# One scalar action per state.
actions = torch.tensor([[7], [8], [9]], dtype=torch.float32)

q_values = val_model(states, actions)
print(q_values)

tensor([[-0.6547],
        [-1.0437],
        [-1.4348]], grad_fn=<AddmmBackward>)


In [4]:
# Sanity check of the policy model: map the same batch of states used for
# the value-model check to actions and print the raw output.
pol_model = PolicyModel(3, 1)

policy_actions = pol_model(states)
print(policy_actions)

tensor([[-0.2539],
        [-0.3184],
        [-0.5430]], grad_fn=<AddmmBackward>)


In [5]:
def train(val_model, pol_model, val_opt, pol_opt):
    """Placeholder for a reusable training routine; currently a no-op.

    Args:
        val_model: value (Q) model -- unused.
        pol_model: policy model -- unused.
        val_opt: optimizer for the value model -- unused.
        pol_opt: optimizer for the policy model -- unused.

    Returns:
        None. The function performs no work.
    """
    return None

In [None]:
# DDPG training loop on Pendulum-v0.
#
# Phases per environment step:
#   1. act (uniform-random during warm-up, noisy policy afterwards),
#   2. store the transition in the replay buffer,
#   3. every `update_steps` steps (once `update_after` steps were collected)
#      run `num_update_iter` gradient updates of critic + actor followed by a
#      polyak (soft) update of the target networks.

env = gym.make("Pendulum-v0")
state_dim = 3
action_dim = 1

# Clip to the bounds declared by the environment instead of hard-coding them.
act_low = env.action_space.low
act_high = env.action_space.high

update_after = 1000     # env steps to collect before the first gradient update
update_steps = 50       # run an update phase every `update_steps` env steps
num_update_iter = 50    # gradient steps per update phase

max_ep_len = 1000       # local cap on episode length
start_steps = 5000      # env steps of uniform-random actions (warm-up)
batch_size = 1024
act_noise = 0.1         # std-dev of the Gaussian exploration noise
rho = 0.995             # polyak coefficient: tgt <- rho * tgt + (1 - rho) * opt
discount_rate = 0.99
render_training = True  # set to False to train without opening a window (much faster)
replay_buf = ReplayBuffer(100000)
notified = False

# Effective horizon: the smaller of our own cap and the time limit registered
# for the environment (if the installed gym exposes one on the spec).
env_time_limit = getattr(env.spec, "max_episode_steps", None)
ep_horizon = min(max_ep_len, env_time_limit) if env_time_limit else max_ep_len

# the models we're optimizing
opt_val_model = ValueModel(state_dim, action_dim)
opt_pol_model = PolicyModel(state_dim, action_dim)

# the target models -- they must start as exact copies of the online models,
# otherwise the first Bellman targets come from unrelated random networks.
tgt_val_model = ValueModel(state_dim, action_dim)
tgt_pol_model = PolicyModel(state_dim, action_dim)
tgt_val_model.load_state_dict(opt_val_model.state_dict())
tgt_pol_model.load_state_dict(opt_pol_model.state_dict())


def _set_trainable(model, flag):
    """Enable/disable gradient tracking for every parameter of `model`."""
    for p in model.parameters():
        p.requires_grad = flag


def _polyak_update(opt_model, tgt_model, rho):
    """In-place soft update: tgt <- rho * tgt + (1 - rho) * opt."""
    with torch.no_grad():
        for opt_p, tgt_p in zip(opt_model.parameters(), tgt_model.parameters()):
            tgt_p.mul_(rho)
            tgt_p.add_((1.0 - rho) * opt_p)


# target networks are only ever changed through polyak averaging
_set_trainable(tgt_val_model, False)
_set_trainable(tgt_pol_model, False)

# define optimizers
val_optim = optim.Adam(opt_val_model.parameters(), lr=0.001)
pol_optim = optim.Adam(opt_pol_model.parameters(), lr=0.001)
total_r_running_avg = RunningAverage(50)

step_cnt = 0
try:
    for ep in range(2000):

        s = env.reset()
        total_r = 0

        for it in range(max_ep_len):
            step_cnt += 1

            if render_training:
                env.render()

            if step_cnt < start_steps:
                # warm-up: uniform-random exploration
                a = env.action_space.sample()
            else:
                if not notified:
                    print("Stopping random sampling...")
                    notified = True

                with torch.no_grad():
                    s_tensor = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
                    a = opt_pol_model(s_tensor).numpy()[0]
                # Gaussian exploration noise, scaled by the configured std-dev
                a += act_noise * np.random.randn(action_dim)
                a = np.clip(a, act_low, act_high)

            sp, r, done, info = env.step(a)

            # An episode that merely ran out of time did not reach a terminal
            # state: end the episode, but keep bootstrapping from `sp` by
            # storing terminal=0 in the replay buffer.
            hit_horizon = (it + 1) >= ep_horizon or bool(info.get("TimeLimit.truncated", False))
            terminal = 1.0 if (done and not hit_horizon) else 0.0
            episode_over = bool(done) or hit_horizon

            total_r += r
            replay_buf.add_one((s, a, [r], sp, [terminal]))

            # update the current state! for the next prediction!
            s = sp

            # time to update the online networks and soft-update the targets
            if step_cnt % update_steps == 0 and step_cnt > update_after:
                for update_iter in range(num_update_iter):
                    sarsd_arr = list(replay_buf.get_arrays(batch_size))

                    # --- critic step ---
                    val_optim.zero_grad()
                    bl = bellman_loss(
                        sarsd_arr,
                        tgt_val_model,
                        tgt_pol_model,
                        opt_val_model,
                        discount_rate
                        )
                    bl.backward()
                    val_optim.step()

                    # --- actor step (critic frozen so no grads are wasted on it) ---
                    _set_trainable(opt_val_model, False)

                    pol_optim.zero_grad()
                    pl = policy_loss(sarsd_arr, opt_val_model, opt_pol_model)
                    pl.backward()
                    pol_optim.step()

                    _set_trainable(opt_val_model, True)

                    # --- polyak updating of target networks ---
                    _polyak_update(opt_val_model, tgt_val_model, rho)
                    _polyak_update(opt_pol_model, tgt_pol_model, rho)

            if episode_over:
                break

        total_r_running_avg.update(total_r)
        print(f"ep={ep}, return={total_r:0.4f}, return_avg={total_r_running_avg.avg():0.4f}")
finally:
    # Always release the render window, even if training is interrupted.
    env.close()



ep=0, return=-1002.7034, return_avg=-1002.7034
ep=1, return=-875.1966, return_avg=-938.9500
ep=2, return=-817.7968, return_avg=-898.5656
ep=3, return=-1357.7819, return_avg=-1013.3697
ep=4, return=-886.2083, return_avg=-987.9374
ep=5, return=-1298.9955, return_avg=-1039.7804
ep=6, return=-897.8009, return_avg=-1019.4976
ep=7, return=-922.8276, return_avg=-1007.4139
ep=8, return=-925.9900, return_avg=-998.3668
ep=9, return=-1245.8507, return_avg=-1023.1152
ep=10, return=-1591.8053, return_avg=-1074.8143
ep=11, return=-869.0598, return_avg=-1057.6681
ep=12, return=-960.3049, return_avg=-1050.1786
ep=13, return=-992.1059, return_avg=-1046.0305
ep=14, return=-1074.4443, return_avg=-1047.9248
ep=15, return=-1110.5125, return_avg=-1051.8365
ep=16, return=-855.2943, return_avg=-1040.2752
ep=17, return=-1724.7123, return_avg=-1078.2995
ep=18, return=-1003.5343, return_avg=-1074.3645
ep=19, return=-1714.6352, return_avg=-1106.3780
ep=20, return=-1112.7312, return_avg=-1106.6806
ep=21, return=-1

In [None]:
# Close the training environment; useful when the training cell was
# interrupted before its own `env.close()` call ran.
env.close()

In [None]:
#torch.save(opt_val_model.state_dict(), "./value_model.pth")
#torch.save(opt_pol_model.state_dict(), "./policy_model.pth")

In [None]:
# Roll out the trained (noise-free) policy for 1000 rendered steps.
test_env = gym.make("Pendulum-v0")

try:
    s_t = test_env.reset()

    for _ in range(1000):
        test_env.render()

        # Greedy action from the learned policy; no gradients needed here.
        with torch.no_grad():
            s_tensor = torch.as_tensor(s_t, dtype=torch.float32).unsqueeze(0)
            a = opt_pol_model(s_tensor).numpy()[0]
        a = np.clip(a, test_env.action_space.low, test_env.action_space.high)

        sp_t, r_t, d_t, _ = test_env.step(a)

        # Calling step() after `done` is undefined in gym, so start a new
        # episode whenever the environment (or its time limit) ends one.
        s_t = test_env.reset() if d_t else sp_t
finally:
    # Release the render window even if the rollout is interrupted.
    test_env.close()