# Deep Deterministic Policy Gradient

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

from collections import namedtuple, deque

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
from buffer import ReplayBuffer
from model import PolicyNetwork, ValueNetwork

In [None]:
# apply matplotlib's 'ggplot' style sheet to figures drawn after this point
plt.style.use('ggplot')

## Set Configs

In [None]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
# replay buffer capacity and mini-batch size handed to ReplayBuffer
BUFFER_SIZE = 100_000
BATCH_SIZE = 32

# discount factor applied to the bootstrapped next-state value
GAMMA = 0.99

# interpolation weight used when blending online weights into the target networks
TAU = 0.01

# Adam learning rates for the actor and the critic
ACTOR_LR = 0.001
CRITIC_LR = 0.001

# a progress line is kept (newline printed) every PRINT_EVERY episodes
PRINT_EVERY = 1

## Set Environment

In [None]:
# environment id passed to gym.make; also embedded in the checkpoint file names used by train_agent
ENV_NAME = 'Pendulum-v0'
# build the environment, take its unwrapped core and seed it with 90
# (the trailing ';' presumably just suppresses the notebook's echo of env.seed's return value)
# NOTE(review): `.unwrapped` likely drops gym's TimeLimit wrapper, in which case episode
# length is bounded only by the caller's own step limit — confirm
env = gym.make(ENV_NAME).unwrapped; env.seed(90);

In [None]:
# show one rendered frame of a freshly reset environment
print('Environment Display:')
env.reset() # reset environment to a new, random state
env.render()

# report the observation and action spaces the agent will be sized from
print('State space {}'.format(env.observation_space))
print('Action space {}'.format(env.action_space))

## Define [DDPG](https://arxiv.org/pdf/1509.02971.pdf) Agent

In [None]:
class DDPGAgent():
    """Actor-critic agent trained with Deep Deterministic Policy Gradient.

    Holds an online and a target copy of both the policy (actor) and the
    value (critic) network, one Adam optimizer per online network, and a
    replay buffer. Every stored transition triggers one learning step as
    soon as the buffer holds more than BATCH_SIZE samples.

    NOTE(review): `act` returns the deterministic actor output; no
    exploration noise is added anywhere in this class.
    """

    def __init__(self, env, seed):
        """Build networks, optimizers and the replay buffer.

        Args:
            env: environment exposing `observation_space.shape` and
                `action_space.shape`; their first dimensions size the networks.
            seed: forwarded to every network and to the replay buffer.
        """
        self.env = env

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]

        self.gamma = GAMMA
        self.actor_lr = ACTOR_LR
        self.critic_lr = CRITIC_LR
        self.tau = TAU

        # initialize actor and critic networks
        self.actor = PolicyNetwork(self.state_size, self.action_size, seed).to(device)
        self.actor_target = PolicyNetwork(self.state_size, self.action_size, seed).to(device)

        self.critic = ValueNetwork(self.state_size, self.action_size, seed).to(device)
        self.critic_target = ValueNetwork(self.state_size, self.action_size, seed).to(device)

        # start BOTH targets as exact copies of their online networks, so the
        # first TD targets do not depend on how the networks seed themselves
        self._hard_update(self.actor_target, self.actor)
        self._hard_update(self.critic_target, self.critic)

        # set optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        # set helpers
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

    @staticmethod
    def _hard_update(target, source):
        """Overwrite every parameter of `target` with the matching one of `source`."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    @staticmethod
    def _soft_update(target, source, tau):
        """Blend `source` into `target`: target <- tau * source + (1 - tau) * target."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data * tau + target_param.data * (1.0 - tau))

    def memorize(self, state, action, reward, next_state, done):
        """Store one transition and run a learning step when enough data exists."""
        self.buffer.add(state, action, reward, next_state, done)

        # if enough samples are available in memory, get random subset and learn
        if len(self.buffer) > BATCH_SIZE:
            experiences = self.buffer.sample()
            self.learn(experiences, self.gamma)

    def act(self, state):
        """Return the actor's action for a single state as a numpy array.

        Args:
            state: one observation (anything `torch.FloatTensor` accepts).
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(device)

        # pure inference: no autograd graph is needed for action selection
        with torch.no_grad():
            action = self.actor(state)

        return action.squeeze(0).cpu().numpy()

    def learn(self, experiences, gamma):
        """Run one critic update, one actor update and a soft target update.

        Args:
            experiences: tuple `(states, actions, rewards, next_states, dones)`
                of batched tensors as produced by the replay buffer.
            gamma: discount factor for the bootstrapped target.
        """
        states, actions, rewards, next_states, dones = experiences

        # obtain Q expected
        Q_expected = self.critic(states, actions)

        # compute Q target from the target networks; nothing here needs gradients
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            Q_target_next = self.critic_target(next_states, next_actions)

            # terminal transitions must not bootstrap from the next state
            # assumes `dones` is a bool/numeric tensor shaped like `rewards` — TODO confirm
            not_done = 1.0 - dones.to(Q_target_next.dtype)
            Q_target = rewards + (gamma * Q_target_next * not_done)

        # compute loss
        Q_loss = F.mse_loss(Q_expected, Q_target)

        # minimize Q loss
        self.critic_optimizer.zero_grad()
        Q_loss.backward()
        self.critic_optimizer.step()

        # update actor: ascend the critic's value of the actor's own actions
        policy_loss = -self.critic(states, self.actor(states)).mean()

        # minimize policy loss
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        self._soft_update(self.actor_target, self.actor, self.tau)
        self._soft_update(self.critic_target, self.critic, self.tau)

    def save(self, actor_path, critic_path):
        """Write the online actor and critic weights (state dicts) to disk.

        Parent directories are created when missing.

        Args:
            actor_path: destination file for the actor's state dict.
            critic_path: destination file for the critic's state dict.
        """
        for path in (actor_path, critic_path):
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)

        torch.save(self.actor.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

In [None]:
# create the agent for the environment above; the seed is forwarded to its networks and replay buffer
agent = DDPGAgent(env, seed=90)

## Train The Agent

In [None]:
def train_agent(num_episodes=100, max_time=500, window_size=1, target_score=100.0):
    """Train the module-level `agent` on the module-level `env`.

    Each episode runs for at most `max_time` steps; every transition is
    handed to `agent.memorize`. Training stops early, saving the agent,
    once the mean score over the last `window_size` episodes reaches
    `target_score`.

    Args:
        num_episodes: maximum number of episodes to run.
        max_time: maximum number of environment steps per episode.
        window_size: number of most recent episodes averaged for the
            reported and the stopping score.
        target_score: average score at which the environment counts as solved.

    Returns:
        List with the total reward of every episode that was run.
    """
    scores = []
    scores_window = deque(maxlen=window_size)

    for i_episode in range(1, num_episodes + 1):

        state = env.reset()
        score = 0

        for _ in range(max_time):
            action = agent.act(state)
            next_state, reward, done, _info = env.step(action)

            agent.memorize(state, action, reward, next_state, done)
            state = next_state

            score += reward
            if done:
                break

        scores_window.append(score)
        scores.append(score)

        # computed once per episode and reused by every message below
        average_score = np.mean(scores_window)

        # '\r' + end='' redraws the same console line between kept lines
        print(f'\rEpisode: {i_episode}, Average Score: {average_score:.2f}', end='')

        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode: {i_episode}, Average Score: {average_score:.2f}')
        if average_score >= target_score:
            # episodes that preceded the averaging window which met the target
            solved_after = i_episode - len(scores_window)
            print(f'\nEnvironment solved in {solved_after:d} episodes! Average Score: {average_score:.2f}')
            # requires the agent to expose save(actor_path, critic_path)
            agent.save(f'./agents/ACTOR_{ENV_NAME}.pth', f'./agents/CRITIC_{ENV_NAME}.pth')
            break

    print('Training completed.')

    return scores

## Evaluate The Agent

In [None]:
# run training (up to 100 episodes of at most 500 steps) and keep the per-episode scores
scores = train_agent(num_episodes=100, max_time=500)

---