# Train Pendulum problem(for continuous output) in OpenAI Env using Proximal Policy Optimization method

In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# for CUDA
# use_cuda = torch.cuda.is_available()
use_cuda = False
device   = torch.device("cuda" if use_cuda else "cpu")

In [11]:
env = gym.make("Pendulum-v0")

epsilon = 1.0
epsilonMin = 0.01
epsilonDecay = 0.999
episodes = 1000
batch_size = 32
gamma = 0.99
goal_steps = 200
input_shape = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
# print(num_actions)
buffer_capacity = 1000
batch_siz = 32
epochs = 10
clip_param = 0.2

In [4]:
# Memory to save the experiences 
class Buffer(object):
    def __init__(self):
        self.buffer = []
        self.buffer_capacity = 1000
        self.batch = 32
    
    def add(self, params):
        self.buffer.append(params)
        
    def reinit(self):
        self.buffer = []
        
    def length(self):
        return len(self.buffer)

In [5]:
# Network for Actor and Critic
class Actor(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(Actor, self).__init__()
        self.fc = nn.Linear(input_shape, 256)
        self.mu_head = nn.Linear(256,num_actions)
        self.log_std = nn.Parameter(torch.zeros(1))
        
    def forward(self, x):
        x = F.relu(self.fc(x))
        mu = self.mu_head(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist

class Critic(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(input_shape, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        value = self.fc2(x)
        return value

In [6]:
# Update method for both policy and value
def update():
    mem = memory.buffer
    s = torch.FloatTensor([m[0].numpy() for m in mem])
    a = torch.FloatTensor([m[1] for m in mem]) 
    old_log_a = torch.FloatTensor([m[2] for m in mem])
    r = torch.FloatTensor([m[3] for m in mem])
    s_ = torch.FloatTensor([m[4] for m in mem])
#     print(type(critic(s)))
    with torch.no_grad():
        target = r + gamma * critic(s_).squeeze(1)
    adv = target - critic(s).squeeze(1)
    
    for _ in range(epochs):
        for id in BatchSampler(SubsetRandomSampler(range(buffer_capacity)), batch_size, False):
            dist = actor(s[id])
            new_log_a = dist.log_prob(a[id].unsqueeze(1))
            ratio = torch.exp(new_log_a.squeeze(1) - old_log_a[id])
            
            loss1 = ratio * adv[id]
            loss2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv[id]
            action_loss = -torch.min(loss1, loss2).mean()
#             print(type(action_loss))
            opt_a.zero_grad()
            action_loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(actor.parameters(), 0.5)
            opt_a.step()

            value_loss = (critic(s[id]).squeeze(1) - target[id]).pow(2).mean()
#             print(type(value_loss))
            opt_c.zero_grad()
            value_loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(critic.parameters(), 0.5)
            opt_c.step()
    
    memory.reinit()


In [7]:
actor = Actor(input_shape, num_actions)
critic = Critic(input_shape, num_actions)

opt_a = optim.Adam(actor.parameters(), lr=1e-4)
opt_c = optim.Adam(critic.parameters(), lr=3e-4)

In [9]:
memory = Buffer()
state = env.reset()
for idx in range(episodes):
    state = env.reset()
    score = 0
    for _ in range(goal_steps):
        state = torch.FloatTensor(state)
#         print(state)
        value = critic(state)
        dist = actor(state)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        # action = action.clamp(-2, 2)
#         print(action)
        next_state, reward, done, _ = env.step(action.numpy())
        score += reward
        memory.add([state, action, log_prob, reward, next_state])
        state = next_state
        
        if memory.length() >= buffer_capacity:
            # print(memory.buffer[0])
            update()
    print("Episode = " + str(idx) + ", Score = " + str(score))

Episode = 0, Score = -1429.66693877
Episode = 1, Score = -1219.3795983
Episode = 2, Score = -1330.43684385
Episode = 3, Score = -1819.15302243
Episode = 4, Score = -1113.82928015
Episode = 5, Score = -1504.99575381
Episode = 6, Score = -1398.50632921
Episode = 7, Score = -1613.11907834
Episode = 8, Score = -1630.30959298
Episode = 9, Score = -1398.03450788
Episode = 10, Score = -1335.17190667
Episode = 11, Score = -1456.57094385
Episode = 12, Score = -1342.14394014
Episode = 13, Score = -1347.1808573
Episode = 14, Score = -1470.55966507
Episode = 15, Score = -1750.80594961
Episode = 16, Score = -1531.29486813
Episode = 17, Score = -1483.41032081
Episode = 18, Score = -1836.43499556
Episode = 19, Score = -1779.41154329
Episode = 20, Score = -1756.26482404
Episode = 21, Score = -1580.96714747
Episode = 22, Score = -1604.50739394
Episode = 23, Score = -1336.15292053


KeyboardInterrupt: 

In [13]:
for idx in range(5):
    state = env.reset()
    score = 0
    for _ in range(goal_steps):
        env.render()
        state = torch.FloatTensor(state)
        dist = actor(state)
        action = dist.sample()
        next_state, reward, done, _ = env.step(action.numpy())
        state = next_state

NotImplementedError: abstract