<a href="https://colab.research.google.com/github/oroojlooy/RL_pytorch/blob/master/continues_PG_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install --upgrade gym
# !pip install --upgrade torch

In [2]:
import argparse
import gym
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import time

# parameters

In [3]:
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')
parser.add_argument('--entropy_alpha', type=float, default=0.001, metavar='G',
                    help='entropy coefficient in the loss (default: 0.001)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='interval between training status logs (default: 10)')
parser.add_argument('--alg', type=str, default='reinforce', 
                    help='the algorithm to train the agent')
args, unknown = parser.parse_known_args()


# create an environment

In [4]:
# this environment has env.reset() and end.step() functions
env = gym.make('Pendulum-v0')
env.seed(args.seed)
torch.manual_seed(args.seed)


<torch._C.Generator at 0x7f559ff22730>

In [5]:
env.reset()
# env.render()
env.step([1.])

(array([0.97927093, 0.20255479, 0.8098795 ]), -0.05664824675073252, False, {})

# create actor network

In [6]:
class Actor(nn.Module):
    # this class defines a policy network with two layer NN
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1_mu = nn.Linear(3, 256)
        self.fc2_mu = nn.Linear(256, 1)

        self.fc1_sigma = nn.Linear(3, 256)
        self.fc2_sigma = nn.Linear(256, 1)

    def forward(self, x):
        ''' do the forward pass and return a probability over actions
        Input:
                x: state
        returns:
                prob: a probability distribution
        '''
        out = F.relu(self.fc1_mu(x))
        mu = self.fc2_mu(out)
        
        out = F.relu(self.fc1_sigma(x))
        out = self.fc2_sigma(out)
        sigma = F.softplus(out) + 1e-5
        return mu, sigma

# Critic network

In [7]:
class Critic(nn.Module):
    # this class defines a policy network with two layer NN
    def __init__(self, in_d=3):
        super(Critic, self).__init__()
        self.affine1 = nn.Linear(in_d, 256)
        self.affine2 = nn.Linear(256, 1)

    def forward(self, x):
        ''' do the forward pass and return a probability over actions
        Input:
                x: state
        returns:
                v: value of being at x
        '''
        
        x = F.relu(self.affine1(x))
        v = self.affine2(x).squeeze()
        return v

class QCritic(nn.Module):
    # this class defines a policy network with two layer NN
    def __init__(self, in_d=4):
        super(Critic, self).__init__()
        self.affine1 = nn.Linear(in_d, 30)
        self.affine2 = nn.Linear(30, 2)

    def forward(self, x):
        ''' do the forward pass and return a probability over actions
        Input:
                x: state
        returns:
                v: value of being at x
        '''
        
        x = F.relu(self.affine1(x))
        v = self.affine2(x).squeeze()
        return v


# rollout funtion

In [8]:
def select_action_manual(state, env):
    # this function selects stochastic actions based on the policy probabilities
    state = torch.from_numpy(state).float().unsqueeze(0)
    mu, sigma = actor(state)
    action = mu + torch.rand(state.size(0))*sigma
    action = torch.clip(action, env.action_space.low[0], env.action_space.high[0])
    entropy = 0.5*torch.log(2*np.pi*sigma) + 0.5
    log_prob = -((action - mu)**2/(2*(sigma)**2)) -torch.log(sigma) -np.log(2*np.pi)
    
    return action.item(), log_prob, entropy, mu, sigma


def select_action(state, env):
    # this function selects stochastic actions based on the policy probabilities
    state = torch.from_numpy(state).float().unsqueeze(0)
    mu, sigma = actor(state)
    dist = torch.distributions.normal.Normal(mu, sigma)
    action = dist.sample()
    action = torch.clip(action, env.action_space.low[0], env.action_space.high[0])
    entropy = dist.entropy()
    log_prob = dist.log_prob(action)
    
    return action.item(), log_prob, entropy, mu, sigma    

# mu, sigma = torch.tensor([0.]), torch.tensor([1.])
# dist = torch.distributions.normal.Normal(mu, sigma)
# a = dist.sample()
# print(a, dist.entropy(), 0.5*torch.log(2*np.pi*sigma) + 0.5, dist.log_prob(a), -((a - mu)**2/(2*(sigma)**2)) -torch.log(sigma) -0.5*np.log(2*np.pi))


In [9]:
# s=env.reset()
# actor = Actor()
# select_action(s)
# state = torch.from_numpy(s).float().unsqueeze(0)
# probs = actor(state)
# m = Categorical(probs)
# action = m.sample()
# log_prob = m.log_prob(action)
# print(probs, m, action, log_prob)
# # (tensor([[0.5007, 0.4993]], grad_fn=<SoftmaxBackward>), Categorical(), tensor([1]), tensor([-0.6946], grad_fn=<SqueezeBackward1>))
# np.log(0.4993)
# # -0.6945481614755734

# states, rewards, log_probs, entropies, next_states, mask, mus, sigmas = rollout()

In [None]:
# %debug

In [None]:
# states[0]
# rewards[0]
# log_probs[0]
# entropies[0]
# next_states[0]
# entropies
# log_probs
# sigmas
# torch.log(torch.stack(sigmas))
# sigmas

In [None]:
def rollout(env, render=False, pause=.2):
    states = []
    rewards = []
    log_probs = []
    entropies = []
    next_states = []
    mask = []
    mus = []
    sigmas = []
    # play an episode
    state = env.reset()
    while True:  # Don't infinite loop while learning
        # select an action
        action, log_prob, entropy, mu, sigma = select_action(state, env)
        states.append(list(state))
        log_probs.append(log_prob)
        entropies.append(entropy)
        mus.append(mu)
        sigmas.append(sigma)
        
        # take the action and move to next state
        next_state, reward, done, _ = env.step([action])
        rewards.append(reward)
        next_states.append(next_state)
        if render:
            env.render()
            time.sleep(pause)
        if done:
            mask.append(1)
            break
        mask.append(0)
        state = next_state
            
    return states, rewards, log_probs, entropies, next_states, mask, mus, sigmas

# train function

In [None]:
def reinforce_train_step(args, states, rewards, log_probs, entropies, critic):
    
    R = 0
    P = 0
    E = 0
    rewards_path = []
    log_probs_paths = []
    entropies_path = []
    for i in reversed(range(len(rewards))):
        R = rewards[i] + args.gamma * R
        rewards_path.insert(0, R) 
        
        P = log_probs[i] + P
        log_probs_paths.insert(0, P) 

        E = entropies[i] + args.gamma * E
        entropies_path.insert(0, E)

    rewards_path = torch.tensor(rewards_path, dtype=torch.float32)
    rewards_path = (rewards_path - rewards_path.mean()) / (rewards_path.std() + 1e-8)
    log_probs_paths = torch.stack(log_probs_paths)
    
#     print(rewards_path,log_probs_paths)
    value = critic(torch.tensor(states, dtype=torch.float32))

    # take a backward step for actor
    # This is based on Pytorch implementation of REINFORCE 
    actor_loss = -torch.mean(((rewards_path - value.detach()) * torch.stack(log_probs)) -args.entropy_alpha * torch.stack(entropies_path))
    # This is based on the formual which Levine obtains for REINFORCE algorithm
    # actor_loss = -torch.mean(((rewards_path - value.detach()) * log_probs_paths))
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # take a backward step for critic
    loss_fn = torch.nn.MSELoss()
    critic_loss = loss_fn(value, rewards_path)
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    return actor_loss, critic_loss


In [None]:
# Actor Critic 
def ac_train_step(args, states, rewards, log_probs, entropies, next_states, done, critic):
# states, rewards, log_probs, next_states = rollout()
# if 1 == 1:

    P = 0
    E = 0
    rewards_path = []
    log_probs_paths = []
    qvalue = critic(torch.tensor(states, dtype=torch.float32))
    next_qvalue = critic(torch.tensor(next_states, dtype=torch.float32))
    next_qvalue = next_qvalue.detach().cpu().numpy()
    target = []
    for i in range(len(rewards)):
        target.append(rewards[i] + args.gamma * next_qvalue[i]*(1-done[i]))
       
    target = torch.tensor(target)

    qvalue = (qvalue - torch.mean(qvalue))/(torch.std(qvalue) + 1e-8)
    # take a backward step for actor
    # This is based on the commonly used AC algorithm  
    actor_loss = -torch.mean(qvalue.detach() * torch.stack(log_probs) - args.entropy_alpha * torch.stack(entropies))
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # take a backward step for critic
    loss_fn = torch.nn.MSELoss()
    critic_loss = loss_fn(qvalue, torch.tensor(target, dtype=torch.float32))
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()


In [None]:
def rtg_reinforce_train_step(args, states, rewards_, log_probs, entropies, critic):
    
    R = 0
    G = 0
    E = 0
    log_probs_grads = []
    entropies_path = []
    rewards = []
#     print(rewards_path,log_probs_paths)
    value = critic(torch.tensor(states, dtype=torch.float32))
    for i in reversed(range(len(rewards_))):
        R = rewards_[i] + args.gamma * R
        rewards.insert(0, R)

        E = entropies[i] + args.gamma * E
        entropies_path.insert(0, E)
        
    rewards = torch.tensor(rewards, dtype=torch.float32)
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    value_detached = value.detach()
    for i in reversed(range(len(rewards_))):
        G += log_probs[i]*(rewards[i] - value_detached[i]) 
        log_probs_grads.insert(0, G) 

    log_probs_grads = torch.stack(log_probs_grads)
    
    # take a backward step for actor
    # This is based on Pytorch implementation of REINFORCE 
    actor_loss = -torch.mean(log_probs_grads - args.entropy_alpha * torch.stack(entropies_path))
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # take a backward step for critic
    loss_fn = torch.nn.MSELoss()
    critic_loss = loss_fn(value, rewards)
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    return actor_loss, critic_loss


In [None]:
# Advantage Actor Critic  (single worker A2C)
def aac_train_step(args, states, rewards, log_probs, next_states, done, critic):
# states, rewards, log_probs, next_states = rollout()
# if 1 == 1:
    P = 0
    rewards_path = []
    log_probs_paths = []
    value = critic(torch.tensor(states, dtype=torch.float32))
    next_value = critic(torch.tensor(next_states, dtype=torch.float32))
    next_value = next_value.detach().cpu().numpy()
    advantage = []
    target = []
    for i in range(len(rewards)):
        target.append(rewards[i] + args.gamma * next_value[i]*(1-done[i]))
        advantage.append(target[i] - value[i].detach().cpu().numpy())
        
        P = log_probs[i] + P
        log_probs_paths.insert(0, P) 

    advantage = torch.tensor(advantage)
    target = torch.tensor(target)
    cumulative_log_probs = torch.stack(log_probs_paths)
    
#     print(log_probs_paths)

    # take a backward step for actor
    # This is based on Pytorch implementation of AC 
    actor_loss = -torch.mean(advantage.detach() * torch.squeeze(torch.stack(log_probs)) - args.entropy_alpha * torch.stack(entropies))
    # This is based on the formual which Levine obtains for AC algorithm
    # actor_loss = -torch.mean(advantage.detach() * cumulative_log_probs)
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # take a backward step for critic
    loss_fn = torch.nn.MSELoss()
    critic_loss = loss_fn(value, torch.tensor(target, dtype=torch.float32))
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()


# run training

In [None]:
# create actor and critic network
# args.alg = 'reinforcertg'
args.alg= 'reinforce'
if args.alg == 'ac_q':
  critic = QCritic()
else:
  critic = Critic()

args.entropy_alpha = 7e-4
actor = Actor()

# create optimizers
actor_optim = optim.Adam(actor.parameters(), lr=1e-3)
critic_optim = optim.Adam(critic.parameters(), lr=1e-3)
args.log_interval = 200

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print(args)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cpu



In [None]:
running_reward = 0
for i_episode in range(200000):
    states, rewards, log_probs, entropies, next_states, done, mus, sigmas = rollout(env)
    running_reward = 0.9*running_reward + 0.1*np.sum(rewards)
    if args.alg == 'reinforce':
        actor_loss, critic_loss = reinforce_train_step(args, states, rewards, log_probs, entropies, critic)
    elif args.alg == 'ac':
        actor_loss, critic_loss = ac_train_step(args, states, rewards, log_probs, entropies, next_states, done, critic)
    elif args.alg == 'reinforcertg':
        actor_loss, critic_loss = rtg_reinforce_train_step(args, states, rewards, log_probs, entropies, critic)
    elif args.alg == 'aac':
        actor_loss, critic_loss = aac_train_step(args, states, rewards, log_probs, next_states, done, critic)
    if i_episode % args.log_interval == 0:
        print('Episode={} \tAverage reward: {:.2f} \tActor-loss'
        '={:.2f}, \tCritic-loss={:.2f}, \tentropy={:.2f}, \tlog-prob={:.2f}, \tmu={:.2f}, \tsigma={:.4f}'.format(
            i_episode, running_reward, actor_loss, critic_loss, torch.mean(
                torch.stack(entropies)).item(), torch.mean(torch.stack(
                    log_probs)).item(), torch.mean(torch.stack(mus)
                    ).item(), torch.mean(torch.stack(sigmas)).item()))


Episode=0 	Average reward: -148.62 	Actor-loss=43.92, 	Critic-loss=1.07, 	entropy=1.01, 	log-prob=-1.20, 	mu=1.06, 	sigma=0.4537
Episode=200 	Average reward: -1358.72 	Actor-loss=-55.87, 	Critic-loss=0.94, 	entropy=0.75, 	log-prob=-0.67, 	mu=0.91, 	sigma=0.2995
Episode=400 	Average reward: -1394.82 	Actor-loss=209.75, 	Critic-loss=1.13, 	entropy=1.35, 	log-prob=-1.87, 	mu=0.29, 	sigma=0.8788
Episode=600 	Average reward: -1655.09 	Actor-loss=91.59, 	Critic-loss=0.69, 	entropy=1.32, 	log-prob=-1.79, 	mu=0.07, 	sigma=0.8956
Episode=800 	Average reward: -1499.29 	Actor-loss=-44.09, 	Critic-loss=0.94, 	entropy=1.25, 	log-prob=-1.67, 	mu=-0.48, 	sigma=0.7592
Episode=1000 	Average reward: -1498.38 	Actor-loss=-1636.89, 	Critic-loss=0.98, 	entropy=1.95, 	log-prob=-47.78, 	mu=-20.31, 	sigma=3.1392
Episode=1200 	Average reward: -1511.05 	Actor-loss=-12794.98, 	Critic-loss=0.98, 	entropy=1.87, 	log-prob=-2123.42, 	mu=-46.67, 	sigma=3.4451
Episode=1400 	Average reward: -1243.74 	Actor-loss=2601961

KeyboardInterrupt: ignored

In [None]:
running_reward = 10
for i_episode in range(1000):
    states, rewards, log_probs = rollout()
    t = len(rewards)
    running_reward = running_reward * 0.9 +  t * 0.1
    ac_train_step(states, rewards, log_probs)
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))
    if running_reward > env.spec.reward_threshold:
        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, t))
        break

In [None]:
# for var_name in actor_optim.state_dict():
#     print(var_name, "\t", actor_optim.state_dict()[var_name])

d=actor_optim.state_dict()
# d
optimizer_state= actor_optim.state # holds all the information about the gradients, square of gradients and past gradients used in Adam 

te=d['state'][140575074157424]['exp_avg']
te.shape

In [None]:
c=torch.nn.Conv1d(10,12,3)
c=torch.nn.Conv2d(10,12,3)


In [None]:
for i in c.parameters():
#   print (i['weight'])
#   print (dir(i))
  print (i.shape)
  print (i.numel())
#   print (i.count())

# render optimal policy


In [None]:
a = rollout(True,pause=.05)

In [None]:
env.close()

In [None]:
z=[5,7,9,12]
for b in [1,2,10,30]:
  print ("------",b)
  s=sum([np.exp(i*b) for i in z])
  for i in z:
    print (np.exp(i*b)/s)

In [None]:
import torch

batch_size = 5
nb_digits = 10
# Dummy input that HAS to be 2D for the scatter (you can use view(-1,1) if needed)
y = torch.LongTensor(batch_size,1).random_() % nb_digits
# One hot encoding buffer that you create out of the loop and just keep reusing
y_onehot = torch.FloatTensor(batch_size, nb_digits)
# In your for loop
y_onehot.zero_()


In [None]:
y_onehot.scatter_(1, y, 1)

print(y)
print(y_onehot)

In [None]:
d=torch.chunk(y_onehot,10,1)
d[0]