In [1]:
from torch import nn
from torch.distributions import Categorical
import torch
import torch.optim as optim
import gym
import numpy as np

# Discount factor: weight applied to the future return when accumulating
# per-step returns during training.
gamma = 0.99

class Pi(nn.Module):
  """Policy network: a one-hidden-layer MLP mapping a state to action logits.

  The instance also serves as the episode buffer. `log_probs` collects the
  log-probability of every action sampled through `act`, and `rewards` is a
  list the caller appends to. Both are emptied by `onpolicy_reset`.
  """

  def __init__(self, in_dim, out_dim):
    """Build the in_dim -> 64 -> out_dim network and start with empty buffers."""
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(in_dim, 64),
        nn.ReLU(),
        nn.Linear(64, out_dim),
    )
    self.onpolicy_reset()
    self.train()  # switch the module to training mode

  def onpolicy_reset(self):
    """Discard the stored log-probabilities and rewards."""
    self.log_probs, self.rewards = [], []

  def forward(self, x):
    """Return the raw network output for `x` (used as logits by `act`)."""
    return self.model(x)

  def act(self, state):
    """Sample an action for `state` and remember its log-probability.

    `state` must provide `.astype` (a NumPy array); it is cast to float32
    before being turned into a tensor. The network output parameterizes a
    Categorical distribution via its logits, one action is drawn from it, and
    the log-probability of that action is appended to `self.log_probs`.

    Returns the sampled action as a plain Python number.
    """
    obs = torch.from_numpy(state.astype(np.float32))
    policy = Categorical(logits=self.forward(obs))
    chosen = policy.sample()
    self.log_probs.append(policy.log_prob(chosen))
    return chosen.item()

In [2]:
def train(pi, optimizer):
  """Perform one REINFORCE update from the episode stored on `pi`.

  Reads `pi.rewards` and `pi.log_probs`, computes the discounted return of
  every time step with the module-level `gamma`, and takes a single optimizer
  step on the loss `sum_t(-log_prob_t * return_t)`.

  Returns the loss tensor of this update.
  """
  n_steps = len(pi.rewards)
  returns = np.empty(n_steps, dtype=np.float32)

  # Walk the episode backwards so each return reuses the one after it:
  #   R_t = r_t + gamma * R_{t+1}
  running = 0.0
  for step in range(n_steps - 1, -1, -1):
    running = pi.rewards[step] + gamma * running
    returns[step] = running

  # Move both quantities into PyTorch
  return_tensor = torch.tensor(returns)
  logp_tensor = torch.stack(pi.log_probs)

  # Negated so that minimizing the loss maximizes the return-weighted log-probabilities
  loss = (-logp_tensor * return_tensor).sum()

  optimizer.zero_grad()  # drop gradients left over from the previous update
  loss.backward()        # backpropagate through the stored log-probabilities
  optimizer.step()       # apply the parameter update
  return loss

In [3]:
def main(render_mode=None):
  """Train a REINFORCE policy on CartPole-v0 for 300 episodes.

  Each episode runs for at most 200 steps, then the policy is updated once
  with `train` and the episode buffers on the policy are cleared. One summary
  line is printed per episode.

  Args:
    render_mode: Optional render mode forwarded to `gym.make` (for example
      'human'). With the default `None` no render mode is requested.
  """
  # Rendering is selected when the environment is built rather than by calling
  # env.render() every step.
  env_kwargs = {} if render_mode is None else {'render_mode': render_mode}
  env = gym.make('CartPole-v0', **env_kwargs)
  try:
    in_dim = env.observation_space.shape[0]  # 4
    out_dim = env.action_space.n             # 2
    pi = Pi(in_dim, out_dim)                 # policy pi_theta for REINFORCE
    optimizer = optim.Adam(pi.parameters(), lr=0.01)

    # Episode loop
    for epi in range(300):
      state, _ = env.reset()
      # Time-step loop
      for _ in range(200):
        action = pi.act(state)  # sample an action from the current policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        pi.rewards.append(reward)
        state = next_state
        if terminated or truncated:
          break

      loss = train(pi, optimizer)  # one policy update per episode
      total_reward = sum(pi.rewards)
      solved = total_reward > 195.0
      pi.onpolicy_reset()  # on-policy: collected data is discarded after the update

      # The spacing inside the message is kept exactly as previously printed.
      print(f'Episode {epi}, loss: {loss}, '
            f'    total_reward: {total_reward}, solved: {solved}')
  finally:
    # Release the environment even if training is interrupted or raises.
    env.close()
    
# Start training when this code is executed directly (not when imported).
if __name__ == '__main__':
  main()

  logger.warn(
  gym.logger.warn(


Episode 0, loss: 42.46593475341797,     total_reward: 11.0, solved: False
Episode 1, loss: 150.3479766845703,     total_reward: 21.0, solved: False
Episode 2, loss: 189.4750518798828,     total_reward: 24.0, solved: False
Episode 3, loss: 323.60150146484375,     total_reward: 31.0, solved: False
Episode 4, loss: 119.0931625366211,     total_reward: 18.0, solved: False
Episode 5, loss: 41.036399841308594,     total_reward: 11.0, solved: False
Episode 6, loss: 85.41355895996094,     total_reward: 15.0, solved: False
Episode 7, loss: 652.3286743164062,     total_reward: 47.0, solved: False
Episode 8, loss: 2879.379150390625,     total_reward: 109.0, solved: False
Episode 9, loss: 160.1590576171875,     total_reward: 22.0, solved: False
Episode 10, loss: 1074.5362548828125,     total_reward: 62.0, solved: False
Episode 11, loss: 114.58767700195312,     total_reward: 18.0, solved: False
Episode 12, loss: 70.17330932617188,     total_reward: 13.0, solved: False
Episode 13, loss: 182.46235656