In [2]:
from torch.distributions import Categorical
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Discount factor applied to the accumulated future return at each timestep
# when train() computes the per-step returns of an episode.
gamma = 0.99

In [4]:
class Pi(nn.Module):
  """Policy network: a single-hidden-layer MLP whose outputs are used as action logits.

  Besides the network itself, the instance stores the on-policy data of the
  current episode:

  * ``log_probs`` -- log-probabilities of the sampled actions (appended by ``act``).
  * ``rewards``   -- initialised empty here; this class never appends to it.
  """

  def __init__(self, in_dim, out_dim) -> None:
    """Build the in_dim -> 64 -> out_dim network and start in training mode."""
    super().__init__()
    # Layers are handed to Sequential as positional arguments, so the
    # sub-modules are registered under the indices 0, 1 and 2.
    self.model = nn.Sequential(
        nn.Linear(in_dim, 64),
        nn.ReLU(),
        nn.Linear(64, out_dim),
    )
    self.onpolicy_reset()
    self.train()

  def forward(self, x):
    """Return the raw network output for the input tensor ``x``."""
    return self.model(x)

  def onpolicy_reset(self):
    """Discard the stored episode data by rebinding both buffers to new lists."""
    self.log_probs = []
    self.rewards = []

  def act(self, state):
    """Sample an action for ``state`` and remember its log-probability.

    ``state`` must provide ``astype`` (e.g. a NumPy array); it is cast to
    float32 before being converted to a tensor. The sampled action is
    returned as a plain Python number via ``.item()``.
    """
    observation = torch.from_numpy(state.astype(np.float32))
    logits = self.forward(observation)
    action_dist = Categorical(logits=logits)
    sampled = action_dist.sample()
    # Keep the log-probability (still attached to the graph) for the update.
    self.log_probs.append(action_dist.log_prob(sampled))
    return sampled.item()

def train(pi, optimizer, discount=None):
  """Perform one policy-gradient update from the episode stored on ``pi``.

  Args:
    pi: object exposing ``rewards`` (sequence of per-step rewards) and
      ``log_probs`` (sequence of 0-dim tensors, one per step) of equal length.
    optimizer: optimizer whose ``zero_grad``/``step`` are invoked around the
      backward pass.
    discount: discount factor used when accumulating returns. When ``None``
      (the default) the module-level ``gamma`` is used.

  Returns:
    The scalar loss tensor ``sum(-log_prob_t * return_t)``.

  Raises:
    ValueError: if ``pi.rewards`` and ``pi.log_probs`` differ in length.
      Without this check a length-1 sequence would broadcast silently and
      produce a wrong loss.
  """
  if discount is None:
    discount = gamma
  T = len(pi.rewards) # The max timestep for the given episode
  if len(pi.log_probs) != T:
    raise ValueError(
        f"rewards ({T}) and log_probs ({len(pi.log_probs)}) must have the same length"
    )
  returns = np.empty(T, dtype=np.float32) # The array of returns
  future_return = 0.0
  # Walk the trajectory backwards so that the return at step t can reuse
  # the already-computed return of step t + 1.
  for t in reversed(range(T)):
    future_return = pi.rewards[t] + discount * future_return
    returns[t] = future_return
  returns = torch.tensor(returns)
  log_probs = torch.stack(pi.log_probs)
  # Negated because the optimizer minimises the loss.
  loss = torch.sum(-log_probs * returns)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  return loss

def main(num_episodes=300, max_steps=200, lr=0.01, render=True):
  """Train a ``Pi`` policy on CartPole-v0, one update per episode.

  Args:
    num_episodes: number of episodes to run.
    max_steps: upper bound on the number of steps taken in one episode.
    lr: learning rate passed to the Adam optimizer.
    render: when true, ``env.render()`` is called after every step.

  For each episode a line with the loss, the summed reward and whether that
  sum exceeded 195.0 is printed. The environment is always closed on exit,
  including when an exception interrupts training.
  """
  env = gym.make("CartPole-v0")
  try:
    in_dim = env.observation_space.shape[0]
    out_dim = env.action_space.n
    pi = Pi(in_dim, out_dim)
    optimizer = optim.Adam(pi.parameters(), lr=lr)
    for episode in range(num_episodes):
      reset_out = env.reset()
      # Newer gym releases return (observation, info) from reset();
      # older ones return the observation alone.
      state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
      for _ in range(max_steps):
        action = pi.act(state)
        step_out = env.step(action)
        if len(step_out) == 5:
          # Newer API: (obs, reward, terminated, truncated, info).
          state, reward, terminated, truncated, _info = step_out
          done = terminated or truncated
        else:
          # Older API: (obs, reward, done, info).
          state, reward, done, _info = step_out
        pi.rewards.append(reward)
        if render:
          env.render()
        if done:
          break
      loss = train(pi, optimizer)
      total_reward = sum(pi.rewards)
      solved = total_reward > 195.0
      # Clear the episode buffers only after the update has consumed them.
      pi.onpolicy_reset()
      print(f"Episode {episode}, loss: {loss}, total_reward: {total_reward}, solved: {solved}")
  finally:
    # Release the environment (and any render window) even on failure.
    env.close()

# Start training only when this code runs as the main module, not on import.
if __name__ == "__main__":
  main()






If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


Episode 0, loss: 375.53045654296875, total_reward: 34.0, solved: False
Episode 1, loss: 1029.369140625, total_reward: 60.0, solved: False
Episode 2, loss: 50.84876251220703, total_reward: 11.0, solved: False
Episode 3, loss: 237.23043823242188, total_reward: 27.0, solved: False
Episode 4, loss: 185.2281494140625, total_reward: 23.0, solved: False
Episode 5, loss: 595.0177001953125, total_reward: 45.0, solved: False
Episode 6, loss: 85.96440887451172, total_reward: 15.0, solved: False
Episode 7, loss: 72.83362579345703, total_reward: 14.0, solved: False
Episode 8, loss: 531.07568359375, total_reward: 43.0, solved: False
Episode 9, loss: 333.6488342285156, total_reward: 33.0, solved: False
Episode 10, loss: 3038.112060546875, total_reward: 114.0, solved: False
Episode 11, loss: 105.67044830322266, total_reward: 18.0, solved: False
Episode 12, loss: 136.82830810546875, total_reward: 18.0, solved: False
Episode 13, loss: 141.8507843017578, total_reward: 21.0, solved: False
Episode 14, loss