In [53]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Categorical
from torch.optim import Adam
import tensorboard
from dataclasses import dataclass
# from stable_baselines3.common.buffers
import gymnasium as gym
import time

In [62]:
@dataclass
class Args:
    env_id = 'CartPole-v1'
    seed = 0
    num_vec_envs = 2
    device = 'cude' if torch.cuda.is_available() else 'cpu'
    gamma = 0.9
    lr = 1e-4
    critic_coeff = 0.5
    entropy_coeff = 0.01

    total_steps = 10000
    steps_per_episode = 100
    num_episodes = total_steps // steps_per_episode
    
    writer = True
    log_dir = 'runs'

args = Args()

In [65]:
class Policy(nn.Module):
    def __init__(self, envs):
        super().__init__()
        
        self.actor_linear1 = nn.Linear(envs.single_observation_space.shape[0], 32)
        self.actor_linear2 = nn.Linear(32, int(envs.single_action_space.n))

        self.critic_linear1 = nn.Linear(envs.single_observation_space.shape[0], 32)
        self.critic_linear2 = nn.Linear(32, 1)

    def actor(self, x):
        x = F.relu( self.actor_linear1(x) )
        x = F.softmax( self.actor_linear2(x), dim=-1 )
        return x

    def critic(self, x):
        x = F.relu( self.critic_linear1(x) )
        x = self.critic_linear2(x)
        return x

    def train_actor(self, x):
        probs = self.actor(x)
        cat = Categorical(probs)
        actions = cat.sample()
        log_probs = cat.log_prob(actions)
        entropy = cat.entropy()
        return actions, log_probs, entropy


def make_env(env_id, seed):
    def _make():
        env = gym.make(env_id)
        # env.seed(seed + index)
        return env
    return _make


def make_vec_envs(num_vecs):
    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i) for i in range(num_vecs)]
    )
    return envs


def calc_return(rewards, final_value, gamma):
    rewards.append(final_value)
    for i in reversed(range(len(rewards) - 1)):
        # print('return at state ', i)
        # print(rewards[i], " + ", rewards[i+1], " * ", gamma)
        rewards[i] += rewards[i + 1] * gamma
    rewards.pop()
    return np.stack(rewards)


def train(args, policy, optimizer):
    if args.writer:
        writer = SummaryWriter(args.log_dir)
        
    for ep in range(args.num_episodes):
        obs, _ = envs.reset()
        values, log_probs, rewards, entropys = [], [], [], []
        for step in range(args.steps_per_episode):
            obs = torch.tensor(np.asarray(obs), device=args.device)
            actions, log_prob, entropy = policy.train_actor(obs)
            value = policy.critic(obs)
            
            next_obs, reward, done, truncated, info = envs.step(actions.numpy())
            
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropys.append(entropy)

            # restart envs if they finished
            obs = next_obs
            if done.any() or truncated.any():
                obs, _ = envs.reset_done()

        #get value for the final step
        obs = torch.tensor(np.asarray(obs), device=args.device)
        with torch.no_grad():
            final_value = policy.critic(obs)

        #calc episodic return
        ep_return = calc_return(rewards, final_value.squeeze().numpy(), args.gamma)
        
        if arge.writer:
            writer.add_scalar('episodic return', ep_return, ep)
            
        #make tensors
        ep_return = torch.as_tensor(ep_return, device=args.device)
        log_probs = torch.stack(log_probs)
        values = torch.stack(values).squeeze()
        entropys = torch.stack(entropys)
        # print(type(entropy), entropy.shape)

        #calc advantage
        advantages = ep_return - values

        #calc actor, critic and total losses
        critic_loss = advantages.pow(2).mean()
        actor_loss = - (advantages * log_probs).mean()
        entropy_loss = entropys.mean()
        loss = (actor_loss + args.critic_coeff * critic_loss - args.entropy_coeff * entropy_loss)

        #train step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


envs = make_vec_envs(args.num_vec_envs)

policy = Policy(envs)
optimizer = Adam(policy.parameters(), lr=args.lr)
train(args, policy, optimizer)
torch.save(policy.state_dict(), 'model_weights')

AttributeError: 'SyncVectorEnv' object has no attribute 'reset_done'