In [1]:
import torch
from torch import nn
from torch import optim
# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU when neither accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

import gym

from ActorCritic import TDActorCritic

Using cpu device


In [2]:
class Network(nn.Module):
    """Actor-critic network with a shared trunk and two output heads.

    The trunk is two 128-unit linear layers, each followed by ReLU. The actor
    head maps the trunk features to ``output_dim`` values normalised with a
    softmax over the last dimension; the critic head maps them to one scalar.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of values produced by the actor head.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Feature extractor shared by both heads.
        self.shareNet = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
        )
        self._actor = nn.Sequential(
            nn.Linear(128, output_dim),
            # Explicit dim: the implicit choice is deprecated, emits a warning
            # on every call, and picks dim 0 for 3-D inputs, which would
            # normalise across the wrong axis. dim=-1 always normalises over
            # the actor outputs, for unbatched and batched inputs alike.
            nn.Softmax(dim=-1),
        )
        self._critic = nn.Sequential(
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Run the shared trunk, then both heads.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tuple ``(act, cri)``: ``act`` has last dimension ``output_dim``
            and sums to 1 along it; ``cri`` has last dimension 1.
        """
        x = self.shareNet(x)
        act = self._actor(x)
        cri = self._critic(x)
        return act, cri


In [3]:
# Shared actor-critic network: 4 input features, 2 actor outputs.
model = Network(input_dim=4, output_dim=2)

# Adam over every parameter of the network (trunk and both heads).
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Mean-squared-error criterion handed to the agent as the critic loss.
critic_loss_fn = nn.MSELoss()

In [4]:
# Create the CartPole-v1 environment, requesting the 'rgb_array' render mode.
env = gym.make(
    'CartPole-v1',
    render_mode='rgb_array',
)

In [5]:
# Wire the environment, network, critic loss and optimizer into the agent.
agent = TDActorCritic(
    env,
    model,
    critic_loss_fn,
    optimizer,
    discount_factor=0.99,
    batch_size=32,
    action_space_type='discrete',
)

In [9]:
# Run training. The progress log ("Epi [n/1024]") suggests the first argument
# is the total number of episodes. The meaning of 500 is not visible here --
# presumably a per-episode step cap; TODO confirm against TDActorCritic.train.
agent.train(1024, 500)

Batch 1 (Epi [32/1024]): avg steps: 147.53125 avg reward: 147.53125
Batch 2 (Epi [64/1024]): avg steps: 128.75 avg reward: 128.75
Batch 3 (Epi [96/1024]): avg steps: 228.1875 avg reward: 228.1875
Batch 4 (Epi [128/1024]): avg steps: 234.6875 avg reward: 234.6875
Batch 5 (Epi [160/1024]): avg steps: 226.9375 avg reward: 226.9375
Batch 6 (Epi [192/1024]): avg steps: 157.75 avg reward: 157.75
Batch 7 (Epi [224/1024]): avg steps: 160.71875 avg reward: 160.71875
Batch 8 (Epi [256/1024]): avg steps: 187.25 avg reward: 187.25
Batch 9 (Epi [288/1024]): avg steps: 198.5625 avg reward: 198.5625
Batch 10 (Epi [320/1024]): avg steps: 206.15625 avg reward: 206.15625
Batch 11 (Epi [352/1024]): avg steps: 188.65625 avg reward: 188.65625
Batch 12 (Epi [384/1024]): avg steps: 151.53125 avg reward: 151.53125
Batch 13 (Epi [416/1024]): avg steps: 158.6875 avg reward: 158.6875
Batch 14 (Epi [448/1024]): avg steps: 193.90625 avg reward: 193.90625
Batch 15 (Epi [480/1024]): avg steps: 201.625 avg reward: 20

In [None]:
import numpy as np

# Sanity check: one forward pass on the first observation of a fresh episode.
# reset() returns an indexable result; element 0 is used as the observation.
state = env.reset()[0]

# Inference only, so skip autograd bookkeeping. The explicit float32 cast
# keeps the input dtype aligned with nn.Linear's default parameter dtype even
# if the environment hands back float64 observations.
with torch.no_grad():
    actor, critic = model(torch.as_tensor(state, dtype=torch.float32))

print(actor)
print(critic)