In [1]:
import torch, numpy as np
import gymnasium as gym

In [2]:
env = gym.make('CartPole-v1')

In [3]:
action_number = env.action_space.n

In [4]:
state_size = env.observation_space.shape[0]

In [5]:
gamma = 0.99
lr = 5e-5
hid_layer1 = 64
hid_layer2 = 128
batch_size = 32
q_target_update_frequency = 1000

epsiolon_start = 1.0
epsilon_end = 0.02
epsilon_decay = 10000

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
Q = torch.nn.Sequential(
    torch.nn.Linear(state_size, hid_layer1),
    torch.nn.ReLU(),
    torch.nn.Linear(hid_layer1, hid_layer2),
    torch.nn.ReLU(),
    torch.nn.Linear(hid_layer2, action_number)
).to(device)

Q_target = torch.nn.Sequential(
    torch.nn.Linear(state_size, hid_layer1),
    torch.nn.ReLU(),
    torch.nn.Linear(hid_layer1, hid_layer2),
    torch.nn.ReLU(),
    torch.nn.Linear(hid_layer2, action_number)
).to(device)

In [8]:
optim = torch.optim.Adam(Q.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=100, verbose=True)

In [519]:

# class ReplayBuffer:
#     def __init__(self, capacity, min_replay_size):
#         self.capacity = capacity
#         self.min_replay_size = min_replay_size
#         self.buffer = [None for i in range(capacity)]
#         self.position = 0

#     def push(self, state, action, reward, next_state, done):
#         self.buffer[self.position] = (state, action, reward, next_state, done)
#         self.position = (self.position + 1) % self.capacity

#     def sample(self, batch_size):
#         # filter out the None
#         return random.sample([i for i in self.buffer if i is not None], batch_size)

#     def __len__(self):
#         return len(self.buffer)
    
#     def is_ready(self):
#         return len([i for i in self.buffer if i is not None]) >= self.min_replay_size
        

In [9]:
from collections import deque
import random

class ReplayBuffer:
    def __init__(self, capacity, min_replay_size):
        self.capacity = capacity
        self.min_replay_size = min_replay_size
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
    
    def is_ready(self):
        return len(self.buffer) >= self.min_replay_size

In [10]:
replay_buffer = ReplayBuffer(50000, 1000)

In [11]:
def step(state, randomness):
    risk = random.random() <= randomness

    q = Q(torch.tensor(state, dtype=torch.float).to(device)).to(device)

    action = random.choice([possible for possible in range(action_number)]) if risk \
        else torch.argmax(q).item()

    new_state, r, done, _, _ = env.step(action)

    replay_buffer.push(state, action, r, new_state, done)

    if not replay_buffer.is_ready():
        return done, None

    batch = replay_buffer.sample(batch_size)

    states = torch.tensor([i[0] for i in batch], dtype=torch.float).to(device)
    actions = torch.tensor([i[1] for i in batch], dtype=torch.long).to(device)
    rewards = torch.tensor([i[2] for i in batch], dtype=torch.float).to(device)
    new_states = torch.tensor([i[3] for i in batch], dtype=torch.float).to(device)
    dones = torch.tensor([i[4] for i in batch], dtype=torch.int).to(device)

    q = Q(states).to(device)
    q_target = Q_target(new_states).to(device)

    y = rewards + (1 - dones)*(gamma*torch.max(q_target, dim=1, keepdim=True)[0])

    q_actions = torch.gather(input=q, dim=1, index=actions.unsqueeze(-1))

    loss = torch.nn.functional.smooth_l1_loss(q_actions, y)

    optim.zero_grad()
    loss.backward()
    optim.step()
    # scheduler.step(loss)

    return done, loss.item()  

In [12]:
def init_population():
    state = env.reset()[0]
    while not replay_buffer.is_ready():
        done, _ = step(state, 1.0)
        if done:
            state = env.reset()[0]

In [13]:
def run_episode(curr_steps):
    episode_steps = 0
    episode_avg_loss = 0

    state = env.reset()[0]

    done = False

    while not done:
        step_randomness = np.interp(curr_steps + episode_steps, [0, epsilon_decay], [epsiolon_start, epsilon_end])
        done, loss = step(state, step_randomness)
        episode_steps += 1

        if (curr_steps + episode_steps) % q_target_update_frequency == 0:
            Q_target.load_state_dict(Q.state_dict())

    episode_avg_loss = (episode_avg_loss * episode_steps + loss) / (episode_steps + 1)

    return episode_steps, episode_avg_loss

In [18]:
torch.autograd.set_detect_anomaly(False)

steps = 0
episodes = 0
avg_score = 0

# run the agent, and save the model on process exit
try:
    init_population()
    while True:
        episode_steps, episode_avg_loss = run_episode(steps)

        steps += episode_steps
        episodes += 1
        avg_score = (avg_score * episodes + episode_steps) / (episodes + 1)

        print(f'Episode: {episodes:04d}. Loss: {episode_avg_loss:.5f}. Score: {episode_steps:03d}. Average score: {avg_score:.2f}.')

except KeyboardInterrupt:
    pass
finally:
    torch.save(Q.state_dict(), 'cartpole.dqn.pth')
    print('Saved model.')

  loss = torch.nn.functional.smooth_l1_loss(q_actions, y)


Episode: 0001. Loss: 0.49142. Score: 021. Average score: 10.50.
Episode: 0002. Loss: 0.23090. Score: 052. Average score: 24.33.
Episode: 0003. Loss: 0.76201. Score: 013. Average score: 21.50.
Episode: 0004. Loss: 0.23780. Score: 036. Average score: 24.40.
Episode: 0005. Loss: 0.35064. Score: 015. Average score: 22.83.
Episode: 0006. Loss: 0.36259. Score: 012. Average score: 21.29.
Episode: 0007. Loss: 0.71516. Score: 011. Average score: 20.00.
Episode: 0008. Loss: 0.25598. Score: 015. Average score: 19.44.
Episode: 0009. Loss: 0.48041. Score: 027. Average score: 20.20.
Episode: 0010. Loss: 0.34748. Score: 020. Average score: 20.18.
Episode: 0011. Loss: 0.39721. Score: 012. Average score: 19.50.
Episode: 0012. Loss: 0.46540. Score: 016. Average score: 19.23.
Episode: 0013. Loss: 0.45667. Score: 014. Average score: 18.86.
Episode: 0014. Loss: 0.60214. Score: 012. Average score: 18.40.
Episode: 0015. Loss: 0.26739. Score: 032. Average score: 19.25.
Episode: 0016. Loss: 0.06499. Score: 055