In [1]:
import os
import random
import gymnasium as gym
import numpy as np
from collections import deque
import torch
from torch import nn as nn
#import torch.nn as nn OR from torch import nn as nn
#import torch.optim as optim  OR from torch import optim as optim
env = gym.make('CartPole-v1',render_mode="human") # environment info
state_size = env.observation_space.shape[0]

#state_size = 4
action_size = env.action_space.n
#action_size = 2

batch_size = 32  # increase by powers of 2
num_episodes = 1000 # increase for more training

output_dir = 'results/cartpole'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


In [2]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # memory reply
        self.memory = deque(maxlen=2000)
        # discount factor
        self.gamma = 0.95                   #closer to 1, more importance to future rewards
        # exploration rate
        self.epsilon = 1.0                  # how much to act randomly
        # decay rate
        self.epsilon_decay = 0.995          # how much to reduce exploration rate epsilon*decay = new_epsilon
        # min exploration rate
        self.epsilon_min = 0.01             # ensure epsilon doesn't go below a certain value
        # learning rate
        self.learning_rate = 0.001          # how much we update our DQN
        self.model = self._build_model()

        self.loss_function = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)


    def _build_model(self):
        model = nn.Sequential(
            nn.Linear(self.state_size, 24),
            nn.GELU(),
            #nn.ReLU(),
            nn.Linear(24, 24),
            nn.GELU(),
            #nn.ReLU(),
            nn.Linear(24, self.action_size)
        )
        return model

    def remember(self, state, action, reward, next_state, done):
        # store the state, action, reward, next_state, done in memory
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        #if random number (0 -> 1) is less than epsilon, act randomly
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # otherwise, get the model to predict the Q values of the state

        # convert state to a tensor
        # bc PyTorch models expect input in the form of tensors
        state = torch.FloatTensor(state)

        # act_values is an array of Q values for each action
        act_values = self.model(state)

        # argmax returns the index of the max value in the array act_values
        return torch.argmax(act_values).item()

        # maybe add softmax here?

    def replay(self, batch_size):
        # if memory is less than batch_size, do nothing
        if len(self.memory) < batch_size:
            return
        # Select a random batch of experiences from the memory
        minibatch = random.sample(self.memory, batch_size)

        # Loop through each experience in the minibatch
        for state, action, reward, next_state, done in minibatch:
            # convert state, next_state to tensors
            state = torch.FloatTensor(state)
            next_state = torch.FloatTensor(next_state)
            action = torch.LongTensor([action])
            reward = torch.FloatTensor([reward])
            done = torch.FloatTensor([done])

            # If the episode has ended, the target is just the reward
            # If the episode has not ended, calculate the future discounted reward
            # Bellman equation: Q(s,a) = r + gamma * max Q(s',a')
            # reward = reward + gamma * (max Q value of the next state)
            target = reward + self.gamma * torch.max(self.model(next_state)) * (1 - done)

            # Get the predicted Q values
            #current_prediction = self.model(state)[action]
            # Get the predicted Q values
            current_prediction = self.model(state).gather(1, action.unsqueeze(-1))
            # Calculate the loss function between the predicted Q values and the target
            loss = self.loss_function(current_prediction, target)

            # Zero the gradients
            self.optimizer.zero_grad()
            # Backpropagate the loss (calculate the gradients)
            loss.backward()
            # Update the weights (adam step)
            self.optimizer.step()


        # If epsilon is greater than epsilon_minimum, decay it
        if self.epsilon > self.epsilon_min:
            # epsilon = epsilon * decay
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_state_dict(torch.load(name))

    def save(self, name):
        torch.save(self.model.state_dict(), name)


In [3]:
agent = Agent(state_size, action_size)
    # extract the numpy array from the state variable
    # state_array = state[0]
    # reshape the numpy array
    # state = np.reshape(state_array, [1, state_array.shape[0]])
    # now state_array is reshaped

    # old transpose:
    # reshape the states, transpose them to fit the model
    # state = np.reshape(state, [1, state_size])

In [4]:
# game hasnt started yet
done = False
for episode in range(num_episodes):

    # reset the environment
    state = env.reset()

    # reshape the states, transpose them to fit the model
    state = np.reshape(state[0], [1, state[0].shape[0]])


    for time in range(5000):
        #env.render()

        #checks epsilon and acts randomly or
        # converts states into tensor for the model
        # to predict the action then acts by argsmax of the Q values
        action = agent.act(state)

        #envirnment takes the action and returns the next state, reward, done
        next_state, reward, done, info, _ = env.step(action)


        if not done:
            #if the episode is not done, reward is whatever the reward is
            reward = reward
        else:
            #if the episode is done, reward is -10 for failing
            reward = -10

        # reshape the next state, transpose them to fit the model
        next_state = np.reshape(next_state, [1, next_state.shape[0]])

        # store in memory
        agent.remember(state, action, reward, next_state, done)

        # set the state to the next state
        state = next_state

        #
        # if the episode is done: either reached 5000 time steps or agent dropped the pole
        # print the episode number, score, epsilon, and break
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}".format(episode, num_episodes, time, agent.epsilon))
            break

    %timeit agent.replay(16)

    if episode % 50 == 0:
        agent.save(output_dir + "weights_" + "{:04d}".format(episode) + ".hdf5")

episode: 0/1000, score: 19, e: 1.0


  return F.mse_loss(input, target, reduction=self.reduction)


38.8 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 1/1000, score: 18, e: 0.67
39.7 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 2/1000, score: 17, e: 0.44
38.1 ms ± 370 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 3/1000, score: 36, e: 0.3
39.3 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 4/1000, score: 48, e: 0.2
39 ms ± 2.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 5/1000, score: 29, e: 0.13
42.1 ms ± 4.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 6/1000, score: 73, e: 0.088
40.9 ms ± 4.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 7/1000, score: 34, e: 0.058
39.1 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 8/1000, score: 87, e: 0.039
41.4 ms ± 5.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
episode: 9/1000, score: 27, e: 0.026
44.6 ms ± 7.21 ms per loop (mean ± std. 

KeyboardInterrupt: 

In [None]:
# prompt: load the weights

agent.load(output_dir + "weights_" + "{:04d}".format(50) + ".hdf5")
