In [1]:
import random
from collections import deque

import gymnasium as gym
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np

In [2]:
# Gymnasium environment id passed to gym.make().
ENV_NAME = "CartPole-v1"

# Q-learning settings.
GAMMA = 0.95  # discount factor applied to the best next-state Q-value
LEARNING_RATE = 1e-3  # step size handed to the Adam optimizer

# Replay buffer settings.
MEMORY_SIZE = 1_000_000  # maxlen of the transition deque
BATCH_SIZE = 20  # transitions sampled per experience-replay pass

# Epsilon-greedy schedule: start at MAX, multiply by DECAY after each replay
# pass, and never drop below MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

In [3]:
class DQN:
    """Deep Q-Network agent with a replay buffer and an epsilon-greedy policy.

    The Q-function is a small fully connected Keras network that maps an
    observation vector to one Q-value per discrete action.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay buffer.

        Args:
            observation_space: Length of the flat observation vector; used as
                the input shape of the first Dense layer.
            action_space: Number of discrete actions; sets the width of the
                linear output layer and the range of random actions.
        """
        # Learning Parameters
        self.exploration_rate = EXPLORATION_MAX

        # Memory
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Model (TODO: try other architectures)
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Batched observation accepted by ``model.predict``; only the
                first row of the predicted Q-values is used, so a single
                observation shaped ``(1, observation_space)`` is expected.

        Returns:
            int: Index of the chosen action.
        """
        # Explore: uniform random action with probability exploration_rate.
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # Exploit: greedy action under the current Q-network. verbose=0 keeps
        # Keras from printing a progress bar for every single decision.
        q_values = self.model.predict(state, verbose=0)
        # Cast so both branches return a plain Python int.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random minibatch of stored transitions, then decay epsilon.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, in sample order, so later
        targets in the same batch are computed with already-updated weights.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value to bootstrap from.
            q_update = reward
            if not terminal:
                next_q_values = self.model.predict(state_next, verbose=0)
                q_update = reward + GAMMA * np.amax(next_q_values[0])
            # Only the taken action's Q-value is moved toward the target; the
            # other outputs keep their current predictions (zero error).
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

In [5]:
# Create the environment, then size the agent from its spaces: the length of
# the observation vector and the number of discrete actions.
env = gym.make(ENV_NAME)
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
dqn_solver = DQN(observation_space=observation_space, action_space=action_space)

Metal device set to: Apple M1


2023-07-08 17:12:10.585336: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-07-08 17:12:10.585793: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [25]:
# `state` is bound to the whole return value of reset(); the printed output of
# the next cell shows it is an (observation array, info dict) tuple, so the
# observation itself is state[0].
state = env.reset()

In [26]:
# Bare trailing expression: let the notebook display the value itself.
state

(array([-0.02704097, -0.02883094, -0.03380166,  0.00350587], dtype=float32), {})


In [29]:
# Give the observation (first element of the reset tuple) a leading batch axis.
np.reshape(state[0], (1, observation_space))

array([[-0.02704097, -0.02883094, -0.03380166,  0.00350587]],
      dtype=float32)

In [31]:
# `state` is the (observation, info) tuple returned by reset(); the agent's
# greedy branch feeds its argument to model.predict, so pass only the
# observation, reshaped to a batch of one.
action = dqn_solver.act(np.reshape(state[0], [1, observation_space]))

In [33]:
# Probe how many values step() returns. Note that this call really executes
# `action` in the environment; everything but the length of the result is
# discarded.
len(env.step(action))

5

In [None]:
# step() returns five values here (observation, reward, terminated, truncated,
# info), so unpacking into four names raises ValueError.
state_next, reward, terminated, truncated, info = env.step(action)
# Collapse both end-of-episode signals into a single flag.
terminal = terminated or truncated