In [1]:
%pip install gym

Note: you may need to restart the kernel to use updated packages.


In [2]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import keras
import time
from tqdm import tqdm
import os

In [3]:
# Gym environment id used throughout the notebook.
# Alternative task: set ENV_NAME = 'LunarLander-v2' instead.
ENV_NAME = 'CartPole-v1'

# When True, DQNAgent.play() skips the learning step after each episode.
PLAY_MODE = True
# Location the agent loads its saved Keras model from and saves it to.
MODEL_PATH = ENV_NAME + '_model'

env = gym.make(ENV_NAME)
# Number of discrete actions and length of the observation vector.
action_space = env.action_space.n
observation_space = env.observation_space.shape[0]

In [4]:
# Discount factor applied to the best next-state Q-value in DQNAgent.learn().
GAMMA = 0.95

# Passed to the Adam optimizer when a new model is created.
LEARNING_RATE = 0.01
LEARNING_RATE_DECAY = 0.01

# Replay memory capacity (deque maxlen) and the maximum number of transitions
# sampled per learn() call.
MEMORY_SIZE = 1_000_000
BATCH_SIZE = 64

# Exploration settings. EXPLORATION_MAX seeds the agent's exploration_rate;
# EXPLORATION_MIN and EXPLORATION_DECAY are not referenced by the agent code in
# this notebook.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

# Seconds to sleep between rendered frames in DQNAgent.play().
# (A previous 0.05 assignment was immediately overwritten by this value.)
FRAME_RATE = 0.01

# MODEL_PATH is defined once in the environment configuration cell; the
# identical re-assignment that used to live here was redundant.


In [5]:
class DQNAgent():
    """Q-learning agent backed by a small Keras network.

    The agent wraps a Gym-style environment, stores transitions in a replay
    memory and fits the network on randomly sampled mini-batches.

    It reads these notebook-level constants at call time: EXPLORATION_MAX,
    MEMORY_SIZE, MODEL_PATH, LEARNING_RATE, LEARNING_RATE_DECAY, GAMMA,
    BATCH_SIZE, FRAME_RATE and PLAY_MODE.
    """

    def __init__(self, env):
        """Create the agent and load or build its model.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()``, ``step()`` and ``render()``.
        """
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.should_learn = True
        self.env = env
        self.observation_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n

        if os.path.exists(MODEL_PATH):
            self.model = keras.models.load_model(MODEL_PATH)
            print('Using predefined model')
        else:
            self.model = Sequential()
            self.model.add(Dense(24, input_dim=self.observation_space, activation='tanh'))
            self.model.add(Dense(48, activation='tanh'))
            self.model.add(Dense(self.action_space, activation='linear'))
            # NOTE(review): `lr` / `decay` are legacy Adam argument names; newer
            # Keras releases may expect `learning_rate` -- confirm against the
            # installed version before changing.
            self.model.compile(loss='mse', optimizer=Adam(lr=LEARNING_RATE, decay=LEARNING_RATE_DECAY))
            print('Creating new model')
        # Counts learn() calls; drives the periodic model save.
        self.iterations = 1

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, exploration_rate):
        """Choose an action for ``state``.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.

        Args:
            state: observation already shaped by ``process_state``.
            exploration_rate: probability of picking a random action.

        Returns:
            Index of the chosen action.
        """
        if np.random.rand() < exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def process_state(self, state):
        """Reshape a raw observation into a ``(1, observation_space)`` array."""
        return np.reshape(state, [1, self.observation_space])

    def learn(self):
        """Fit the model on a random mini-batch drawn from the replay memory.

        Up to BATCH_SIZE transitions are sampled. For each one, the predicted
        Q-value of the taken action is replaced by ``reward`` (terminal step) or
        ``reward + GAMMA * max(Q(next_state))``. The model is saved to
        MODEL_PATH on every 10th call. Does nothing when the memory is empty.
        """
        if not self.memory:
            # Nothing to sample; fitting on an empty batch would fail.
            return

        mini_batch = random.sample(
            self.memory, min(len(self.memory), BATCH_SIZE))

        # Each stored state has shape (1, observation_space); stack them so the
        # network is queried twice in total instead of twice per transition.
        states_batch = np.vstack([transition[0] for transition in mini_batch])
        next_states_batch = np.vstack([transition[3] for transition in mini_batch])
        q_values_batch = self.model.predict(states_batch, verbose=0)
        next_q_values_batch = self.model.predict(next_states_batch, verbose=0)

        for row, (_, action, reward, _, done) in enumerate(mini_batch):
            if done:
                target = reward
            else:
                target = reward + GAMMA * np.max(next_q_values_batch[row])
            q_values_batch[row][action] = target

        self.model.fit(states_batch, q_values_batch, batch_size=len(states_batch), verbose=0)

        if self.iterations % 10 == 0:
            print(f'Saving model to {MODEL_PATH}')
            self.model.save(MODEL_PATH)

        self.iterations += 1

    def play(self, exploration, render=True):
        """Run one episode and return its total reward.

        Every transition is stored in the replay memory. After the episode,
        ``learn()`` is called once unless the notebook-level PLAY_MODE is true.

        Args:
            exploration: exploration rate passed to ``act`` for every step.
            render: when true, render each frame and sleep FRAME_RATE seconds
                between frames.

        Returns:
            Sum of the rewards received during the episode.
        """
        state = self.process_state(self.env.reset())
        if render:
            # Render the agent's own environment rather than a global `env`.
            self.env.render()
        done = False
        score = 0
        while not done:
            action = self.act(state, exploration)
            next_state, reward, done, _ = self.env.step(action)
            if render:
                time.sleep(FRAME_RATE)
                self.env.render()
            next_state = self.process_state(next_state)
            self.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward
        if not PLAY_MODE:
            self.learn()
        return score


In [6]:
# Build the agent around the notebook's environment; the constructor loads the
# model at MODEL_PATH when that path exists and otherwise builds a new network.
agent = DQNAgent(env)

Using predefined model


# Play Randomly

In [7]:
# Exploration rate 1: act() always takes the random-action branch.
agent.play(1)

-151.73804950987062

# Partially Explore

In [8]:
# Exploration rate 0.5: roughly half of the actions are random, the rest greedy.
agent.play(0.5)

-209.11163479215594

# Exploit

In [9]:
# Exploration rate 0: act() always picks the action with the highest predicted Q-value.
agent.play(0)

-124.62020103034948