In [1]:
import gym
import numpy as np
import random
import os.path
from collections import deque
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from time import sleep

# --- Q-learning / optimiser settings ---------------------------------------
GAMMA = 0.95           # discount applied to the best next-state Q-value
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimiser

# --- Replay memory ----------------------------------------------------------
MEMORY_SIZE = 10 ** 6  # maximum number of transitions kept in the deque
BATCH_SIZE = 20        # transitions sampled per experience-replay call

# --- Epsilon-greedy schedule ------------------------------------------------
EXPLORATION_MAX = 1.0      # exploration rate a new solver starts with
EXPLORATION_MIN = 0.01     # floor the exploration rate never drops below
EXPLORATION_DECAY = 0.995  # multiplier applied after each replay pass

In [2]:
class DQNSolver:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The Q-network has two hidden ``Dense`` layers of 24 ReLU units and a
    linear output layer with one unit per action; it is compiled with MSE
    loss and the Adam optimiser. If a saved model file already exists it is
    loaded instead of building a new network.
    """

    def __init__(self, observation_space, action_space, model_path='cartpole.h5'):
        """Create the agent, loading a saved model when one is available.

        Args:
            observation_space: Number of features in one observation (used as
                the input width of the network).
            action_space: Number of discrete actions (output width).
            model_path: File checked for a previously saved Keras model.
                Defaults to ``'cartpole.h5'``.
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        # Bounded replay buffer: the oldest transitions fall off automatically.
        self.memory = deque(maxlen=MEMORY_SIZE)

        if not os.path.isfile(model_path):
            print("creating model...")
            self.model = Sequential()
            self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
            self.model.add(Dense(24, activation="relu"))
            self.model.add(Dense(self.action_space, activation="linear"))
            self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))
        else:
            self.model = tensorflow.keras.models.load_model(model_path)

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition
        to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, explore=True):
        """Choose an action for ``state``.

        Args:
            state: Observation batch of size one; only row 0 of the network's
                prediction is used.
            explore: When true (the default) a uniformly random action is
                returned with probability ``self.exploration_rate``. Pass
                ``False`` to always take the highest-valued action, e.g. when
                evaluating a trained agent.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if explore and np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0: newer Keras versions otherwise print a progress bar for
        # every single prediction, flooding the notebook output.
        q_values = self.model.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the network on a random mini-batch of remembered transitions.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        Otherwise ``BATCH_SIZE`` transitions are drawn with ``random.sample``
        and, one at a time, the predicted Q-value of the action that was
        taken is replaced by the target (``reward`` for terminal transitions,
        ``reward + GAMMA * max Q(next_state)`` otherwise) before calling
        ``model.fit`` on that single sample. Afterwards the exploration rate
        is multiplied by ``EXPLORATION_DECAY`` and floored at
        ``EXPLORATION_MIN``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + GAMMA * np.amax(next_q)
            # Only the taken action's value changes; the other outputs keep
            # the network's own prediction so they contribute no error.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


In [3]:
def cartpole_train(episodes=20, max_steps=100):
    """Train a ``DQNSolver`` on ``CartPole-v0`` and return it.

    Each step renders the environment, lets the solver pick an action,
    stores the resulting transition and runs one experience-replay pass.
    The reward of the terminal step is negated so that the transition that
    ended the episode is penalised.

    Args:
        episodes: Number of episodes to run (default 20).
        max_steps: Upper bound on steps per episode (default 100).

    Returns:
        The ``DQNSolver`` instance that was trained.
    """
    env = gym.make('CartPole-v0')
    try:
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)

        for _ in range(episodes):
            # The solver works on batches of one observation: shape (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            score = 0
            for t in range(max_steps):
                env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, _info = env.step(action)
                state_next = np.reshape(state_next, [1, observation_space])
                # Flip the sign of the final reward to penalise failure.
                reward = reward if not terminal else -reward
                score += reward

                dqn_solver.remember(state, action, reward, state_next, terminal)
                dqn_solver.experience_replay()
                state = state_next
                if terminal:
                    print("Episode finished after {} timesteps".format(t+1))
                    print("Score: {}".format(score))
                    break
            else:
                # The step cap was hit without the environment terminating;
                # report it so the longest-surviving episodes are not silent.
                print("Episode stopped at the {}-step limit".format(max_steps))
                print("Score: {}".format(score))
    finally:
        # Release the environment (and its render window) even on error.
        env.close()
    return dqn_solver

In [4]:
def cartpole_play(dqn_solver_trained, episodes=20, max_steps=100,
                  step_delay=0.03, greedy=True):
    """Render a solver playing ``CartPole-v0`` without training it.

    Args:
        dqn_solver_trained: Object exposing ``act(state)``; a ``DQNSolver``.
        episodes: Number of episodes to play (default 20).
        max_steps: Upper bound on steps per episode (default 100).
        step_delay: Seconds to sleep after each non-terminal step so the
            rendering is watchable (default 0.03).
        greedy: When true (the default) and the solver has an
            ``exploration_rate`` attribute, that rate is set to 0.0 for the
            duration of the call so ``act`` never takes its random branch.
            A freshly constructed solver starts at the maximum exploration
            rate and would otherwise play at random. The previous rate is
            restored before returning.
    """
    dqn_solver = dqn_solver_trained
    override_rate = greedy and hasattr(dqn_solver, "exploration_rate")
    if override_rate:
        saved_rate = dqn_solver.exploration_rate
        dqn_solver.exploration_rate = 0.0

    env = gym.make('CartPole-v0')
    try:
        observation_space = env.observation_space.shape[0]

        for _ in range(episodes):
            # The solver works on batches of one observation: shape (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            for _step in range(max_steps):
                env.render()
                action = dqn_solver.act(state)
                state_next, _reward, terminal, _info = env.step(action)
                state = np.reshape(state_next, [1, observation_space])
                if terminal:
                    break
                sleep(step_delay)
    finally:
        # Always release the environment and hand the solver back unchanged.
        env.close()
        if override_rate:
            dqn_solver.exploration_rate = saved_rate

In [None]:
# Run the training loop and keep the returned solver for the cells below.
dqn_solver_trained = cartpole_train()

Episode finished after 32 timesteps
Score: 30.0
Episode finished after 16 timesteps
Score: 14.0
Episode finished after 14 timesteps
Score: 12.0


In [None]:
# Show the solver object returned by training.
print(dqn_solver_trained)
# Uncomment to write the network to "cartpole.h5", the file DQNSolver looks
# for (and loads instead of building a new model) when it is constructed.
# dqn_solver_trained.model.save("cartpole.h5")

In [7]:
# Build a solver without running training. The environment is needed only to
# read the observation and action dimensions, so it is closed straight away.
env = gym.make('CartPole-v0')
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
env.close()
dqn_solver_trained = DQNSolver(observation_space, action_space)

In [9]:
# Watch the solver play; `cartpole_play` is the playback function defined in
# this notebook (the previous name `cartpole_1` does not exist).
cartpole_play(dqn_solver_trained)