In [None]:
## Solving the same problem using a Deep Q-Network (DQN)
import gym
import random
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Embedding
from keras.layers import Reshape
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras import regularizers
#from scores.score_logger import ScoreLogger

In [None]:
class DQNSolver:
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    The network maps an integer-encoded state through an Embedding layer and
    one hidden Dense layer to one Q-value per action. Transitions are stored
    in a bounded replay memory and replayed in random mini-batches.

    Reads the module-level constants EXPLORATION_MAX, EXPLORATION_MIN,
    EXPLORATION_DECAY, MEMORY_SIZE, BATCH_SIZE, GAMMA, LEARNING_RATE,
    VOCAB_SIZE and EMBED_DIM, which must exist before use.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and compile the Q-network.

        observation_space: length of the integer input vector given to the
            Embedding layer. The Reshape to (embed_dim,) only fits when this
            is 1.
        action_space: number of actions, i.e. the width of the output layer.
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        # Oldest transitions are discarded once MEMORY_SIZE is reached.
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.vocab_size = VOCAB_SIZE
        self.embed_dim = EMBED_DIM

        # Embedding -> flatten to (embed_dim,) -> hidden layer -> linear Q-values.
        self.model = Sequential([
            Embedding(self.vocab_size, self.embed_dim, input_shape=(observation_space,)),
            Reshape((self.embed_dim,)),
            Dense(
                30,
                activation="relu",
                kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l2(0.01),
            ),
            Dense(self.action_space, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Pick an action for `state` using an epsilon-greedy rule.

        With probability `exploration_rate` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        explore = np.random.rand() < self.exploration_rate
        if explore:
            return random.choice(range(self.action_space))
        predicted = self.model.predict(state)
        return np.argmax(predicted[0])

    def experience_replay(self):
        """Fit the network on BATCH_SIZE randomly sampled transitions.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the Q-value of the
        taken action is replaced by the reward (terminal) or by
        reward + GAMMA * max next-state Q-value (non-terminal). Afterwards the
        exploration rate is decayed once, never dropping below EXPLORATION_MIN.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                target = reward
            else:
                best_next_q = np.amax(self.model.predict(state_next)[0])
                target = reward + GAMMA * best_next_q
            # Only the taken action's Q-value is moved toward the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)

        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate * EXPLORATION_DECAY)

In [None]:
# --- Environment ------------------------------------------------------------
ENV_NAME = "Taxi-v3"  # Gym environment id

# --- Hyperparameters (read by DQNSolver and the training loop) --------------
GAMMA = 0.7                # discount on the best next-state Q-value
LEARNING_RATE = 0.1        # step size handed to the Adam optimizer
MEMORY_SIZE = 50000        # capacity of the replay memory (transitions)
BATCH_SIZE = 128           # transitions sampled per replay pass
EXPLORATION_MAX = 1        # exploration rate at the start of training
EXPLORATION_MIN = 0.5      # floor the exploration rate never drops below
EXPLORATION_DECAY = 0.995  # multiplier applied once per replay pass
EPISODES = 5000            # number of training episodes
VOCAB_SIZE = 500           # embedding vocabulary size; should equal the env's number of discrete states
EMBED_DIM = 16             # embedding vector length per state

# --- Environment instance and network dimensions ----------------------------
# `.env` takes the environment held inside the object gym.make returns
# (presumably to bypass the step-limit wrapper; TODO confirm).
env = gym.make(ENV_NAME).env
# A state is a single integer, so the network input is a length-1 vector.
observation_space = 1
# NOTE(review): one fewer action than the environment exposes, so the
# highest-numbered action is never available to the agent. Confirm this is
# intentional.
action_space = env.action_space.n - 1

# --- Agent ------------------------------------------------------------------
dqn_solver = DQNSolver(observation_space=observation_space, action_space=action_space)

In [None]:
# Train for a number of episodes.
#
# Index 4 is the "pickup" action in Taxi-v3's action encoding. This notebook
# ends an episode as soon as the agent attempts a pickup, whatever the
# environment itself reports.
PICKUP_ACTION = 4

for run in range(1, EPISODES + 1):
    # States are reshaped to (1, observation_space): the batch-of-one input
    # shape that DQNSolver's model.predict / model.fit calls receive.
    state = np.reshape(env.reset(), [1, observation_space])
    step = 0
    terminal = False
    while not terminal:
        step += 1
        action = dqn_solver.act(state)
        state_next, reward, terminal, info = env.step(action)
        if action == PICKUP_ACTION:
            terminal = True
        state_next = np.reshape(state_next, [1, observation_space])
        dqn_solver.remember(state, action, reward, state_next, terminal)
        state = state_next

    # "score" is the number of steps the episode lasted.
    print(f"Run: {run}, exploration: {dqn_solver.exploration_rate}, score: {step}")

    # One replay pass (and one exploration-rate decay) per finished episode.
    dqn_solver.experience_replay()

In [None]:
"""Evaluate the agent's performance after Deep Q-learning."""

# Evaluation settings.
PICKUP_ACTION = 4            # Taxi-v3 "pickup"; an episode ends once it is attempted
ILLEGAL_ACTION_REWARD = -10  # reward value counted as a penalty
# Safety cap: with exploration switched off, a policy that never chooses
# PICKUP_ACTION would otherwise loop forever.
MAX_EVAL_STEPS = 200

total_epochs, total_penalties, total_reward_all_episode = 0, 0, 0
episodes = 1

# Evaluate the learned policy greedily. `act` is epsilon-greedy, so the
# exploration rate is forced to 0 for the evaluation and restored afterwards;
# otherwise a large share of the evaluated actions would be random.
trained_exploration_rate = dqn_solver.exploration_rate
dqn_solver.exploration_rate = 0.0
try:
    for e in range(episodes):
        state = env.reset()
        env.render()
        state = np.reshape(state, [1, observation_space])
        epochs, penalties, reward, total_reward = 0, 0, 0, 0
        done = False
        while not done and epochs < MAX_EVAL_STEPS:
            action = dqn_solver.act(state)
            state, reward, done, info = env.step(action)
            env.render()
            if action == PICKUP_ACTION:
                done = True
            state = np.reshape(state, [1, observation_space])
            total_reward += reward
            if reward == ILLEGAL_ACTION_REWARD:
                penalties += 1
            epochs += 1
        total_penalties += penalties
        total_epochs += epochs
        total_reward_all_episode += total_reward
        print(f"Total reward for Episode {e}:  {total_reward}")
        print(f"Penalty for Episode {e}:  {penalties}")
finally:
    # Leave the agent as it was so later training cells are unaffected.
    dqn_solver.exploration_rate = trained_exploration_rate

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Average reward per episode: {total_reward_all_episode / episodes}")