In [None]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
import Env
from Env import CabDriver

# Read the provided time matrix from disk into a NumPy array.
Time_matrix = np.load(r'/Users/ACER/OneDrive/Desktop/AI PROJECT/TM.npy')

# Quick sanity check of the loaded data, one value per output line:
# container type, then maximum, minimum, mean and variance of the entries.
print(
    type(Time_matrix),
    Time_matrix.max(),
    Time_matrix.min(),
    Time_matrix.mean(),
    Time_matrix.var(),
    sep="\n",
)

# DQN Agent Class
class DQNAgent:
    """Epsilon-greedy DQN agent backed by a small fully connected network.

    The agent stores transitions in a bounded replay memory and fits a single
    Q-network on random mini-batches drawn from it.  States are converted to
    network inputs with the module-level ``env.state_encod_arch1``, so a
    global ``env`` must exist before the agent is constructed or used.
    """

    def __init__(self, state_size, action_size):
        """Set hyperparameters, create the replay memory and build the network.

        Args:
            state_size: Length of the encoded state vector (network input width).
            action_size: Number of actions (network output width).
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Specify hyperparameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.01
        self.epsilon = 1
        self.epsilon_max = 1
        # NOTE(review): epsilon_max/epsilon_decay/epsilon_min are not read in
        # this class; presumably the training loop uses them to anneal
        # ``epsilon`` -- confirm against the caller.
        self.epsilon_decay = -0.0001
        self.epsilon_min = 0.00001

        self.batch_size = 32

        # Create replay memory using deque (oldest samples drop out first)
        self.memory = deque(maxlen=2000)
        self.states_tracked = []

        # Encoded network input for state [1,2,3], kept so its Q-values can be
        # monitored (presumably by the training loop -- not used in this class).
        self.track_state = self._encode_state([1, 2, 3])

        # Create the Q-network (a single model; no separate target network)
        self.model = self.build_model()

    def _encode_state(self, state):
        """Return ``state`` encoded by the environment as a ``(1, n)`` array."""
        # reshape(1, -1) infers the width from the encoding instead of
        # hard-coding it, so the agent is not tied to one state size.
        return np.array(env.state_encod_arch1(state)).reshape(1, -1)

    # Approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping an encoded state of
            width ``state_size`` to ``action_size`` Q-value estimates.
        """
        model = Sequential()
        # Add layers to the neural network
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # Linear output: Q-values are regression targets and may be negative,
        # which a ReLU output layer could never represent.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        try:
            optimizer = Adam(learning_rate=self.learning_rate)
        except TypeError:
            # Older Keras releases only know the ``lr`` spelling.
            optimizer = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        model.summary()
        return model

    def get_action(self, state, possible_actions_index, actions):
        """Choose an action index with an epsilon-greedy policy.

        Args:
            state: Raw (unencoded) state as accepted by
                ``env.state_encod_arch1``.
            possible_actions_index: Indices of the actions allowed in
                ``state``; the result is always one of these.
            actions: Unused; kept so existing callers keep working.

        Returns:
            A random allowed index with probability ``epsilon``, otherwise the
            allowed index with the highest predicted Q-value.
        """
        # Exploration
        if np.random.rand() <= self.epsilon:
            return random.choice(possible_actions_index)

        # Exploitation - greedy, restricted to the allowed actions.
        # verbose=0 keeps per-step progress output from flooding the log.
        q_values = self.model.predict(self._encode_state(state), verbose=0)[0]
        allowed_q = [q_values[i] for i in possible_actions_index]
        return possible_actions_index[np.argmax(allowed_q)]

    def append_sample(self, state, action_index, reward, next_state, done):
        """Store one transition tuple in the replay memory."""
        self.memory.append((state, action_index, reward, next_state, done))

    # Pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Fit the network on one random mini-batch from the replay memory.

        Does nothing until the memory holds more than ``batch_size`` samples.
        For each sampled transition the target of the taken action is set to
        ``reward`` (terminal) or ``reward + discount_factor * max Q(next_state)``
        (non-terminal); all other outputs keep the current predictions.
        """
        if len(self.memory) <= self.batch_size:
            return

        # Sample batch from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        for row, (state, _, _, next_state, _) in enumerate(mini_batch):
            states[row] = env.state_encod_arch1(state)
            next_states[row] = env.state_encod_arch1(next_state)

        # Current predictions serve as the baseline targets, so only the
        # entry of the action actually taken contributes to the loss.
        targets = self.model.predict(states, verbose=0)
        # Q-values of the successor states, used for bootstrapping
        next_q = self.model.predict(next_states, verbose=0)

        for row, (_, action, reward, _, done) in enumerate(mini_batch):
            if done:
                # Terminal: no future return to bootstrap from
                targets[row][action] = reward
            else:
                # Non-terminal
                targets[row][action] = reward + self.discount_factor * np.max(next_q[row])

        # Model fit
        self.model.fit(states, targets, batch_size=self.batch_size, epochs=1, verbose=0)
