### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [10]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon_max = 1
        self.epsilon_decay = -0.0009
        self.epsilon_min = 0
        self.batch_size = 32        
        self.memory = deque(maxlen=2000)
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model

    
    def get_action(self,state,perm_idx,actions,t):
        epsilon = (1 - 0) * np.exp(-0.0009*t)
        if np.random.rand() <= epsilon:
            return random.choice(perm_idx)
        else:
            q_value = self.model.predict(np.array(state).reshape(self.state_size))
            print(q_value)
            return np.argmax(q_value[0])


    def append_sample(self, state, action, reward, next_state,done):
        self.memory.append((state,action,reward,next_state,done))
    
        
    def train_model(self): 
        if len(self.memory) > self.batch_size:
            mini_batch = random.sample(self.memory, self.batch_size)
            update_target = np.zeros((self.batch_size, self.state_size))
            update_input = np.zeros((self.batch_size, self.state_size))
            
            action, reward, done = [], [] , []
            
            for i in range(self.batch_size):
                update_input[i] = mini_batch[i][0]
                action.append(mini_batch[i][1])
                reward.append(mini_batch[i][2])
                update_target[i] = mini_batch[i][3]
                done.append(mini_batch[i][4])
                
            target = self.model.predict(update_input)
            target_val = self.model.predict(update_target)
            
            for i in range(self.batch_size):
                if done[i]:
                    target[i][action[i]] = reward[i]
                else:
                    target[i][action[i]] = reward[i] + self.discount_factor * (np.amax(target_val[i]))
                                
            history = self.model.fit(update_input, target, batch_size=self.batch_size,epochs=1, verbose=0)
            return history.history['loss']


    def save(self, name):
        self.model.save(name)

In [11]:
Episodes = 50

In [12]:
Time_matrix = np.load("TM.npy")
for episode in range(Episodes):
    cd = CabDriver()
    agent = DQNAgent(36,20)
    terminal_state = False
    state = cd.state_init 
    rewards,losses = [],[]
    t = 0 
    while not terminal_state:
        perm_idx,actions = cd.requests(state)
        action = agent.get_action(cd.state_encod(state),perm_idx,actions,t)
        next_state, reward, done = cd.step(state,action,Time_matrix)
        terminal_state = done
        agent.append_sample(cd.state_encod(state), action, reward, cd.state_encod(next_state), done)
        loss = agent.train_model()
        state = next_state
        if loss is not None:
            losses.append(loss)
            rewards.append(reward)
        t+=1

ValueError: Error when checking : expected dense_5_input to have shape (36,) but got array with shape (1,)

### Tracking Convergence

In [None]:
# 