### Cab-Driver Agent

In [None]:
# Mount Google Drive at /content/gdrive/ so the paths used below resolve (Colab only).
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
import sys
import os

# Project folder on the mounted drive; appending it to sys.path makes the
# modules stored there importable (the notebook later imports `Env`).
py_file_location = "/content/gdrive/MyDrive/Reinforcement Learning/RL Case Study/RL Project(Cab-Driver)-Code Structure"
sys.path.append(os.path.abspath(py_file_location))

In [None]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt
%matplotlib inline
# Import the environment
from Env import CabDriver

#### Defining Time Matrix

In [None]:
# Loading the time matrix provided.
# The file lives in the project folder already stored in `py_file_location`,
# so the path is built from it instead of repeating the absolute path.
Time_matrix = np.load(os.path.join(py_file_location, "TM.npy"))

In [None]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name, directory='/content/gdrive/MyDrive/Reinforcement Learning/RL Case Study/RL Project(Cab-Driver)-Code Structure/'):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File name without extension; ``.pkl`` is appended.
        directory: Folder to write into. Defaults to the project folder on
            the mounted drive, so existing two-argument calls are unchanged.
    """
    with open(os.path.join(directory, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill in the following items to complete the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with the following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: $r + \gamma \max_{a'} Q(s', a')$
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
class DQNAgent:
    """DQN agent with an experience-replay buffer and a single Q-network.

    The agent encodes states through the module-level ``env`` object
    (``env.state_encod_arch1``), so ``env`` must exist before an instance is
    created or any method that encodes a state is called.
    """

    # Default folder used by ``save`` when no directory is given.
    MODEL_DIR = '/content/gdrive/MyDrive/Reinforcement Learning/RL Case Study/RL Project(Cab-Driver)-Code Structure/'

    def __init__(self, state_size, action_size):
        """Set hyperparameters, replay memory, tracking lists and the Q-network.

        Args:
            state_size: Length of the encoded state vector fed to the network.
            action_size: Number of actions, i.e. the width of the output layer.
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.9
        self.learning_rate = 0.01
        self.epsilon_max = 1
        self.epsilon = 1
        self.epsilon_decay = -0.0005
        self.epsilon_min = 0.00001

        self.batch_size = 32
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # Q-values recorded by save_tracking_states() / save_test_states().
        self.states_tracked = []
        # Previously never initialised, so save_test_states() raised AttributeError.
        self.states_test = []

        # We are going to track state [0,0,0] and action (0,2) at index 2 in the action space.
        self.track_state = self._encode([0, 0, 0])

        # create the Q-network (only one model is used; there is no separate target network)
        self.model = self.build_model()

    def _encode(self, state):
        """Return ``state`` encoded by the global ``env`` as a (1, state_size) array."""
        return np.array(env.state_encod_arch1(state)).reshape(1, self.state_size)

    # approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the Q-network: two hidden ReLU layers, linear output."""
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # Linear output: a ReLU here would clamp every predicted Q-value at 0,
        # so negative returns could never be represented.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()
        return model

    def get_action(self, state, possible_actions_index, actions):
        """Pick an action index with an epsilon-greedy policy.

        Args:
            state: Un-encoded state accepted by ``env.state_encod_arch1``.
            possible_actions_index: Indices of the actions allowed in ``state``.
            actions: Not used by this method; kept for interface compatibility.

        Returns:
            A member of ``possible_actions_index``: a uniformly random one with
            probability ``epsilon``, otherwise the one with the highest
            predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.choice(possible_actions_index)

        # verbose=0 keeps Keras from printing a progress bar on every step.
        q_value = self.model.predict(self._encode(state), verbose=0)
        possible_q_values = [q_value[0][i] for i in possible_actions_index]
        return possible_actions_index[np.argmax(possible_q_values)]

    def append_sample(self, state, action_idx, reward, next_state, terminal_state):
        """Save the sample <s, a, r, s', terminal> to the replay memory."""
        self.memory.append((state, action_idx, reward, next_state, terminal_state))

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Run one training step on a random mini-batch from the replay memory.

        Does nothing until the memory holds more than ``3 * batch_size`` samples.
        """
        if len(self.memory) <= self.batch_size * 3:
            return

        # Sample batch from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        update_input = np.zeros((self.batch_size, self.state_size))  # encoded states s
        next_states = np.zeros((self.batch_size, self.state_size))   # encoded next states s'
        actions, rewards, terminal_flags = [], [], []

        for i, (state, action, reward, next_state, terminal_value) in enumerate(mini_batch):
            update_input[i] = env.state_encod_arch1(state)
            next_states[i] = env.state_encod_arch1(next_state)
            actions.append(action)
            rewards.append(reward)
            terminal_flags.append(terminal_value)

        # 1. Current Q(s, .) predictions; only the taken action's entry is overwritten below.
        target = self.model.predict(update_input, verbose=0)
        # 2. Q(s', .) from the same network, used for the bootstrap term.
        next_q = self.model.predict(next_states, verbose=0)

        # 3. Build the regression targets.
        for i in range(self.batch_size):
            if terminal_flags[i]:
                # Only the final reward is considered if the episode has ended.
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(next_q[i])

        # 4. Fit the model on the updated targets.
        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)

    def save_tracking_states(self):
        """Append the predicted Q-value of the tracked state/action to ``states_tracked``."""
        # Use the model to predict the q_value of the state we are tracking.
        q_value = self.model.predict(self.track_state, verbose=0)

        # Grab the q_value of the action index that we are tracking.
        self.states_tracked.append(q_value[0][2])

    def save_test_states(self):
        """Append the predicted Q-value of the tracked state/action to ``states_test``."""
        # Use the model to predict the q_value of the state we are tracking.
        q_value = self.model.predict(self.track_state, verbose=0)

        # Grab the q_value of the action index that we are tracking.
        self.states_test.append(q_value[0][2])

    def save(self, name, directory=None):
        """Pickle the model to ``<directory>/<name>.pkl``.

        Args:
            name: File name. ``.pkl`` is appended only when missing, so a name
                that already ends in ``.pkl`` no longer becomes ``*.pkl.pkl``.
            directory: Target folder; defaults to ``MODEL_DIR``.

        NOTE(review): whether a Keras model can be pickled depends on the
        installed Keras/TensorFlow version - confirm in the target runtime.
        """
        if not name.endswith('.pkl'):
            name = name + '.pkl'
        target_dir = self.MODEL_DIR if directory is None else directory
        with open(os.path.join(target_dir, name), 'wb') as file:
            pickle.dump(self.model, file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Number of training episodes run by the DQN training loop.
Episodes = 15000

In [None]:
# Episode horizon in hours: 30 days of 24 hours, after which the car has to be recharged.
episode_length = 30 * 24

# Sizes of the three segments that make up the encoded state vector.
# NOTE(review): presumably locations, hours of the day and days of the week - confirm against Env.
m, t, d = 5, 24, 7

# Environment instance plus its initial action space, state space and state.
env = CabDriver()
action_space, state_space, state = env.reset()

# Network input width (vector length) and output width (one unit per action).
state_size = sum((m, t, d))
action_size = len(action_space)

# The agent under training.
agent = DQNAgent(state_size=state_size, action_size=action_size)

# Per-episode bookkeeping: total reward and episode number.
rewards_per_episode = []
episodes = []
# Rewards for state [0,0,0] being tracked.
rewards_init_state = []

### DQN block

In [None]:
start_time = time.time()
scores_track = []

for episode in range(Episodes):
    # Fresh environment for every episode. `env` is rebound at module level
    # on purpose: the agent encodes states through this global.
    env = CabDriver()
    action_space, state_space, state = env.reset()

    terminal_state = False
    score = 0
    run_time = 0  # time accumulated over the steps of this episode

    while not terminal_state:
        possible_actions_index, actions = env.requests(state)
        # 1. Pick epsilon-greedy action from possible actions for the current state
        action = agent.get_action(state, possible_actions_index, actions)
        # 2. Evaluate your reward and next state
        reward = env.reward_func(state, env.action_space[action], Time_matrix)
        # [::3] keeps elements 0 and 3 of the returned sequence.
        # NOTE(review): presumably the next state and the duration of this step - confirm against Env.
        next_state, next_time = env.next_state_func(state, env.action_space[action], Time_matrix)[::3]

        run_time += next_time
        # End the episode once the accumulated time passes the horizon. The
        # original compared the single-step `next_time` with `episode_length`
        # and never used the accumulated `run_time`.
        if run_time > episode_length:
            terminal_state = True
        else:
            # 3. Append the experience to the memory. Samples are stored only
            #    on this non-terminal branch, so the stored flag is always False.
            agent.append_sample(state, action, reward, next_state, terminal_state)
            # 4. Train the model by calling function agent.train_model
            agent.train_model()
            # 5. Keep a track of rewards, Q-values, loss
            score += reward
            state = next_state

    rewards_per_episode.append(score)
    episodes.append(episode)

    # Epsilon decay: exponential decay from epsilon_max towards epsilon_min,
    # using the agent's own hyperparameters instead of repeated literals.
    agent.epsilon = agent.epsilon_min + (agent.epsilon_max - agent.epsilon_min) * np.exp(agent.epsilon_decay * episode)

    # every 20 episodes:
    if episode % 20 == 0:
        print("Episode: {0}, Reward: {1}, Memory Length: {2}, Epsilon: {3}".format(episode,score,len(agent.memory),agent.epsilon))
    # every 10 episodes, store q-values of some prespecified state-action pairs:
    if episode % 10 == 0:
        agent.save_tracking_states()

    #Rewards per episode
    scores_track.append(score)

    if episode % 1000 == 0:
        print("Saving Model {}".format(episode))
        agent.save(name="model_weights.pkl")

end_time = time.time()
# round(..., 2) must wrap the hours value; the original passed 2 to str() and raised TypeError.
print('Time Taken: '+str(round((end_time-start_time)/3600, 2))+' hours')

In [None]:
# Persist the model once more after the training loop has finished.
agent.save(name="model_weights.pkl")

In [None]:
# Show the Q-values recorded for the tracked state-action pair (one entry per 10 training episodes).
agent.states_tracked

In [None]:
# Keep only the non-zero tracked Q-values.
# The original line was cut off mid-condition with an unclosed bracket (SyntaxError);
# this is the smallest completion that runs.
# NOTE(review): confirm the intended filter - a threshold may have been lost.
state_tracked_sample = [q_val for q_val in agent.states_tracked if q_val]

### Tracking Convergence

In [None]:
# Convergence check: tracked Q-value per recorded sample, on a log-scaled y axis.
tracked_q = np.asarray(agent.states_tracked)
xaxis = np.asarray(range(0, len(tracked_q)))

plt.figure(figsize=(10,7))
plt.title('Q_value for state [0,0,0]  action (0,2)')
plt.semilogy(xaxis, tracked_q)
plt.show()

In [None]:
# Every 4th episode score. The training loop stores scores in `scores_track`;
# the original referenced the undefined name `score_tracked` (NameError).
score_tracked_sample = scores_track[::4]

In [None]:
# Total reward of every episode, plotted against the episode index.
episode_index = range(len(rewards_per_episode))
plt.plot(episode_index, rewards_per_episode)

In [None]:
# Down-sampled episode rewards (see score_tracked_sample) against sample index.
sampled_scores = np.asarray(score_tracked_sample)
xaxis = np.asarray(range(0, len(score_tracked_sample)))

plt.figure(0, figsize=(16,7))
plt.title('Rewards per episode')
plt.plot(xaxis, sampled_scores)
plt.show()

In [None]:
# Mean total reward over the most recent 100 episodes.
last_100_rewards = rewards_per_episode[-100:]
print("Average reward of last 100 episodes is {0}".format(np.mean(last_100_rewards)))

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()

## **Note:** <br>
I could not finish the training due to severe Internet and system issues, which is why I was not able to attach the model_weights.pkl file. Apart from this, I have done the maximum I was able to, within my current abilities. I apologise for the same and request you to take this into consideration. <br>
Thank you.