# Reinforcement Learning with:
- Monte Carlo Simulation Algorithm
- SARSA Algorithm
- Q Learning Algorithm

# Environment and Mission 

*The goal is for the agent to learn how to navigate the state space to reach the end goal of retrieving the frisbee*
<br></br>

<U>**Within Action Space, the following actions are defined:**</U>

**'L':** Move left

**'D':** Move down

**'R':** Move right

**'U':** Move up

*If the agent attempts to leave the grid while at an edge, the program sets the new state to the old state, i.e. the agent does not move.*
<br></br>

<U>**Map**:</U>
    
    S  .  .  .
    
    .  H  .  H
    
    .  .  .  H
    
    H  .  .  E
<br></br>
<U>**Rewards**:</U>

Reach goal: +1

Reach hole: -1

Traversing frozen surface: 0 


---

# Building Environment


### Importing relevant packages

In [194]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
import statistics as st
import pandas as pd
from queue import PriorityQueue

### Creating Grid Environment

#### Creating Grid Class

In [195]:
class Grid:
    '''
    Grid world that tracks the agent's current (row, col) position.

    Rewards and legal actions are supplied separately through .set().
    Any state that is missing from the actions dictionary is treated as a
    terminal state (hole / goal).
    '''

    # (row, col) offset applied by each action letter
    _MOVES = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    # Takes in variables of rows, cols and start state of agent
    def __init__(self, rows, cols, start):
        self.rows = rows
        self.cols = cols
        # Remember the start state so that reset() can return to it
        self.start = (start[0], start[1])
        self.i = start[0]
        self.j = start[1]

    # Function that allows user to set rewards and actions allowed at given states
    #   rewards: dict mapping (row, col) -> reward received on arriving there
    #   actions: dict mapping (row, col) -> tuple of legal action letters
    def set(self, rewards, actions):
        self.rewards = rewards
        self.actions = actions

    # Function that allows user to set state of agent
    def set_state(self, s):
        self.i = s[0]
        self.j = s[1]

    # Function that fetches current state of agent as a (row, col) tuple
    def current_state(self):
        return (self.i, self.j)

    # Function that checks if s is a terminal state (hole / goal state):
    # terminal states are exactly those with no entry in the actions dictionary
    def is_terminal(self, s):
        return s not in self.actions

    # Function that fetches the possible actions the agent can take at a given state s
    # (raises KeyError for terminal states, which have no entry)
    def possible_actions(self, s):
        return self.actions[s]

    # Moves the agent in the state space based on the action taken by the agent.
    # An action that is not legal at the current state leaves the agent in place.
    # Raises KeyError if the agent is currently in a terminal state.
    def move(self, action):
        if action in self.actions[(self.i, self.j)]:
            d_row, d_col = self._MOVES.get(action, (0, 0))
            self.i += d_row
            self.j += d_col

    # Gets reward of the current state of agent (0 when the state has no reward entry)
    def get_rewards(self):
        return self.rewards.get((self.i, self.j), 0)

    # Undo move of agent by applying the opposite offset
    # (no legality check is made; unknown action letters are ignored)
    def undo_move(self, action):
        d_row, d_col = self._MOVES.get(action, (0, 0))
        self.i -= d_row
        self.j -= d_col

    # To reset agent to the start state given to the constructor
    def reset(self):
        self.set_state(self.start)

#### Environment Characteristics class

In [196]:
class env_characteristics():
    '''
    Generator for a random grid environment.

    The start state is the top-left cell (0, 0) and the end state is the
    bottom-right cell. Holes are placed at random, but only where a path from
    start to end still exists afterwards.

        Functions:
        # Legal actions for every cell ->              .create_legal_actions()
        # Actions and rewards with random holes ->     .createActionsRewards()
    '''
    def __init__(self, no_of_rows, no_of_cols, percentage_of_holes):
        self.rows = no_of_rows
        self.cols = no_of_cols
        self.actions = {}
        self.holes = []
        self.rewards = {}
        self.start_state = (0, 0)
        self.end_state = (no_of_rows - 1, no_of_cols - 1)
        # Fraction (0..1) of all cells that should become holes
        self.percentage_holes = percentage_of_holes

    # Function that creates legal actions dictionary (similar to hard-coded version in env1)
    # Returns a dict mapping every (row, col) to a tuple of the actions that keep
    # the agent on the grid, always listed in the order 'D', 'L', 'R', 'U'
    def create_legal_actions(self):
        last_row = self.rows - 1
        last_col = self.cols - 1

        actions = {}
        for i in range(self.rows):
            for j in range(self.cols):
                # Each direction is tested independently, so single-row and
                # single-column grids are handled as well
                legal = []
                if i < last_row:
                    legal.append('D')
                if j > 0:
                    legal.append('L')
                if j < last_col:
                    legal.append('R')
                if i > 0:
                    legal.append('U')
                actions[(i, j)] = tuple(legal)

        return actions

    # Function that creates the actions and rewards dictionaries (similar to hard-coded version in env1)
    # Holes are removed from the actions dictionary (making them terminal) and get reward -1
    # After every tentative hole the Astar algorithm checks that the end state is still reachable
    # Returns (actions, rewards); raises ValueError if the requested number of holes cannot be placed
    def createActionsRewards(self):
        actions_list = self.create_legal_actions()
        number_of_holes = int(self.percentage_holes * self.rows * self.cols)

        # Rewards dictionary starts with the end state
        rewards = {self.end_state: 1}

        for _ in range(number_of_holes):
            # Every remaining open cell except start / end is a candidate hole
            candidates = [state for state in actions_list
                          if state != self.start_state and state != self.end_state]
            # Trying candidates in random order gives each viable cell the same
            # chance of being picked, and guarantees the search terminates
            random.shuffle(candidates)

            for state in candidates:
                # Tentatively remove the cell so Astar treats it as blocked
                saved_actions = actions_list.pop(state)
                viable_path = astar_algo(self.start_state, self.end_state, self.rows, actions_list)
                if viable_path is not None:
                    rewards[state] = -1
                    break
                # Hole would block every path: put the cell back and try another
                actions_list[state] = saved_actions
            else:
                raise ValueError(
                    'Cannot place {} holes without blocking every path from {} to {}'.format(
                        number_of_holes, self.start_state, self.end_state))

        # Remove end state to fit definition of environment (terminal states have no actions)
        # Done here and not before because Astar algo requires end state to be in actions_list
        del actions_list[self.end_state]

        return actions_list, rewards

##### Astar Algorithm for building random holes in environment

In [208]:
def astar_algo(start_coord, end_coord, grid_size, actions, n_cols=None):
    '''
    Find a shortest path between two cells with the A* algorithm.

    A cell is walkable only if it is a key of `actions`; the values of
    `actions` are not inspected. Each step to an adjacent cell costs 1.

    start_coord, end_coord : (row, col) coordinates
    grid_size              : number used to convert a coordinate into a state
                             number (state = row * grid_size + col)
    actions                : dict whose keys are the walkable (row, col) cells
    n_cols                 : optional number of columns; when given it is used
                             instead of grid_size for the state numbering, so
                             that non-square grids get unique state numbers

    Returns the path as a list of state numbers from start to end (inclusive),
    or None if the end cannot be reached.
    '''
    stride = grid_size if n_cols is None else n_cols

    # Heuristic function that calculates the Manhattan distance between two points
    def heuristic(s, e):
        return abs(e[0] - s[0]) + abs(e[1] - s[1])

    # Converts coordinate states to number states -> (0,0) = 0, (1,0) = stride
    def coord_to_state(coord):
        return coord[0] * stride + coord[1]

    # The search itself runs on coordinates, so it does not depend on the grid
    # being square; state numbers are only produced for the returned path
    start = (start_coord[0], start_coord[1])
    goal = (end_coord[0], end_coord[1])

    # Cheapest known cost from start to each discovered cell
    g_score = {start: 0}

    # Queue ordered by f-score (g-score + heuristic); ties fall back to
    # row-major order of the coordinates
    open_list = PriorityQueue()
    open_list.put((heuristic(start, goal), start))
    closed_list = set()

    # came_from dictionary to rebuild the shortest path later
    came_from = {}

    while not open_list.empty():
        # Get cell with the lowest f-score
        current = open_list.get()[1]

        # Check if reached end goal: if so, walk back to the start
        if current == goal:
            path = [coord_to_state(current)]
            while current != start:
                current = came_from[current]
                path.append(coord_to_state(current))
            return path[::-1]

        # A cell may sit in the queue several times; expand it only once
        if current in closed_list:
            continue
        closed_list.add(current)

        # A blocked cell has no outgoing moves
        if current not in actions:
            continue

        row, col = current
        # Neighbours in the order: above, below, left, right
        for neighbour in ((row - 1, col), (row + 1, col), (row, col - 1), (row, col + 1)):
            if neighbour not in actions or neighbour in closed_list:
                continue

            # Calculate tentative g-score of neighbour
            tentative_g_score = g_score[current] + 1
            if neighbour not in g_score or tentative_g_score < g_score[neighbour]:
                # Update score and came_from dictionary
                g_score[neighbour] = tentative_g_score
                came_from[neighbour] = current
                open_list.put((tentative_g_score + heuristic(neighbour, goal), neighbour))

    # If no path, return None
    return None

#### Creating Grid Environment Function


In [209]:
def standard_grid(rewards, actions, rows, cols, start_state):
    '''
    Build a Grid environment and attach its rewards and legal actions.

    rewards     : dict mapping a (row, col) state to the reward for arriving there
    actions     : dict mapping a (row, col) state to its tuple of legal actions
    rows, cols  : dimensions of the grid
    start_state : (row, col) the agent starts from

    As an example, the 4 x 4 environment used in this notebook looks like this
    (S = start position, H = hole, E = end state):

        S  .  .  .
        .  H  .  H
        .  .  .  H
        H  .  .  E
    '''
    grid = Grid(rows, cols, start_state)
    grid.set(rewards, actions)
    return grid

#### Creating Environment 1 - 4 x 4 Grid (With specific holes in environment)

In [210]:
# Environment 1: hand-written 4 x 4 grid with fixed holes
# Number of rows & cols of the grid
no_of_rows = 4
no_of_cols = 4

# Full action space: 'D' = down, 'U' = up, 'L' = left, 'R' = right
# NOTE: this module-level name is also read by the simulation classes when they build their Q tables
action_space = ('D', 'U', 'L', 'R')

# Assigned start state as (row, col)
start_state = (0, 0)

# Define rewards at specific states (punishment yields negative rewards)
# Rewards are keyed by (row, col); states not listed here give a reward of 0
rewards = {(1, 1): -1, # hole
           (1, 3): -1, # hole
           (2, 3): -1, # hole
           (3, 0): -1, # hole
           (3, 3): 1} # frisbee

# Define legal (possible) actions at each state
# Terminal states (holes / end goal) are deliberately left out (shown commented) because
# Grid.is_terminal() treats every state without an entry in this dictionary as terminal
actions = {
        (0, 0): ('D', 'R'), # Start_state
        (0, 1): ('D', 'R', 'L'), 
        (0, 2): ('D', 'R', 'L'),
        (0, 3): ('D', 'L'),
        (1, 0): ('D', 'R', 'U'),
        #(1, 1): ('D', 'R', 'L', 'U'), #Hole
        (1, 2): ('D', 'R', 'L', 'U'),
        #(1, 3): ('D', 'U', 'L'), #Hole
        (2, 0): ('D', 'U', 'R'),
        (2, 1): ('D', 'R', 'L', 'U'),
        (2, 2): ('D', 'R', 'L', 'U'),
        #(2, 3): ('D', 'U', 'L'), #Hole
        #(3, 0): ('U', 'R', ), #Hole
        (3, 1): ('U', 'R', 'L'),
        (3, 2): ('U', 'R', 'L'),
        #(3, 3): (), #End-State (frisbee)
}


# Create 4x4 Grid environment from the rewards and legal actions defined above
env1 = standard_grid(rewards, actions, no_of_rows, no_of_cols, start_state) 

# Put the agent on the start state before the environment is used
env1.reset()

##### --- Function testing ---

In [200]:
# Test if .is_terminal() function works
    # Terminal States: 1,1  1,3  2,3  3,0  3,3
# (2, 0) is a normal state and (3, 0) is a hole, so this should print False then True
print(env1.is_terminal((2, 0)))
print(env1.is_terminal((3, 0)))


# Test .move(): from the start state, 'D' should take the agent one row down
env1.reset()
state_before = env1.current_state()
# NOTE: .move() has no return statement, so `action` is None here; the call is made for its side effect
action = env1.move('D')
state_after = env1.current_state()
print('Original State: {}, After taking action: {}'.format(state_before, state_after))


# Test loop to stop moving when environment reaches terminal state
# Actions are drawn uniformly from the full action space; an action that is not legal
# at the current state leaves the agent where it is
while env1.is_terminal(env1.current_state()) == False:
    a = action_space[(random.randint(0, (len(action_space)-1)))]
    state_b = env1.current_state()
    env1.move(a)
    state_a = env1.current_state()
    
    print("State before: {}, State After taking action '{}': {}".format(state_b, a, state_a))

# The else branch runs once the loop condition becomes false (there is no break in the loop)
else:
    print('Reached terminal state {}'.format(env1.current_state()))

False
True
Original State: (0, 0), After taking action: (1, 0)
State before: (1, 0), State After taking action 'U': (0, 0)
State before: (0, 0), State After taking action 'L': (0, 0)
State before: (0, 0), State After taking action 'R': (0, 1)
State before: (0, 1), State After taking action 'L': (0, 0)
State before: (0, 0), State After taking action 'D': (1, 0)
State before: (1, 0), State After taking action 'R': (1, 1)
Reached terminal state (1, 1)


#### Extended Implementation: Creating Environment 2 - 10 x 10 Grid

<strong> Creating Env2, Legal Actions and Rewards List  </strong>

In [218]:
# Environment 2 parameters: grid dimensions, fraction of cells that become holes, start state
nrows = 10
ncols = 10
p_holes = 0.25
start_state_env_2 = (0,0)

# Creating the legal actions and rewards dictionaries to pass into standard_grid()
# Holes are placed at random, so the result differs from run to run
actions_env2, rewards_env2 = env_characteristics(nrows, ncols, p_holes).createActionsRewards()

In [217]:
# Create 10x10 Grid environment from the randomly generated actions and rewards
env2 = standard_grid(rewards_env2, actions_env2, nrows, ncols, start_state_env_2) 

# Put the agent on the start state before the environment is used
env2.reset()

##### --- Function testing ---

In [221]:
# Inspect the generated environment
# NOTE: only the last expression of the cell is echoed (the captured output shows the rewards only);
# the first line has no visible effect
env2.actions
env2.rewards

{(9, 9): 1,
 (7, 7): -1,
 (3, 3): -1,
 (7, 8): -1,
 (2, 1): -1,
 (1, 4): -1,
 (7, 2): -1,
 (6, 0): -1,
 (6, 3): -1,
 (5, 1): -1,
 (3, 1): -1,
 (8, 1): -1,
 (7, 0): -1,
 (5, 6): -1,
 (1, 6): -1,
 (2, 6): -1,
 (4, 3): -1,
 (7, 5): -1,
 (1, 5): -1,
 (1, 8): -1,
 (2, 9): -1,
 (6, 5): -1,
 (4, 4): -1,
 (5, 3): -1,
 (9, 7): -1,
 (6, 2): -1}

# Q table, Returns table and Policy

### Q table Function

*Q table is built as a dataframe for easier referencing: there were problems with referencing when building a multi nested dictionary*

In [222]:
def create_qtable(no_of_rows, no_of_cols, action_space):
    '''
    Create a Q table with every value initialised to 0.0.

    The table is a dataframe with one row per action in `action_space` and one
    column per state. A state (i, j) is labelled str(i) + str(j), e.g. (1, 2)
    becomes '12', and is addressed as Q.at[action, label].

    NOTE: the labels are only unique while both dimensions are at most 10
    (for larger grids e.g. (1, 11) and (11, 1) would both be '111').
    '''
    # Column labels in row-major order
    state_labels = [str(i) + str(j) for i in range(no_of_rows) for j in range(no_of_cols)]

    # Values are floats (not ints) so that the columns are created as float
    # columns: Q values are updated with floats later on
    Q = {label: {action: 0.0 for action in action_space} for label in state_labels}

    # Converts Q table into a dataframe
    return pd.DataFrame(data=Q)

### Returns table Function


*Returns table is built as a dataframe for easier referencing: there were problems with referencing when building a multi nested dictionary*

In [223]:
def create_returnstable(no_of_rows, no_of_cols, action_space):
    '''
    Create a Returns table in which every cell holds its own empty list.

    The table is a dataframe with one row per action in `action_space` and one
    column per state. A state (i, j) is labelled str(i) + str(j), e.g. (1, 2)
    becomes '12', and is addressed as returns.at[action, label].

    NOTE: the labels are only unique while both dimensions are at most 10.
    '''
    # Column labels in row-major order
    state_labels = [str(i) + str(j) for i in range(no_of_rows) for j in range(no_of_cols)]

    # A fresh list is created for every (state, action) cell so that no two
    # cells share the same list object
    returns = {label: {action: [] for action in action_space} for label in state_labels}

    # Converts Returns table into a dataframe
    return pd.DataFrame(data=returns)

### Epsilon Greedy Policy

In [224]:
# Select an action for the agent to take. Each action has a minimum probability of (epsilon / no. of actions) of being selected
# Optimal action has a higher probability of being selected
def epsilon_soft(Qtable, env, epsilon, currentstate):
    '''
    Select an action for the agent to take at `currentstate`.

    With probability `epsilon` a legal action is picked uniformly at random;
    otherwise the legal action with the highest Q value is picked, so the
    optimal action has a higher probability of being selected. When more than
    one legal action has the highest Q value, one of them is picked at random.

    Qtable       : dataframe addressed as Qtable.at[action, state_label]
    env          : environment whose .actions dict maps a state to its legal actions
    epsilon      : exploration probability
    currentstate : (row, col) state of the agent

    Raises ValueError if there is no legal action at `currentstate`.
    '''
    legal_actions = env.actions[currentstate]
    if not legal_actions:
        raise ValueError('No legal actions available at state {}'.format(currentstate))

    # Set a random probability to determine how the action is selected
    random_prob = random.random()

    # Explore: select a random action from the legal actions
    if random_prob <= epsilon:
        return legal_actions[random.randint(0, len(legal_actions) - 1)]

    # Exploit: find the max q value at the specific state among the legal actions
    state_f = str(currentstate[0]) + str(currentstate[1]) # Formatted state
    max_q_value = max(Qtable.at[legal_action, state_f] for legal_action in legal_actions)

    # Collect every legal action that reaches the max q value, in Q table row order
    best_actions = [item for item in Qtable.index.values
                    if item in legal_actions and Qtable.at[item, state_f] == max_q_value]

    return best_actions[random.randint(0, len(best_actions) - 1)]

_______________________________________________________________________________________________________________________________

# First-visit Monte Carlo Without Exploring Starts

###  Algorithm Class

In [225]:
class monte_carlo_sim:
    '''
    First-visit Monte Carlo control with an epsilon-soft policy.

    ** Please remember to reset Q and Returns table after simulation

        Functions:
        # Fetches Qtable ->                            .fetchQtable()    
        # Fetches Returns Table ->                     .fetchReturnstable() 
        # Run simulation ->                            .simulate(no_of_episodes, epsilon, gamma)
        # Resets Qtable and Returns Table ->           .resettables()      
    '''
    def __init__(self, env):
        self.env = env
        # Create Q and Returns table on creating class
        # (action_space is the module-level tuple of all actions)
        self.Qtable_monte = create_qtable(self.env.rows, self.env.cols, action_space)
        self.Returns = create_returnstable(self.env.rows, self.env.cols, action_space)

    # Returns Qtable
    def fetchQtable(self):
        return self.Qtable_monte
    
    # Returns Returns table
    def fetchReturnstable(self):
        return self.Returns
    
    # Reset Q and Returns table by creating empty tables (same way as in __init__)
    def resettables(self):
        self.Qtable_monte = create_qtable(self.env.rows, self.env.cols, action_space)
        self.Returns = create_returnstable(self.env.rows, self.env.cols, action_space)

    # Run monte carlo simulation
    #   no_of_episodes : number of episodes to play
    #   epsilon        : exploration probability of the epsilon-soft policy
    #   gamma          : rewards discount rate
    def simulate(self, no_of_episodes, epsilon, gamma):
        for _ in range(no_of_episodes):
            episode = []
            G = 0

            # Reset environment to start state for each episode
            self.env.reset()

            # Loop so agent moves through state space until it reaches a terminal state (hole / end goal)
            # Store episode path (states, actions and rewards) in episode list
            while not self.env.is_terminal(self.env.current_state()):
                state = self.env.current_state()
                action = epsilon_soft(self.Qtable_monte, self.env, epsilon, state)

                # Move agent based on selected action
                self.env.move(action)
                # Retrieve reward
                reward = self.env.get_rewards()

                # Append all results to episode list so that backpropagation of rewards can be done later
                episode.append((state, action, reward))

            # Time step at which every state is visited for the first time in this episode
            # NOTE: first visits are determined per state, not per (state, action) pair
            first_visit = {}
            for step, (state, _, _) in enumerate(episode):
                first_visit.setdefault(state, step)

            # (action, formatted state) cells of the Returns table changed by this episode
            touched = []

            # Loop through episode in reverse (T-1 -> T-2 -> ... -> 0), accumulating the discounted return
            for step in range(len(episode) - 1, -1, -1):
                state, act_taken, r = episode[step]

                G = gamma*G + r

                # Only the first visit to a state contributes a return
                if first_visit[state] != step:
                    continue

                state_f = str(state[0]) + str(state[1]) # state but formatted for referencing in dataframe
                self.Returns.at[act_taken, state_f] = self.Returns.at[act_taken, state_f] + [G]
                touched.append((act_taken, state_f))

            # Update Q table with average returns; only cells whose returns changed
            # in this episode can have a new average
            for act_taken, state_f in touched:
                self.Qtable_monte.loc[act_taken, state_f] = st.mean(self.Returns.at[act_taken, state_f])


###  Running Simulation

#####  Defining Parameters

In [228]:
# Parameters passed to monte_carlo_sim.simulate()
epsilon_monte = 0.1 # Epsilon greedy probability 
gamma_monte = 0.9 # Rewards discount rate gamma
no_of_episodes_monte = 100 # Number of episodes to be executed in simulation

#####  Creating instance of class & running simulation (Environment 1)

In [229]:
# Create instance of class with environment 1 (4x4 grid); fresh Q and Returns tables are built here
monte1 = monte_carlo_sim(env1)

# Run Monte Carlo simulation with the parameters defined above
monte1.simulate(no_of_episodes_monte, epsilon_monte, gamma_monte)

#####  Qtable Results for *Monte Carlo Simulation* (Environment 1)

In [230]:
# Show the Q table of monte1 (rows = actions, columns = formatted states)
monte1.fetchQtable()

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,-0.276433,-1.0,-0.384424,-1.0,-0.07695,0,-0.2487,0,0.0,0.0,0.357,0,0,0,0.0,0
U,0.0,0.0,0.0,0.0,-0.092454,0,-0.099743,0,0.0,-1.0,0.0,0,0,0,-0.81,0
L,0.0,-0.176745,-0.154601,-0.18187,0.0,0,-1.0,0,0.0,0.0,0.0,0,0,0,0.0,0
R,-0.173963,-0.15314,-0.305449,0.0,-1.0,0,-1.0,0,-0.0855,0.81,-1.0,0,0,0,1.0,0


#####  Extended Implementation: Creating instance of class & running simulation (Environment 2)

In [231]:
# Create instance of class with environment 2 (10x10 grid); fresh Q and Returns tables are built here
monte2 = monte_carlo_sim(env2)

# Run Monte Carlo simulation with the same parameters as for environment 1
monte2.simulate(no_of_episodes_monte, epsilon_monte, gamma_monte)


#####  Qtable Results for *Monte Carlo Simulation* (Environment 2)

In [232]:
# Show the Q table of monte2 (rows = actions, columns = formatted states)
monte2.fetchQtable()

Unnamed: 0,00,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,96,97,98,99
D,-0.020816,-0.225,-0.177147,-0.3,-1.0,-1.0,-1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
L,0.0,-0.3645,-0.007286,-0.006183,-4.9e-05,-0.59049,-0.81,0,0,0,...,0,0,0,0,0,0,0,0,0,0
R,-0.09721,-0.016211,-0.215234,-0.478297,-0.531441,-0.9,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


_______________________________________________________________________________________________________________________________

# SARSA with ϵ-Greedy Behavior Policy

###  Algorithm Class

In [295]:
class SARSA_sim:
    '''
    SARSA (on-policy TD control) with an epsilon-greedy behaviour policy.

    ** Please remember to reset Q table after simulation

        Functions:
        # Fetches Qtable ->                            .fetchQtable()    
        # Run simulation ->                            .simulate(no_of_episodes, epsilon, gamma, alpha)
        # Resets Qtable  ->                            .resettable()      
    '''
    def __init__(self, env):
        self.env = env
        # Create Q table upon creating class
        # (action_space is the module-level tuple of all actions)
        self.Qtable_sarsa = create_qtable(self.env.rows, self.env.cols, action_space)
        
    # Returns Qtable    
    def fetchQtable(self):
        return self.Qtable_sarsa
    
    # Resets Qtable by creating an empty table (same way as in __init__)
    def resettable(self):
        self.Qtable_sarsa = create_qtable(self.env.rows, self.env.cols, action_space)

    # Sub step simulation: starting from the agent's current (non-terminal) state,
    # follow the policy until a terminal state is reached, updating the Q table after every step
    def _run_lookahead(self, epsilon, gamma, alpha, verbose):
        state = self.env.current_state()
        action = epsilon_soft(self.Qtable_sarsa, self.env, epsilon, state)

        while True:
            state_formatted = str(state[0]) + str(state[1])
            if verbose:
                print('Sub step state: {}, Sub step action: {}'.format(state, action))

            # Taking action - to observe next state and reward
            self.env.move(action)
            reward = self.env.get_rewards()
            new_state = self.env.current_state()

            # If new state is a terminal state, update Q(s1, a1) with Q(s2, a2) = 0 and stop
            if self.env.is_terminal(new_state):
                self.Qtable_sarsa.loc[action, state_formatted] += (
                    alpha * (reward - self.Qtable_sarsa.at[action, state_formatted]))
                break

            # Otherwise choose the action for the new state and update Q(s1, a1) in direction of Q(s2, a2)
            new_action = epsilon_soft(self.Qtable_sarsa, self.env, epsilon, new_state)
            new_state_formatted = str(new_state[0]) + str(new_state[1])
            self.Qtable_sarsa.loc[action, state_formatted] += (
                alpha * (reward
                         + gamma * self.Qtable_sarsa.at[new_action, new_state_formatted]
                         - self.Qtable_sarsa.at[action, state_formatted]))

            # On-policy: the action used in the update is the action that is taken next
            state, action = new_state, new_action

    # Run SARSA simulation
            # There will be a sub step simulation within the overarching simulation - for looking ahead and updating Qtable
    #   no_of_episodes : number of episodes of the main simulation
    #   epsilon        : exploration probability of the epsilon-greedy policy
    #   gamma          : rewards discount rate
    #   alpha          : learning rate
    #   verbose        : print progress of every episode, main step and sub step (default True)
    def simulate(self, no_of_episodes, epsilon, gamma, alpha, verbose=True):
        for episode in range(no_of_episodes):
            if verbose:
                print('Episode Number: {} / {}'.format(episode + 1, no_of_episodes))
            # reset environment to start state
            self.env.reset()

            # Loop for main simulation
            while not self.env.is_terminal(self.env.current_state()):
                # Assign state so that it can be referenced again later after sub step simulation is conducted
                main_step_state = self.env.current_state()
                if verbose:
                    print('Main Step: {}'.format(main_step_state))

                # Sub simulation - looking ahead until agent reaches terminal state
                self._run_lookahead(epsilon, gamma, alpha, verbose)

                # Since we now want to revert back to the main simulation, reassign agent back to the main state
                self.env.set_state(main_step_state)
                # Choose action based on policy with new Q values updated by sub simulation
                main_step_action = epsilon_soft(self.Qtable_sarsa, self.env, epsilon, main_step_state)

                # Move agent
                self.env.move(main_step_action)

###  Running Simulation

#####  Defining Parameters

In [293]:
# Parameters passed to SARSA_sim.simulate()
epsilon_sarsa = 0.1 # Epsilon greedy probability 
gamma_sarsa = 0.9 # Rewards discount rate gamma
alpha_sarsa = 0.3 # Learning rate of agent
no_of_episodes_sarsa = 10 # Number of episodes to be executed in simulation

#####  Creating instance of class & running simulation (Environment 1)

In [290]:
# Create instance of class with environment 1 (4x4 grid); a fresh Q table is built here
sarsa1 = SARSA_sim(env1)

# Run SARSA simulation with the parameters defined above
sarsa1.simulate(no_of_episodes_sarsa, epsilon_sarsa, gamma_sarsa, alpha_sarsa)

Episode Number: 1 / 100
Episode Number: 2 / 100
Episode Number: 3 / 100
Episode Number: 4 / 100
Episode Number: 5 / 100
Episode Number: 6 / 100
Episode Number: 7 / 100
Episode Number: 8 / 100
Episode Number: 9 / 100
Episode Number: 10 / 100
Episode Number: 11 / 100
Episode Number: 12 / 100
Episode Number: 13 / 100
Episode Number: 14 / 100
Episode Number: 15 / 100
Episode Number: 16 / 100
Episode Number: 17 / 100
Episode Number: 18 / 100
Episode Number: 19 / 100
Episode Number: 20 / 100
Episode Number: 21 / 100
Episode Number: 22 / 100
Episode Number: 23 / 100
Episode Number: 24 / 100
Episode Number: 25 / 100
Episode Number: 26 / 100
Episode Number: 27 / 100
Episode Number: 28 / 100
Episode Number: 29 / 100
Episode Number: 30 / 100
Episode Number: 31 / 100
Episode Number: 32 / 100
Episode Number: 33 / 100
Episode Number: 34 / 100
Episode Number: 35 / 100
Episode Number: 36 / 100
Episode Number: 37 / 100
Episode Number: 38 / 100
Episode Number: 39 / 100
Episode Number: 40 / 100
Episode N

#####  Qtable Results for *SARSA with ϵ-Greedy Behavior Policy* (Environment 1)

In [291]:
# Show the Q table of sarsa1 (rows = actions, columns = formatted states)
sarsa1.fetchQtable()

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0.270602,-0.999968,-0.093268,-0.980227,0.586902,0,0.754425,0,-0.971752,0.316985,0.862514,0,0,0.0,0.0,0
U,0.0,0.0,0.0,0.0,0.225896,0,0.19186,0,0.094294,-0.942352,0.57127,0,0,0.558145,0.785868,0
L,0.0,0.194399,0.09711,0.09613,0.0,0,-0.882351,0,0.0,0.460562,0.605343,0,0,-0.882351,0.801701,0
R,0.085895,0.035182,-0.06425,0.0,-0.980227,0,-0.942352,0,0.674043,0.746323,-0.997674,0,0,0.898652,1.0,0


#####  Creating instance of class & running simulation (Environment 2)

In [296]:
# Create instance of class with environment 2 (10x10 grid); a fresh Q table is built here
sarsa2 = SARSA_sim(env2)

# Run SARSA simulation with the same parameters as for environment 1
sarsa2.simulate(no_of_episodes_sarsa, epsilon_sarsa, gamma_sarsa, alpha_sarsa)

Episode Number: 1 / 10
Main Step: (0, 0)
Sub step state: (0, 0), Sub step action: R
Sub step state: (0, 1), Sub step action: L
Sub step state: (0, 0), Sub step action: D
Sub step state: (1, 0), Sub step action: U
Sub step state: (0, 0), Sub step action: R
Sub step state: (0, 1), Sub step action: L
Sub step state: (0, 0), Sub step action: R
Sub step state: (0, 1), Sub step action: R
Sub step state: (0, 2), Sub step action: R
Sub step state: (0, 3), Sub step action: D
Sub step state: (1, 3), Sub step action: D
Sub step state: (2, 3), Sub step action: U
Sub step state: (1, 3), Sub step action: U
Sub step state: (0, 3), Sub step action: R
Sub step state: (0, 4), Sub step action: D
Main Step: (1, 0)
Sub step state: (1, 0), Sub step action: D
Sub step state: (2, 0), Sub step action: U
Sub step state: (1, 0), Sub step action: R
Sub step state: (1, 1), Sub step action: L
Sub step state: (1, 0), Sub step action: D
Sub step state: (2, 0), Sub step action: D
Sub step state: (3, 0), Sub step actio

KeyboardInterrupt: 

#####  Qtable Results for *SARSA with ϵ-Greedy Behavior Policy* (Environment 2)

In [252]:
# Show the Q table of sarsa2 (rows = actions, columns = formatted states)
sarsa2.fetchQtable()

Unnamed: 0,00,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,96,97,98,99
D,-0.00071,-0.000577,-0.149372,-0.062118,-0.942352,-0.83193,-0.657,-6e-06,-0.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
U,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.3,-0.001594,-0.010716,0.0,0.0,0.118558,0,0,0
L,0.0,-0.000326,-0.000576,-0.023526,-0.013481,-0.249625,-0.000839,-0.124174,0.0,-0.081,...,0.0,0.0,-0.081,0.0,0.0,0.0,0.0,0,0,0
R,-0.000331,-0.001002,-0.030297,-0.051274,-0.041125,-0.238235,-7e-06,-0.0567,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001559,0.02445,-0.51,0,0,0


_______________________________________________________________________________________________________________________________

# Qlearning with an ϵ-greedy behavior policy

###  Algorithm Class

In [25]:
class qlearning_sim:
    '''
    Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q table is indexed by action (rows) and by a state label (columns); the
    state label is the row and column index of the state concatenated as text,
    e.g. state (1, 2) -> "12".

    ** Please remember to reset the Q table after a simulation

        Functions:
        # Fetches Qtable ->                            .fetchQtable()
        # Run simulation ->                            .simulate(no_of_episodes, epsilon, gamma, alpha)
        # Resets Qtable  ->                            .resettable()
    '''
    def __init__(self, env):
        # Environment the agent acts in; it must expose rows, cols, actions,
        # reset(), move(), get_rewards(), current_state(), set_state() and is_terminal()
        self.env = env
        # Create Q table upon creating class
        self.Qtable_qlearning = self._new_qtable()

    # Builds a fresh Q table sized for the environment (shared by __init__ and resettable)
    def _new_qtable(self):
        return create_qtable(self.env.rows, self.env.cols, action_space)

    # Returns Qtable
    def fetchQtable(self):
        return self.Qtable_qlearning

    # Resets Qtable by creating an empty table
    def resettable(self):
        # Built the same way as in __init__; the previous call to createQtable()
        # referred to a name that is not used anywhere else in this class
        self.Qtable_qlearning = self._new_qtable()

    # Run Qlearning simulation
            # There will be a sub step simulation within the overarching simulation - for looking ahead and updating Qtable
    def simulate(self, no_of_episodes, epsilon, gamma, alpha):
        '''
        Run Q-learning for a number of episodes, updating the Q table in place.

            no_of_episodes -> number of episodes to run (each starts from env.reset())
            epsilon        -> exploration setting passed on to epsilon_soft
            gamma          -> discount rate applied to the best next-state Q value
            alpha          -> learning rate (step size of every Q update)

        For every main step the agent first plays a look-ahead rollout from its
        current state until a terminal state is reached, updating Q after each
        rollout move. The agent is then put back on the main state and takes one
        real move chosen from the updated Q table. The main move itself does not
        update the Q table. Returns nothing.
        '''
        for _ in range(no_of_episodes):
            # reset environment to start state
            self.env.reset()

            # Loop for main simulation
            while not self.env.is_terminal(self.env.current_state()):
                # Assign state so that it can be referenced again later after sub step simulation is conducted
                main_step_state = self.env.current_state()

                # Loop for sub simulation - Looking ahead until agent reaches terminal state
                while not self.env.is_terminal(self.env.current_state()):
                    sub_step_state = self.env.current_state()
                    state_formatted = str(sub_step_state[0]) + str(sub_step_state[1])
                    # Choose action in sub step simulation
                    sub_step_action = epsilon_soft(self.Qtable_qlearning, self.env, epsilon, sub_step_state)

                    # Taking action - to observe next state and reward
                    self.env.move(sub_step_action)

                    # Retrieve reward for taking specific action
                    reward = self.env.get_rewards()
                    # Retrieve new state
                    new_sub_step_state = self.env.current_state()

                    # Q-learning bootstraps from the best Q value available at the new state,
                    # so no next action has to be sampled here. A terminal state has no
                    # actions and therefore contributes a value of 0.
                    reached_terminal = new_sub_step_state not in self.env.actions
                    if reached_terminal:
                        best_next_value = 0
                    else:
                        new_state_formatted = str(new_sub_step_state[0]) + str(new_sub_step_state[1])
                        best_next_value = self.Qtable_qlearning[new_state_formatted].max()

                    # Update Q(s1, a1) in direction of reward + gamma * max Q(s2, .)
                    self.Qtable_qlearning.loc[sub_step_action, state_formatted] += (alpha * (reward + (gamma * best_next_value) - self.Qtable_qlearning.at[sub_step_action, state_formatted]))

                    if reached_terminal:
                        break # break out of sub simulation since it has reached terminal state

                # Since we now want to revert back to the main simulation, we will need to reassign agent back to the main state
                self.env.set_state(main_step_state)
                # Choose action based on policy with new Q values updated by sub simulation
                main_step_action = epsilon_soft(self.Qtable_qlearning, self.env, epsilon, main_step_state)
                # Move agent
                self.env.move(main_step_action)

###  Running Simulation

#####  Defining Parameters

In [26]:
# Hyperparameters for the Q-learning run
no_of_episodes_qlearning = 100  # how many episodes the simulation executes
alpha_qlearning = 0.3  # learning rate of the agent
gamma_qlearning = 0.9  # discount rate (gamma) applied to rewards
epsilon_qlearning = 0.1  # epsilon-greedy probability

#####  Creating instance of class & running simulation

In [27]:
# Build the Q-learning agent on environment 1 (4x4 grid)
qlearning = qlearning_sim(env1)

# Train it, naming each hyperparameter explicitly
qlearning.simulate(
    no_of_episodes=no_of_episodes_qlearning,
    epsilon=epsilon_qlearning,
    gamma=gamma_qlearning,
    alpha=alpha_qlearning,
)

#####  Qtable Results for *Qlearning with ϵ-Greedy Behavior Policy*

In [28]:
# Show the Q-table held by the qlearning instance (rows are actions, columns are
# state labels); as the last expression of the cell it is rendered by the notebook.
qlearning.fetchQtable()

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0.59049,-0.657,0.0,-0.51,0.6561,0,0.578368,0,-0.917646,0.81,0.9,0,0,0.0,0.0,0
U,0.0,0.0,0.0,0.0,0.499868,0,0.0,0,0.566039,-0.997674,0.129387,0,0,0.728796,0.809358,0
L,0.0,0.44214,0.0,0.0,0.0,0,-0.3,0,0.0,0.648645,0.0,0,0,-0.999202,0.809947,0
R,0.103357,0.0,0.0,0.0,-0.959646,0,-0.3,0,0.729,0.655807,-0.3,0,0,0.9,1.0,0


_______________________________________________________________________________________________________________________________