# Reinforcement Learning with:
- Monte Carlo Simulation Algorithm
- SARSA Algorithm
- Q Learning Algorithm

# Environment and Mission 

*The goal is for the agent to learn how to navigate the state space to reach the end goal of retrieving the frisbee*
<br></br>

<U>**Within Action Space, the following actions are defined:**</U>

**'L':** Move left

**'D':** Move down

**'R':** Move right

**'U':** Move up

*If agent attempts to leave the grid, when at the edges, program would set the new state as the old state. Basically it will not move
<br></br>

<U>**Map**:</U>
    
    S  .  .  .
    
    .  H  .  H
    
    .  .  .  H
    
    H  .  .  E
<br></br>
<U>**Rewards**:</U>

Reach goal: +1

Reach hole: -1

Traversing frozen surface: 0 


---

# Building Environment


### Importing relevant packages

In [85]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
import statistics as st
import pandas as pd

### Creating Grid Environment

#### Creating Grid Class

In [86]:
class Grid:

    # Takes in variables of rows, cols and start state of agent
    def __init__(self, rows, cols, start):
        self.rows = rows
        self.cols = cols
        self.i = start[0]
        self.j = start[1]
    
    # Fucntion that allows user to set rewards and actions allowed at given states
    def set(self, rewards, actions):
        self.rewards = rewards
        self.actions = actions

    # Function that allows user to set state of agent
    def set_state(self,s):
        self.i = s[0]
        self.j = s[1]
    
    # Function that fetches current state of agent
    def current_state(self):
        return(self.i, self.j)
    
    # Function that checks if agent is in a terminal state (if current state of agent is in a terminal state: hole / goal state, then function returns True)
    def is_terminal(self, s):
        return s not in self.actions
    
    # Function that fetches the possible actions the agent can take at a given state s
    def possible_actions(self, s):
        return self.actions[s]
    
    # Moves the agent in the state space based on the action taken by the agent
    def move(self, action):
        if action in self.actions[(self.i, self.j)]:
            if action == 'U':
                self.i -= 1
            elif action == 'D':
                self.i += 1
            elif action == 'R':
                self.j += 1
            elif action == 'L':
                self.j -= 1
    
    # Gets reward of the current state of agent
    def get_rewards(self):
        reward = self.rewards.get((self.i, self.j), 0)
        return reward
    
    # Undo move of agent (Function isn't used but put in place if needed)
    def undo_move(self, action):
        if action == 'U':
            self.i += 1
        elif action == 'D':
            self.i -= 1
        elif action == 'R':
            self.j -= 1
        elif action == 'L':
            self.j += 1
        assert(self.current_state() in self.all_states())
    
            
    # To reset agent to be at starting state - (0, 0) in our specific example
    def reset(self):
        self.set_state((0,0))

#### Creating Grid Environment Function


In [87]:
def standard_grid(rewards, actions, rows, cols, start_state):
    # define a grid that describes the reward for arriving at each state
    # and possible actions at each state
    # the grid looks like this
    # S means start position
    # E means the end states

        # S  .  .  .
        # .  H  .  H
        # .  .  .  H
        # H  .  .  E

    g = Grid(rows, cols, start_state) #(rows, cols, start_state)
    g.set(rewards, actions)
    return g

#### Creating Environment

In [88]:
# Environment Characteristics
# no. of rows & cols of grid
no_of_rows = 4
no_of_cols = 4

# Full action space
action_space = ('D', 'U', 'L', 'R')

# Assigned start state
start_state = (0, 0)

# Define rewards at specific states (punishment yields negative rewards)
# rewards at given states (in dictionary form)
rewards = {(1, 1): -1, # hole
           (1, 3): -1, # hole
           (2, 3): -1, # hole
           (3, 0): -1, # hole
           (3, 3): 1} # frisbee

# Define legal (possible) actions at each state
# States that depict terminal state (hole / end goal) are commented because this will tie in with the .is_terminal() function under class Grid
actions = {
        (0, 0): ('D', 'R'), # Start_state
        (0, 1): ('D', 'R', 'L'), 
        (0, 2): ('D', 'R', 'L'),
        (0, 3): ('D', 'L'),
        (1, 0): ('D', 'R', 'U'),
        #(1, 1): ('D', 'R', 'L', 'U'), #Hole
        (1, 2): ('D', 'R', 'L', 'U'),
        #(1, 3): ('D', 'U', 'L'), #Hole
        (2, 0): ('D', 'U', 'R'),
        (2, 1): ('D', 'R', 'L', 'U'),
        (2, 2): ('D', 'R', 'L', 'U'),
        #(2, 3): ('D', 'U', 'L'), #Hole
        #(3, 0): ('U', 'R', ), #Hole
        (3, 1): ('U', 'R', 'L'),
        (3, 2): ('U', 'R', 'L'),
        #(3, 3): (), #End-State (frisbee)
}


# Create Grid environment
env = standard_grid(rewards, actions, no_of_rows, no_of_cols, start_state) 
# Set rewards and actions of environment
env.set(rewards, actions)
# Reset environment to start state defined as (0,0) in .reset() function
env.reset()

##### --- Function testing ---

# Q table, Returns table and Policy

### Q table Function

*Q table is built as a dataframe for easier referencing: there were problems with referencing when building a multi nested dictionary*

In [89]:
def create_qtable(no_of_rows, no_of_cols, action_space):
    # Creates Q table as a nested dictionary
    Q = {}
    for i in range(no_of_rows):
        for j in range(no_of_cols):
            Q[(str(i) + str(j))] = 0
    
    action_space_dic = {}
    for item in action_space:
        action_space_dic[item] = 0
        

    for k, v in Q.items():
        Q[k] = action_space_dic
    
    # Converts Q table into a dataframe
    Q = pd.DataFrame(data = Q)
        
    return Q

### Returns table Function


*Returns table is built as a dataframe for easier referencing: there were problems with referencing when building a multi nested dictionary*

In [90]:
def create_returnstable(no_of_rows, no_of_cols, action_space):
    # Creates Returns table as a nested dictionary
    returns = {}
    for i in range(no_of_rows):
        for j in range(no_of_cols):
            returns[(str(i) + str(j))] = 0
    
    action_space_dic = {}
    for item in action_space:
        action_space_dic[item] = []
        

    for k, v in returns.items():
        returns[k] = action_space_dic
    
    # Converts Returns table into a dataframe
    returns = pd.DataFrame(data = returns)
        
    return returns

### Epsilon Greedy Policy

In [116]:
# Select an action for the agent to take. Each action has a minimum probability of (epsilon / no. of actions) of being selected
# Optimal action has a higher probability of being selected
def epsilon_soft(Qtable, env, action_space, epsilon, currentstate):

    prob = epsilon # sum of minimum prob of selecting all actions in action space
    prob_g = 1 - epsilon # prob of selecting greedy action
    
    # Set a random probability to determine which actions are being selected
    random_prob = random.random()
    
    # Finding max q value at the specific state
    q_values = Qtable[str(state[0]) + str(state[1])]
    max_q_value = q_values.max()
 
    best_actions = q_values[q_values == max_q_value].index.tolist()
    
    # When random_prob =< sum of min prob of all actions, randomly select action
    if random_prob <= prob:
        
        # Loop to select legal action
        while True:
            valid_actions = env.actions[state]
            action = valid_actions[(random.randint(0, (len(valid_actions)-1)))]
            return action
            break
            
    # If random_prob > prob, then select action with highest q value
        # Other scenarios that this covers:
            # 1. When more than 1 action has the same q value - select the action randomly
            # 2. When there is only 1 max q value, this chunk of code will still select the optimal action
    else:
        while True:
            action = best_actions[random.randint(0, len(best_actions)-1)]
            if action in env.actions[state]:
                return action
            break

In [108]:
env.actions[0,0][1]



'R'

In [93]:
data = {
    ('00'): {'U': 200, 'L': 200, 'R': 9, 'D': 3200},
    ('13'): {'U': 15, 'L': 80, 'R': 9, 'D': 10},
    ('24'): {'U': 0, 'L': 8, 'R': 50, 'D': 10},
    ('35'): {'U': 0, 'L': 8, 'R': 9, 'D': 10}
    }

df = pd.DataFrame(data =  data)

df['00'].idxmax(axis = 0)

# for item in df['00']:
#     print(item)

# df['00'].iat[[]]
# x = df[df['00'] == 9].index.values
# x
# df.index('100')

x = df[df.isin([200])].stack()
x[x == 200].index.values[0][0]
# for items in df[df['00'] == 200].index.values:
#     print(items)

for items in df[df['00'] == 200].index.values:
    print(items)

U
L


In [94]:
# # epsilon_soft(df, env, action_space, 0.2)
# for items in df[df['00'] == 200].index:
#     print(items)

lst = [0,3,4,5,6,4,8,1,0,3]
indexes = []

for i in range(len(lst)):
    lst_copy = lst.copy()
    lst_copy.remove(lst[i])
    
    if lst[i] in lst_copy:
        indexes.append(i)
    
    else:
        continue

# df['00'].iloc[3]

In [95]:
# Test if .is_terminal() function works
    # Terminal States: 1,1  1,3  2,3  3,0  3,3
print(env.is_terminal((2, 0)))
print(env.is_terminal((3, 0)))


# Test .move()
env.reset()
state_before = env.current_state()
action = env.move('D')
state_after = env.current_state()
print('Original State: {}, After taking action: {}'.format(state_before, state_after))


# Test loop to stop moving when environment reaches terminal state
while env.is_terminal(env.current_state()) == False:
    a = action_space[(random.randint(0, (len(action_space)-1)))]
    state_b = env.current_state()
    env.move(a)
    state_a = env.current_state()
    
    print('State before: {}, State After taking aciton {}: {}'.format(state_b, state_a, a))

else:
    print('Reached terminal state {}'.format(env.current_state()))

False
True
Original State: (0, 0), After taking action: (1, 0)
State before: (1, 0), State After taking aciton (1, 0): L
State before: (1, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (0, 1): R
State before: (0, 1), State After taking aciton (0, 0): L
State before: (0, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (1, 0): D
State before: (1, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (0, 0): U
State before: (0, 0), State After taking aciton (1, 0): D
State before: (1, 0), State After taking aciton (2, 0): D
State before: (2, 0), State After taking aciton (3, 0): D
Reached terminal state (3, 0)


_______________________________________________________________________________________________________________________________

_______________________________________________________________________________________________________________________________

## Create Q table & Returns table

In [96]:
Qtable = create_qtable(env.rows, env.cols, action_space)

Returns = create_returnstable(env.rows, env.cols, action_space)

Qtable

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
U,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
L,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


_______________________________________________________________________________________________________________________________

# First-visit Monte Carlo without exploring starts

**Defining parameters**

In [103]:
epsilon_monte = 0.2 # soft policy epsilon probability
gamma_monte = 0.9 # discount rate

no_of_episodes_monte = 5 # total number of episodes

In [115]:
Qtable_monte = create_qtable(env.rows, env.cols, action_space)
Returns = create_returnstable(env.rows, env.cols, action_space)

for i in range(no_of_episodes_monte):
    episode = []
    G = 0
    
    # reset environment to start state
    env.reset()  

    while env.is_terminal(env.current_state()) == False:
        state = env.current_state()
        action = epsilon_soft(Qtable_monte, env, action_space, epsilon_monte, state)
        env.move(action)
        rewards = env.get_rewards()
        episode.append((state, action, rewards))
    
    episode_reversed = episode[::-1]
    temp_lst = [item[0] for item in episode_reversed]

    for i in range(len(episode_reversed)):
        state = episode_reversed[i][0]
        state_formatted = str(state[0]) + str(state[1])
        act_taken = episode_reversed[i][1]
        reward = episode_reversed[i][2]

        G = gamma_monte*G + reward

        if state not in temp_lst[i+1:]:
            print(state_formatted, act_taken, reward)
            # print(temp_lst)
            # print(G)
            Returns.at[act_taken, state_formatted] = Returns.at[act_taken, state_formatted] + [G]
        else:
            continue


    for state in Qtable_monte.columns.values:
        for action in Qtable_monte.index.values:
            try:
                Qtable_monte.at[action, state] = st.mean(Returns.at[action, state])
            except ValueError:
                continue

12 L -1
02 D 0
01 None 0


KeyError: None

In [None]:
Qtable_monte

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0.165847,-1.0,-0.051373,-1.0,0.310027,0,0.657892,0,-1.0,0.585461,0.852948,0,0,0.0,0.0,0
U,-0.348678,0.0,0.0,0.0,-0.012247,0,-0.510647,0,0.126566,-1.0,0.339644,0,0,0.560854,0.716894,0
L,-0.729,-0.207638,-0.434034,-0.356096,0.0,0,-1.0,0,-0.729,0.290343,0.554863,0,0,-1.0,0.52937,0
R,-0.452222,-0.332543,-0.647991,0.0,-1.0,0,-1.0,0,0.4791,0.640501,-1.0,0,0,0.871172,1.0,0


_______________________________________________________________________________________________________________________________

# SARSA with an ϵ-greedy behavior policy

In [None]:
Qtable_sarsa = create_qtable(env.rows, env.cols, action_space)
Qtable_sarsa

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
U,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
L,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Defining parameters**

In [None]:
epsilon_sarsa = 0.2 # soft policy epsilon probability
gamma_sarsa = 0.9 # discount rate
alpha_sarsa = 0.2
no_of_episodes_sarsa = 100# total number of episodes

In [None]:
# SARSA Algorithm

# Create new Qtable for SARSA
Qtable_sarsa = Qtable_sarsa = create_qtable(env.rows, env.cols, action_space)

for i in range(no_of_episodes_sarsa):    
    # reset environment to start state
    env.reset()  


    # Choose action at state
        # Look forward one step and update Q table
        # Choose action based on new Qtable
        # Look forward one step based on new state
        # Choose new action based on new state
        # Update Q table
        ## End when hit terminal state
    
    # Rechoose action based on new Q table
        # Repeat

    # Initiialise main step state
    main_step_state = env.current_state()

    while env.is_terminal(env.current_state()) == False:
        main_step_state = env.current_state()


        while env.is_terminal(env.current_state()) == False:
            # Choose action
            sub_step_state = env.current_state()
            state_formatted = str(sub_step_state[0]) + str(sub_step_state[1])
            sub_step_action = epsilon_soft(Qtable_sarsa, env, action_space, epsilon_sarsa, sub_step_state)
            
            # Move to check next state choices
            env.move(sub_step_action)

            # Retrieve reward for taking specific action
            reward = env.get_rewards()
            # Retrieve new state and next action choice
            new_sub_step_state = env.current_state()
            new_state_formatted = str(new_sub_step_state[0]) + str(new_sub_step_state[1])

            print("Sub State: {}\nSub Action: {}\n\n".format(sub_step_state, sub_step_action))
            if new_sub_step_state in env.actions:
                new_sub_step_action = epsilon_soft(Qtable_sarsa, env, action_space, epsilon_sarsa, new_sub_step_state)
                
                # Update Q table with percentage of rewards for taking specific action at specific state to arrive at new state
                    # q_value = Qtable_sarsa.at[sub_step_action, state_formatted]
                    # q_prime_value = Qtable_sarsa.at[new_sub_step_action, new_state_formatted] 
                    # q_value += (alpha_sarsa * (reward + (gamma_sarsa * q_prime_value) - q_value))

                Qtable_sarsa.at[sub_step_action, state_formatted] += (alpha_sarsa * (reward + (gamma_sarsa * Qtable_sarsa.at[new_sub_step_action, new_state_formatted]) - Qtable_sarsa.at[sub_step_action, state_formatted]))

            else: 
                Qtable_sarsa.at[sub_step_action, state_formatted] += (alpha_sarsa * (reward + (gamma_sarsa * Qtable_sarsa.at[new_sub_step_action, new_state_formatted]) - Qtable_sarsa.at[sub_step_action, state_formatted]))

                continue

        env.set_state(main_step_state)
        main_step_action = epsilon_soft(Qtable_sarsa, env, action_space, epsilon_sarsa, main_step_state)
        env.move(main_step_action)    
        print("Main state: {}\nMain Action: {}".format(main_step_state, main_step_action))   

        

Sub State: (0, 0)
Sub Action: R


Sub State: (0, 1)
Sub Action: R


Sub State: (0, 2)
Sub Action: L


Sub State: (0, 1)
Sub Action: D


Main state: (0, 0)
Main Action: D
Sub State: (1, 0)
Sub Action: D


Sub State: (2, 0)
Sub Action: D


Main state: (1, 0)
Main Action: D
Sub State: (2, 0)
Sub Action: U


Sub State: (1, 0)
Sub Action: R


Main state: (2, 0)
Main Action: U
Sub State: (1, 0)
Sub Action: U


Sub State: (0, 0)
Sub Action: R


Sub State: (0, 1)
Sub Action: D


Main state: (1, 0)
Main Action: D
Sub State: (2, 0)
Sub Action: R


Sub State: (2, 1)
Sub Action: D


Sub State: (3, 1)
Sub Action: U


Sub State: (2, 1)
Sub Action: R


Sub State: (2, 2)
Sub Action: D


Sub State: (3, 2)
Sub Action: R


Main state: (2, 0)
Main Action: R
Sub State: (2, 1)
Sub Action: U


Main state: (2, 1)
Main Action: D
Sub State: (3, 1)
Sub Action: U


Sub State: (2, 1)
Sub Action: L


Sub State: (2, 0)
Sub Action: U


Sub State: (1, 0)
Sub Action: U


Sub State: (0, 0)
Sub Action: D


Sub State: (1,

In [None]:
Qtable_sarsa
# S . . .
# . H . H
# . . . H 
# H . . E


Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,-0.095951,-0.977482,0.475014,-0.865782,0.371002,0,0.700146,0,-0.981986,0.790057,0.878503,0,0,0.0,0.0,0
U,0.0,0.0,0.0,0.0,0.03653,0,0.290897,0,0.063989,-0.832228,0.372968,0,0,0.506166,0.690464,0
L,0.0,0.145009,0.20116,0.251266,-0.001166,0,-0.790285,0,0.0,0.125355,0.51704,0,0,-0.964816,0.765187,0
R,0.19707,0.392755,-0.025129,0.0,-0.95602,0,-0.945024,0,0.57202,0.522343,-0.994097,0,0,0.851855,1.0,0


_______________________________________________________________________________________________________________________________

# Qlearning with an ϵ-greedy behavior policy

In [None]:
Qtable_qlearning = create_qtable(env.rows, env.cols, action_space)
Qtable_qlearning

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
U,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
L,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Defining parameters**

In [None]:
epsilon_qlearning = 0.2 # soft policy epsilon probability
gamma_qlearning = 0.9 # discount rate
alpha_qlearning = 0.2
no_of_episodes_qlearning = 2000# total number of episodes

In [None]:
# Qlearning Algorithm

# Create new Qtable for SARSA
Qtable_qlearning = Qtable_qlearning = create_qtable(env.rows, env.cols, action_space)

for i in range(no_of_episodes_qlearning):    
    # reset environment to start state
    env.reset()  


    # Choose action at state
        # Look forward one step and update Q table
        # Choose action based on new Qtable
        # Look forward one step based on new state
        # Choose new action based on new state
        # Update Q table
        ## End when hit terminal state
    
    # Rechoose action based on new Q table
        # Repeat

    # Initiialise main step state
    main_step_state = env.current_state()

    while env.is_terminal(env.current_state()) == False:
        main_step_state = env.current_state()


        while env.is_terminal(env.current_state()) == False:
            # Choose action
            sub_step_state = env.current_state()
            state_formatted = str(sub_step_state[0]) + str(sub_step_state[1])
            sub_step_action = epsilon_soft(Qtable_qlearning, env, action_space, epsilon_qlearning, sub_step_state)
            
            # Move to check next state choices
            env.move(sub_step_action)

            # Retrieve reward for taking specific action
            reward = env.get_rewards()
            # Retrieve new state and next action choice
            new_sub_step_state = env.current_state()
            new_state_formatted = str(new_sub_step_state[0]) + str(new_sub_step_state[1])

            print("Sub State: {}\nSub Action: {}\n\n".format(sub_step_state, sub_step_action))
            if new_sub_step_state in env.actions:
                new_sub_step_action = epsilon_soft(Qtable_qlearning, env, action_space, epsilon_qlearning, new_sub_step_state)
                
                # Update Q table with percentage of rewards for taking specific action at specific state to arrive at new state
                    # q_value = Qtable_sarsa.at[sub_step_action, state_formatted]
                    # q_prime_value = Qtable_sarsa.at[new_sub_step_action, new_state_formatted] 
                    # q_value += (alpha_sarsa * (reward + (gamma_sarsa * q_prime_value) - q_value))

                Qtable_qlearning.at[sub_step_action, state_formatted] += (alpha_qlearning * (reward + (gamma_qlearning * Qtable_qlearning[new_state_formatted].max()) - Qtable_qlearning.at[sub_step_action, state_formatted]))

            else: 
                Qtable_qlearning.at[sub_step_action, state_formatted] += (alpha_qlearning * (reward + (gamma_qlearning * Qtable_qlearning[new_state_formatted].max()) - Qtable_qlearning.at[sub_step_action, state_formatted]))

                continue

        env.set_state(main_step_state)
        main_step_action = epsilon_soft(Qtable_qlearning, env, action_space, epsilon_qlearning, main_step_state)
        env.move(main_step_action)    
        print("Main state: {}\nMain Action: {}".format(main_step_state, main_step_action))   

        

Sub State: (0, 0)
Sub Action: R


Sub State: (0, 1)
Sub Action: L


Sub State: (0, 0)
Sub Action: D


Sub State: (1, 0)
Sub Action: D


Sub State: (2, 0)
Sub Action: R


Sub State: (2, 1)
Sub Action: D


Sub State: (3, 1)
Sub Action: R


Sub State: (3, 2)
Sub Action: R


Main state: (0, 0)
Main Action: D
Sub State: (1, 0)
Sub Action: U


Sub State: (0, 0)
Sub Action: D


Sub State: (1, 0)
Sub Action: U


Sub State: (0, 0)
Sub Action: D


Sub State: (1, 0)
Sub Action: R


Main state: (1, 0)
Main Action: D
Sub State: (2, 0)
Sub Action: D


Main state: (2, 0)
Main Action: R
Sub State: (2, 1)
Sub Action: D


Sub State: (3, 1)
Sub Action: L


Main state: (2, 1)
Main Action: U
Sub State: (0, 0)
Sub Action: R


Sub State: (0, 1)
Sub Action: L


Sub State: (0, 0)
Sub Action: R


Sub State: (0, 1)
Sub Action: R


Sub State: (0, 2)
Sub Action: D


Sub State: (1, 2)
Sub Action: L


Main state: (0, 0)
Main Action: D
Sub State: (1, 0)
Sub Action: D


Sub State: (2, 0)
Sub Action: R


Sub State: (2,

In [None]:
Qtable_qlearning
# S . . .
# . H . H
# . . . H 
# H . . E


Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,0.59049,-0.999993,0.729,-0.790285,0.6561,0,0.81,0,-1.0,0.81,0.9,0,0,0.0,0.0,0
U,0.0,0.0,0.0,0.0,0.531441,0,0.656083,0,0.59049,-1.0,0.729,0,0,0.729,0.81,0
L,0.0,0.531441,0.590387,0.655684,0.0,0,-0.999982,0,0.0,0.6561,0.729,0,0,-1.0,0.81,0
R,0.59049,0.6561,0.584059,0.0,-1.0,0,-0.999991,0,0.729,0.81,-1.0,0,0,0.9,1.0,0


_______________________________________________________________________________________________________________________________

---- Testings ----

In [None]:
a = [((0,0), 'D', 0), 
     ((1,0), 'D', 0), 
      ((2,0), 'D', 0), 
      ((3,0), 'D', -1),
      ((3,0), 'D', -1)
    ]

b = {(0,0): {'U': [], 'D': []},
     (1,0): {'U': [], 'D': []},
     (3,3): {'U': [], 'D': []},
     (2,0): {'U': [], 'D': []},
     (3,0): {'U': [], 'D': []}
    }
     
a_reversed = a[::-1]
temp_lst = [item[0] for item in a_reversed]
G = 10
for i in range(len(a_reversed)):
    if a_reversed[i][0] not in temp_lst[i+1:]:
        temp_dict = b[a_reversed[i][0]]
        temp_dict[a_reversed[i][1]].append(G)
        print("appended")
    
    else:
        print("not appended")
        
a_reversed

not appended
appended
appended
appended
appended


[((3, 0), 'D', -1),
 ((3, 0), 'D', -1),
 ((2, 0), 'D', 0),
 ((1, 0), 'D', 0),
 ((0, 0), 'D', 0)]

In [None]:
a = {('00'): {'U': 0, 'L': 3, 'R': 0, 'D': 0},
     ('13'): {'U': 15, 'L': 80, 'R': 9, 'D': 10},
     ('24'): {'U': 0, 'L': 8, 'R': 50, 'D': 10},
     ('35'): {'U': 0, 'L': 8, 'R': 9, 'D': 10},
    }

a_df = pd.DataFrame(data = a)

x = a_df['00'].max()
x

3

In [None]:
# for i in range(Returns.shape[0]):
#     for j in range(Returns.shape[1]):
#         Qtable.at[i, j] = st.mean(Returns.at[i,j])
        

# for states in Qtable.columns.values:
#     print(states)

# for actions in Qtable.index.values:
#     print(actions)



In [None]:
Qtable

Unnamed: 0,00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
D,-0.147342,-1.0,0.655181,-1.0,0.182275,0,0.8019,0,-1.0,0.577833,0.892875,0,0,0.0,0.0,0
U,0.0,0.0,0.0,0.0,-0.269972,0,-0.430467,0,0.323376,-1.0,-0.9,0,0,0.24026,0.3825,0
L,0.0,-0.146363,-0.10317,-0.139279,0.0,0,-1.0,0,-0.81,-0.478884,-0.6561,0,0,-1.0,0.3825,0
R,-0.194269,0.252372,-0.512676,0.0,-1.0,0,-1.0,0,0.346408,0.34317,-1.0,0,0,0.713455,1.0,0


In [None]:
print(a[::-1])
print(a)

[((3, 0), 'D', -1), ((3, 0), 'D', -1), ((2, 0), 'D', 0), ((1, 0), 'D', 0), ((0, 0), 'D', 0)]
[((0, 0), 'D', 0), ((1, 0), 'D', 0), ((2, 0), 'D', 0), ((3, 0), 'D', -1), ((3, 0), 'D', -1)]


In [None]:
rewards = {(1, 1): -1, # hole
           (1, 3): -1, # hole
           (2, 3): -1, # hole
           (3, 0): -1, # hole
           (3, 3): 1} # frisbee

rewards.get(1,1)

1

In [None]:
for i in range(number_of_episodes):
    episode = []
    r_list = []
    G = 0

    state = env.reset()[0]
    
    while True:
        action = epsilon_soft(Q, state, env, eps)
        new_state, reward, terminal, truncated, info = env.step(action)
        episode.append((state, action, reward))
        # r_list.append(reward)
        state = new_state
        if terminal:
            break #Break from episodic loop
        elif truncated:
            print('Timed out')
            break

    for idx, step in enumerate(episode[::-1]):
        G = gamma*G + step[2]
        
        # Episode list in reverse, all rows and first column, next state onwards
        if step[0] not in np.array(episode[::-1])[:, 0][idx+1:]:
            for act_taken in range(env.action_space.n):
                if step[1] == act_taken:
                    returns[str(step[0])][str(step[1])].append(G)
                    Q[step[0],step[1]] = np.mean(returns[str(step[0])][str(step[1])]) 
    
    
    

    print(episode)
    print(returns)


NameError: name 'number_of_episodes' is not defined

In [None]:
random.random()
random.randint(0,3)
a = {(0, 0): {'U': 0, 'L': 0, 'R': 0, 'D': 0},
     (1, 3): {'U': 15, 'L': 80, 'R': 9, 'D': 10},
     (2, 4): {'U': 0, 'L': 8, 'R': 50, 'D': 10},
     (3, 5): {'U': 0, 'L': 8, 'R': 9, 'D': 10},
    }

inner_dic = a[(0,0)]
print(max(inner_dic, key = inner_dic.get))
print(max(a[(0,0)], key = a[(0,0)].get))

U
U


In [None]:
dic = {'a': 1, 'b': 500000, 'c': 10000, 'e': 5, 'f': 10}
max(dic, key=dic.get)

'b'

In [None]:
def generate_episode(env):
    episode = []
    state = env.reset()
    while True:
        action = epsilon_soft(Q, )
        observation, reward, terminal, truncated, info = env.step(action)
        episode.append((observation, action, reward))
        state = observation
        if terminal:
            break #break is just to break that specific loop it is in
        return episode

#### Test Sample


In [None]:
a = [(1,2),(2,4),(3,5)]
a_e = enumerate(a)

b = np.zeros((4,4))
a_np = np.array(a[::-1])

for i in range(4):
    b[i] = [i, i, i, i]

b[:,0],b
a_np

array([[3, 5],
       [2, 4],
       [1, 2]])

In [None]:
returns = {}
for i in range(env.observation_space.n):
    returns[str(i)] = None

actions = {}
for j in range(env.action_space.n):
    actions[str(j)] = []

for k, v in returns.items():
    returns[k] = actions

returns


{'0': {'0': [], '1': [], '2': [], '3': []},
 '1': {'0': [], '1': [], '2': [], '3': []},
 '2': {'0': [], '1': [], '2': [], '3': []},
 '3': {'0': [], '1': [], '2': [], '3': []},
 '4': {'0': [], '1': [], '2': [], '3': []},
 '5': {'0': [], '1': [], '2': [], '3': []},
 '6': {'0': [], '1': [], '2': [], '3': []},
 '7': {'0': [], '1': [], '2': [], '3': []},
 '8': {'0': [], '1': [], '2': [], '3': []},
 '9': {'0': [], '1': [], '2': [], '3': []},
 '10': {'0': [], '1': [], '2': [], '3': []},
 '11': {'0': [], '1': [], '2': [], '3': []},
 '12': {'0': [], '1': [], '2': [], '3': []},
 '13': {'0': [], '1': [], '2': [], '3': []},
 '14': {'0': [], '1': [], '2': [], '3': []},
 '15': {'0': [], '1': [], '2': [], '3': []}}

In [None]:
dic = {'a': 0, 'b': 1, 'c': 2}
dic[0]

KeyError: 0

In [None]:
total_episodes = 50
#learning_rate = 0.8 #Leanring rate - do we need this?
#max_steps = 20
gamma = 0.5

#exploration parameters
epsilon = 0.2
# max_epsilon = 1 # Max exploration prob
# min_epsilon = 0.01 # Min exploration prob
# decay_rate = 0.01 # Decay rate for exploration prob

#List of rewards
rewards = []
    

2
