### Run Q-learning for 200 steps with a deterministic greedy behavior policy: at each state $s_t$, take the best action $a_t \in \arg\max_a Q(s_t,a)$ according to the current action-value table. If there is a tie, prefer `move`. Show the action-value table at the end.

In [1]:
import numpy as np

# Problem setup: two states, two actions, and the learning hyper-parameters
states = ['A', 'B']
actions = ['stay', 'move']
gamma = 0.8      # discount factor
alpha = 0.5      # learning rate
num_steps = 200  # number of Q-learning updates to perform

# Q-table: one independent {action: value} row per state, every entry starting at zero
Q = {state: dict.fromkeys(actions, 0) for state in states}

# Deterministic greedy policy: pick the highest-valued action, with ties going to 'move'
def choose_action(state, Q):
    """Return the greedy action for ``state`` under the table ``Q``.

    Args:
        state: Key of the current state in ``Q``.
        Q: Mapping ``state -> {action: value}``; each row must contain the
            'stay' and 'move' entries.

    Returns:
        'move' when the 'stay' and 'move' values are equal, otherwise the
        action whose value in ``Q[state]`` is largest.
    """
    action_values = Q[state]
    if action_values['stay'] != action_values['move']:
        # No tie between the two actions: plain argmax over the row.
        return max(action_values, key=lambda name: action_values.get(name))
    # Tie: the exercise asks us to prefer moving.
    return 'move'

# Environment transition: 'move' switches to the other state, anything else stays put
def get_next_state(state, action):
    """Return the state reached by taking ``action`` in ``state``.

    Args:
        state: Current state label.
        action: Action label; only 'move' changes the state.

    Returns:
        ``state`` itself unless ``action`` is 'move'; for 'move', 'B' when
        ``state`` is 'A' and 'A' for any other state.
    """
    if action != 'move':
        return state
    if state == 'A':
        return 'B'
    return 'A'

# Q-learning driven by the deterministic greedy behavior policy, starting in state 'A'
current_state = 'A'
for _ in range(num_steps):
    action = choose_action(current_state, Q)
    next_state = get_next_state(current_state, action)

    # Staying yields a reward of 1, any other action yields 0.
    reward = int(action == 'stay')

    # Bootstrap from the largest action value available in the successor state.
    max_next_q = max(Q[next_state].values())
    old_q = Q[current_state][action]

    # Blend the old estimate with the one-step target using learning rate alpha.
    Q[current_state][action] = (1 - alpha) * old_q + alpha * (reward + gamma * max_next_q)

    current_state = next_state

Q  # Display the final Q-table after 200 steps


{'A': {'stay': 0, 'move': 0.0}, 'B': {'stay': 0, 'move': 0.0}}

### Run Q-learning with an $\epsilon$-greedy behavior policy with $\epsilon=0.5$

In [7]:
import random

# Exploration setting for the epsilon-greedy run
epsilon = 0.5  # probability of taking a random (exploratory) action

# Start again from an all-zero Q-table so this run does not reuse earlier estimates
Q = {state: dict.fromkeys(actions, 0) for state in states}

# Function to choose an action according to the epsilon-greedy policy
def epsilon_greedy_action(state, Q, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a random action is returned (explore).
    Otherwise one of the actions with the highest value in ``Q[state]`` is
    returned, with ties broken at random (exploit).

    Args:
        state: Key of the current state in ``Q``.
        Q: Mapping ``state -> {action: value}``.
        epsilon: Exploration probability; 0 never explores, 1 always does.

    Returns:
        The chosen action, always one of the keys of ``Q[state]``.
    """
    action_values = Q[state]
    # Take the candidate actions from the table row itself so the function
    # does not rely on a module-level action list defined elsewhere.
    candidates = list(action_values)

    if random.random() < epsilon:
        # Explore: choose randomly among all available actions
        return random.choice(candidates)

    # Exploit: compute the best value once, then break ties arbitrarily
    best_value = max(action_values.values())
    best_actions = [action for action in candidates if action_values[action] == best_value]
    return random.choice(best_actions)

# Q-learning driven by the epsilon-greedy behavior policy, starting in state 'A'
current_state = 'A'
for _ in range(num_steps):
    action = epsilon_greedy_action(current_state, Q, epsilon)
    next_state = get_next_state(current_state, action)

    # Staying yields a reward of 1, any other action yields 0.
    reward = 0 if action != 'stay' else 1

    # The update target always uses the best successor value, regardless of
    # which action the behavior policy will actually take next.
    max_next_q = max(Q[next_state].values())
    previous_estimate = Q[current_state][action]
    Q[current_state][action] = (
        (1 - alpha) * previous_estimate + alpha * (reward + gamma * max_next_q)
    )

    current_state = next_state

Q  # Display the final Q-table after 200 steps


{'A': {'stay': 4.998335051817343, 'move': 3.9951919513688017},
 'B': {'stay': 4.995702477721416, 'move': 3.994470571825331}}