In [None]:
import numpy as np

# Problem Setup

In [None]:
# Define the Cliff Walking environment parameters.
# Every state is a (row, col) tuple on a GRID_HEIGHT x GRID_WIDTH grid.
GRID_HEIGHT = 4
GRID_WIDTH = 4
START_STATE = (0, 0)  # episodes start here; stepping onto a cliff cell also returns the agent here
GOAL_STATE = (3, 3)  # an episode ends once the agent is on this cell
CLIFF_ONE = (1, 1)  # first cliff cell
CLIFF_TWO = (2, 2)  # second cliff cell

In [None]:
# Actions by index: 0: Up, 1: Down, 2: Left, 3: Right, 4: Do nothing.
# Each entry is an offset that is added to the current (row, col).
ACTIONS = [( -1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # delta_row, delta_col
NUM_ACTIONS = len(ACTIONS)  # counts the trailing 'do nothing' action as well

In [None]:
# Rewards
REWARD_NORMAL = -1  # cost charged on every step
# NOTE(review): the step function charges REWARD_NORMAL (not REWARD_GOAL) when the
# goal is reached - confirm that this is intended.
REWARD_GOAL = 10
REWARD_CLIFF = -5  # added on top of the step cost when the agent moves onto a cliff cell

In [None]:
def is_valid_state(state):
    """Tell whether ``state`` lies on the grid.

    Args:
        state: A ``(row, col)`` pair.

    Returns:
        True when ``0 <= row < GRID_HEIGHT`` and ``0 <= col < GRID_WIDTH``,
        False otherwise.
    """
    r, c = state
    if not 0 <= r < GRID_HEIGHT:
        return False
    return 0 <= c < GRID_WIDTH

In [None]:
def get_valid_actions(state):
    """List the action indices that may be taken from ``state``.

    On the goal cell and on either cliff cell the only option is the
    'do nothing' action (index 4). On every other cell the result contains
    the movement actions whose target stays on the grid, in ascending index
    order; 'do nothing' is never offered there.

    Args:
        state: A ``(row, col)`` pair.

    Returns:
        A list of action indices.
    """
    if state in (GOAL_STATE, CLIFF_ONE, CLIFF_TWO):
        return [4]

    # Only the movement actions are candidates; the last entry of ACTIONS
    # ('do nothing') is skipped.
    return [
        idx
        for idx, (d_row, d_col) in enumerate(ACTIONS[:NUM_ACTIONS - 1])
        if is_valid_state((state[0] + d_row, state[1] + d_col))
    ]

In [None]:
def get_next_state_and_reward(state, action_idx):
    """Apply one action to ``state`` and report the outcome.

    Args:
        state: Current ``(row, col)`` position.
        action_idx: Index into ``ACTIONS``.

    Returns:
        A ``(next_state, reward)`` pair:
          * a move off the grid leaves the agent where it is and costs
            ``REWARD_NORMAL``;
          * a move onto either cliff cell sends the agent back to
            ``START_STATE`` and costs ``REWARD_CLIFF + REWARD_NORMAL``;
          * a move onto the goal lands on ``GOAL_STATE`` and costs
            ``REWARD_NORMAL`` (``REWARD_GOAL`` is not applied here);
          * any other move lands on the target cell and costs
            ``REWARD_NORMAL``.
    """
    row, col = state
    d_row, d_col = ACTIONS[action_idx]
    target = (row + d_row, col + d_col)

    # Bumping into the border: no movement, ordinary step cost.
    if not is_valid_state(target):
        return state, REWARD_NORMAL

    # Falling off a cliff: back to the start, step cost plus cliff penalty.
    if target in (CLIFF_ONE, CLIFF_TWO):
        return START_STATE, REWARD_CLIFF + REWARD_NORMAL

    # Arriving at the goal still only costs the ordinary step.
    if target == GOAL_STATE:
        return GOAL_STATE, REWARD_NORMAL

    return target, REWARD_NORMAL

## Visualization

In [None]:
import matplotlib.pyplot as plt

def show_policy(policy, title):
    """Draw a policy as one arrow per grid cell.

    Args:
        policy: Callable taking a ``(row, col)`` state and returning an
            action index in 0-4. It is queried for every cell except
            ``GOAL_STATE``, which is left blank.
        title: Text shown above the figure.
    """
    arrow_by_action = {0: '↑', 1: '↓', 2: '←', 3: '→', 4: ' '}
    arrow_fontsize = 100 // 3

    _, ax = plt.subplots(1, 1, figsize=(8, 4))

    # Uniform, semi-transparent backdrop; only the arrows carry information.
    ax.imshow(np.zeros((GRID_HEIGHT, GRID_WIDTH)),
              cmap='RdYlGn', vmin=-5, vmax=10, alpha=0.3)

    # Cell borders sit on the half-integer coordinates between cell centres.
    for edge in range(GRID_HEIGHT + 1):
        ax.axhline(edge - 0.5, color='black', linewidth=1)
    for edge in range(GRID_WIDTH + 1):
        ax.axvline(edge - 0.5, color='black', linewidth=1)

    # One arrow per cell, visited row by row.
    cells = [(r, c) for r in range(GRID_HEIGHT) for c in range(GRID_WIDTH)]
    for cell in cells:
        if cell == GOAL_STATE:
            continue
        r, c = cell
        arrow = arrow_by_action[policy(cell)]
        # x is the column and y is the row in image coordinates.
        ax.text(c, r, arrow, ha='center', va='center',
                fontsize=arrow_fontsize, weight='bold', color='red')

    ax.set_title(title, fontsize=14, weight='bold')

    # Hide the ticks and the axes frame.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt

def show_value_function(V, title):
    """Draw a state-value table as one number per grid cell.

    Args:
        V: Container indexable by a ``(row, col)`` tuple that yields the
            value of that cell (for example a 2-D NumPy array).
        title: Text shown above the figure.
    """
    _, ax = plt.subplots(1, 1, figsize=(8, 4))

    # Uniform, semi-transparent backdrop; only the numbers carry information.
    ax.imshow(np.zeros((GRID_HEIGHT, GRID_WIDTH)),
              cmap='RdYlGn', vmin=-5, vmax=10, alpha=0.3)

    # Cell borders sit on the half-integer coordinates between cell centres.
    for edge in range(GRID_HEIGHT + 1):
        ax.axhline(edge - 0.5, color='black', linewidth=1)
    for edge in range(GRID_WIDTH + 1):
        ax.axvline(edge - 0.5, color='black', linewidth=1)

    # Print each value with two decimals, visiting the cells row by row.
    cells = [(r, c) for r in range(GRID_HEIGHT) for c in range(GRID_WIDTH)]
    for cell in cells:
        r, c = cell
        value = V[cell]
        # x is the column and y is the row in image coordinates.
        ax.text(c, r, f"{value:.2f}", ha='center', va='center',
                fontsize=12, weight='bold', color='blue')

    ax.set_title(title, fontsize=14, weight='bold')

    # Hide the ticks and the axes frame.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')

    plt.tight_layout()
    plt.show()

# Epsilon-greedy policy


In [None]:
def epsilon_greedy_policy(state, q_values, epsilon):
    """Pick an action for ``state`` with epsilon-greedy exploration.

    Args:
        state: Current ``(row, col)`` position.
        q_values: Q-table; ``q_values[state]`` holds one value per action.
        epsilon: Probability of exploring instead of exploiting.

    Returns:
        With probability ``epsilon``: a uniformly random entry of
        ``get_valid_actions(state)`` (or 4 when that list is empty).
        Otherwise: the index of the largest entry of ``q_values[state]``,
        taken over all actions, not only the valid ones.
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
        # Exploit: greedy with respect to the current estimates.
        return np.argmax(q_values[state])

    # Explore: sample among the actions allowed in this cell.
    candidates = get_valid_actions(state)
    if not candidates:
        return 4
    return np.random.choice(candidates)

## Extract Policy

In [None]:
def extract_policy(q_values):
    """Build a greedy policy function from a Q-table.

    For every ordinary cell the chosen action is the valid movement action
    with the highest Q-value (the 'do nothing' column is ignored); if no
    valid movement action exists the choice falls back to 'do nothing' (4).
    The goal cell and both cliff cells are always mapped to 4.

    Args:
        q_values: Q-table; ``q_values[(row, col)]`` holds one value per
            action, with 'do nothing' last.

    Returns:
        A function ``policy(state)`` returning the stored action for that
        state, or ``None`` for a state that is not on the grid.
    """
    greedy_action = {}
    for row in range(GRID_HEIGHT):
        for col in range(GRID_WIDTH):
            cell = (row, col)
            if cell in (GOAL_STATE, CLIFF_ONE, CLIFF_TWO):
                greedy_action[cell] = 4
                continue

            # Movement actions from best to worst Q-value.
            ranked = np.argsort(q_values[cell][:-1])[::-1]
            allowed = get_valid_actions(cell)
            greedy_action[cell] = next((a for a in ranked if a in allowed), 4)

    def policy(state):
        return greedy_action.get(state, None)

    return policy

## Extract Value

In [None]:
def value_function(q_values):
    """Derive the greedy state-value grid ``V(s) = max_a Q(s, a)`` from a Q-table.

    For every ordinary cell the value is the largest Q-value among the
    valid movement actions of that cell (the trailing 'do nothing' column
    is ignored), which matches the action picked by ``extract_policy``.
    A cell without any valid movement action gets 0.

    The goal cell and the cliff cells are not learned values: they are
    filled with ``REWARD_GOAL`` and ``REWARD_CLIFF`` so that they stand out
    when the grid is displayed.

    Args:
        q_values: Q-table; ``q_values[(row, col)]`` holds one value per
            action, with 'do nothing' last.

    Returns:
        A ``(GRID_HEIGHT, GRID_WIDTH)`` NumPy array of state values.
    """
    V = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    for row in range(GRID_HEIGHT):
        for col in range(GRID_WIDTH):
            state = (row, col)
            if state == GOAL_STATE:
                V[state] = REWARD_GOAL
            elif state in (CLIFF_ONE, CLIFF_TWO):
                V[state] = REWARD_CLIFF
            else:
                # Drop the 'do nothing' column, then keep only the actions
                # that are actually allowed in this cell.
                movement_q = q_values[state][:-1]
                valid = [a for a in get_valid_actions(state) if a < len(movement_q)]
                # Take the BEST valid action. The previous implementation
                # kept scanning after the first match and therefore ended
                # up with the worst valid Q-value.
                V[state] = max(movement_q[a] for a in valid) if valid else 0
    return V

## Q-Learning Implementation

In [None]:
def q_learning(num_episodes, alpha, gamma, epsilon, echo_interval):
    """Train a tabular Q-learning agent on the grid world.

    Every episode starts in ``START_STATE`` and runs until the agent is on
    ``GOAL_STATE``. Actions come from ``epsilon_greedy_policy``; each update
    bootstraps from the largest Q-value of the next state.

    Args:
        num_episodes: Number of episodes to run.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the next state's value.
        epsilon: Exploration probability handed to the behaviour policy.
        echo_interval: Progress is printed and plotted for every episode
            whose index is a multiple of this value (episode 0 included).

    Returns:
        The learned Q-table, shape ``(GRID_HEIGHT, GRID_WIDTH, NUM_ACTIONS)``.
        The final policy and value function are plotted before returning.
    """
    q_values = np.zeros((GRID_HEIGHT, GRID_WIDTH, NUM_ACTIONS))

    for episode in range(num_episodes):
        state = START_STATE

        # At least one step is always taken; the loop ends on reaching the goal.
        while True:
            action = epsilon_greedy_policy(state, q_values, epsilon)
            next_state, reward = get_next_state_and_reward(state, action)

            # Off-policy target: value of the greedy action in next_state.
            greedy_next = np.argmax(q_values[next_state])
            td_target = reward + gamma * q_values[next_state][greedy_next]
            q_values[state][action] += alpha * (td_target - q_values[state][action])

            state = next_state
            if state == GOAL_STATE:
                break

        if episode % echo_interval == 0:
            print(f"Episode {episode}: Q-values updated.")
            # Snapshot of what has been learned so far.
            show_policy(extract_policy(q_values), f"Q-learning Policy at Episode {episode}")
            show_value_function(value_function(q_values), f"Q-learning Value Function at Episode {episode}")

    # Final policy and value function.
    learned_policy = extract_policy(q_values)
    learned_values = value_function(q_values)
    show_policy(learned_policy, "Q-learning Final Policy")
    show_value_function(learned_values, "Q-learning Final Value Function")
    return q_values

## SARSA Implementation

In [None]:
def sarsa(num_episodes, alpha, gamma, epsilon, echo_interval):
    """Train a tabular SARSA agent on the grid world.

    Every episode starts in ``START_STATE`` and runs until the agent is on
    ``GOAL_STATE``. Both the action taken and the action used in the update
    target come from ``epsilon_greedy_policy``.

    Args:
        num_episodes: Number of episodes to run.
        alpha: Learning rate applied to the TD error.
        gamma: Discount factor applied to the next state-action value.
        epsilon: Exploration probability handed to the policy.
        echo_interval: Progress is printed and plotted for every episode
            whose index is a multiple of this value (episode 0 included).

    Returns:
        The learned Q-table, shape ``(GRID_HEIGHT, GRID_WIDTH, NUM_ACTIONS)``.
        The final policy and value function are plotted before returning.
    """
    q_values = np.zeros((GRID_HEIGHT, GRID_WIDTH, NUM_ACTIONS))

    for episode in range(num_episodes):
        state = START_STATE
        action = epsilon_greedy_policy(state, q_values, epsilon)

        # At least one step is always taken; the loop ends on reaching the goal.
        while True:
            next_state, reward = get_next_state_and_reward(state, action)
            next_action = epsilon_greedy_policy(next_state, q_values, epsilon)

            # On-policy target: value of the action that will actually be taken next.
            td_target = reward + gamma * q_values[next_state][next_action]
            q_values[state][action] += alpha * (td_target - q_values[state][action])

            state, action = next_state, next_action
            if state == GOAL_STATE:
                break

        if episode % echo_interval == 0:
            print(f"Episode {episode}/{num_episodes} completed.")
            show_policy(extract_policy(q_values), f"SARSA Policy at Episode {episode}")
            show_value_function(value_function(q_values), f"SARSA Value Function at Episode {episode}")

    # Final policy and value function.
    learned_policy = extract_policy(q_values)
    learned_values = value_function(q_values)
    show_policy(learned_policy, "SARSA Final Policy")
    show_value_function(learned_values, "SARSA Final Value Function")
    return q_values

## Launching Q-learning training

In [None]:
# Seed NumPy's global RNG (the one epsilon_greedy_policy draws from) so that
# Restart & Run All reproduces the same training run and the same figures.
np.random.seed(0)
# Keep the learned Q-table in a variable instead of echoing the whole array
# as cell output; the training function already plots the policy and values.
q_learning_q_values = q_learning(num_episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1, echo_interval=100)

## SARSA training

In [None]:
# Seed NumPy's global RNG (the one epsilon_greedy_policy draws from) so that
# this cell gives the same result no matter which cells ran before it.
np.random.seed(0)
# Keep the learned Q-table in a variable instead of echoing the whole array
# as cell output; the training function already plots the policy and values.
sarsa_q_values = sarsa(num_episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, echo_interval=200)