# Q-Learning Algorithm

### Importing libraries

In [2]:
pip install numpy

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import random

### Define the MDP components

In [4]:
# Define the MDP components
states = ["s0", "s1", "s2"]
actions = ["a0", "a1", "a2"]  # Global action set; transition_probs lists which are defined per state

# Q-Learning hyperparameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration probability (for ε-greedy)
episodes = 10000  # Number of episodes
max_steps = 100  # Max steps per episode

# Settings for the convergence experiment
threshold = 1e-6  # Convergence threshold
max_episodes = 100000  # Maximum number of episodes to avoid infinite loops

# Every later cell draws from `random`; seeding here makes a fresh
# "Restart Kernel -> Run All" reproduce the same Q-values and episode counts.
SEED = 42
random.seed(SEED)

Transition probabilities and rewards.
Format: `transition_probs[state][action] = [(next_state, probability, reward), ...]`

In [5]:
# Environment model.
# Layout: transition_probs[state][action] -> list of possible outcomes, each an
# (next_state, probability, reward) tuple. An action with no entry under a
# state is not defined for that state.
transition_probs = {
    "s0": {
        "a0": [
            ("s0", 0.7, 10),
            ("s1", 0.3, 0),
        ],
        "a1": [
            ("s0", 1.0, 0),
        ],
        "a2": [
            ("s1", 0.8, 0),
            ("s2", 0.2, 0),
        ],
    },
    "s1": {
        "a0": [
            ("s1", 1.0, 0),
        ],
        "a2": [
            ("s2", 1.0, -50),
        ],
    },
    "s2": {
        "a1": [
            ("s0", 0.8, 40),
            ("s2", 0.1, 0),
            ("s1", 0.1, 0),
        ],
    },
}

### Initialize Q-values

In [6]:
# Q-table: one row per state, holding a zero-initialised entry for every action
Q = {s: dict.fromkeys(actions, 0) for s in states}

# ε-greedy action selection
def choose_action(state):
    """Pick an action for `state` with an ε-greedy rule.

    With probability `epsilon` a uniformly random member of `actions` is
    returned; otherwise the action with the largest value in `Q[state]` is
    returned (ties go to the first such key in the row's iteration order).
    """
    exploring = random.uniform(0, 1) < epsilon
    if exploring:
        return random.choice(actions)

    q_row = Q[state]
    return max(q_row, key=lambda candidate: q_row[candidate])

# Helper function to simulate the environment
def step(state, action, transitions_table=None):
    """Sample one environment transition.

    Args:
        state: Current state; must be a key of the transition table.
        action: Action to apply in `state`.
        transitions_table: Optional table laid out like `transition_probs`
            (state -> action -> [(next_state, probability, reward), ...]).
            When omitted, the module-level `transition_probs` is used.

    Returns:
        A `(next_state, reward)` tuple. If `action` has no entry (or an empty
        outcome list) for `state`, the agent stays in `state` with reward 0.

    Raises:
        KeyError: if `state` is not in the transition table.
        ValueError: if the outcome probabilities do not have a positive total
            (raised by `random.choices`).
    """
    table = transition_probs if transitions_table is None else transitions_table

    outcomes = table[state].get(action)
    if not outcomes:
        return state, 0  # Invalid action, no reward

    # random.choices always selects one of the listed outcomes (weights are
    # treated as relative). A manual cumulative-sum scan can fall through and
    # yield (None, None) when the probabilities add up to slightly less than
    # the drawn number, e.g. because of floating-point rounding.
    weights = [probability for _, probability, _ in outcomes]
    next_state, _, reward = random.choices(outcomes, weights=weights, k=1)[0]
    return next_state, reward

Main Q-Learning algorithm

In [7]:
# Train the Q-table: `episodes` episodes of at most `max_steps` steps each
for episode in range(episodes):
    # Each episode starts in a uniformly sampled state
    state = random.choice(states)

    for _ in range(max_steps):
        # Act ε-greedily, then observe the sampled outcome
        action = choose_action(state)
        next_state, reward = step(state, action)

        # Q-Learning update: move Q(s, a) a fraction `alpha` of the way
        # towards the bootstrapped target r + gamma * max_a' Q(s', a')
        td_target = reward + gamma * max(Q[next_state].values())
        Q[state][action] += alpha * (td_target - Q[state][action])

        # Continue the episode from the state we landed in
        state = next_state

# Greedy policy: for every state, the action with the highest learned Q-value
policy = {s: max(Q[s], key=Q[s].get) for s in states}

Display results

In [8]:
# Report the learned Q-table, one state per line
print("Optimal Q-Values:")
for state, q_row in Q.items():
    print(f"  {state}: {q_row}")

# Report the greedy action chosen for each state
print("\nOptimal Policy:")
for state in policy:
    action = policy[state]
    print(f"  {state}: {action}")

Optimal Q-Values:
  s0: {'a0': 12.565564130069587, 'a1': 11.158058329381172, 'a2': 9.143247144497526}
  s1: {'a0': 7.349250948221477e-15, 'a1': 7.296850220100894e-15, 'a2': -7.241827715716406}
  s2: {'a0': 39.95592255673869, 'a1': 43.79011198574897, 'a2': 39.99955557944611}

Optimal Policy:
  s0: a0
  s1: a0
  s2: a1


Number of Episodes Until Convergence

In [9]:
# Q-Learning Algorithm
def q_learning_convergence(patience=1):
    """Run Q-Learning episodes until the Q-values stop changing.

    An episode counts as "quiet" when the largest absolute change applied to
    any Q-value during that episode is below `threshold`. The run stops once
    `patience` quiet episodes have occurred back to back, or after
    `max_episodes` episodes, whichever comes first.

    Notes:
        * The function updates the module-level `Q` table in place and starts
          from whatever values it currently holds, so the returned count is
          relative to that starting point.
        * Only the state-action pairs visited in an episode contribute to its
          maximum change. With `patience=1` a single episode that stays in one
          state is enough to stop; pass a larger `patience` for a stricter
          test.

    Args:
        patience: Number of consecutive quiet episodes required before
            stopping. The default of 1 stops at the first quiet episode.

    Returns:
        The number of episodes that were run.

    Raises:
        ValueError: if `patience` is smaller than 1.
    """
    if patience < 1:
        raise ValueError("patience must be at least 1")

    episode_count = 0
    quiet_streak = 0  # Consecutive episodes whose max change was below threshold

    while episode_count < max_episodes:
        episode_count += 1
        state = random.choice(states)  # Start from a random state
        max_q_change = 0  # Track maximum Q-value change in this episode

        for _ in range(max_steps):
            action = choose_action(state)  # Choose an action
            next_state, reward = step(state, action)  # Take the action and observe the outcome

            # Update the Q-value using the Q-Learning formula
            old_q_value = Q[state][action]
            best_next_value = max(Q[next_state].values())
            Q[state][action] = old_q_value + alpha * (reward + gamma * best_next_value - old_q_value)
            max_q_change = max(max_q_change, abs(Q[state][action] - old_q_value))

            state = next_state  # Move to the next state

        # Check for convergence
        if max_q_change < threshold:
            quiet_streak += 1
            if quiet_streak >= patience:
                break
        else:
            quiet_streak = 0  # A noisy episode resets the streak

    return episode_count

# Run Q-Learning and calculate convergence
q_learning_episodes = q_learning_convergence()

# q_learning_convergence() also returns when it reaches max_episodes, so only
# claim convergence when it stopped before that cap.
if q_learning_episodes < max_episodes:
    print(f"Q-Learning Converged in {q_learning_episodes} episodes.")
else:
    print(f"Q-Learning stopped at the cap of {max_episodes} episodes; convergence was not confirmed.")

Q-Learning Converged in 21 episodes.
