<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/lab_01_rml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np

# Example MDP: a small gridworld with 4 states and 2 actions (0 = left, 1 = right).
num_states = 4
num_actions = 2
P = np.zeros((num_states, num_actions, num_states))  # P[s, a, s']: transition probabilities
R = np.zeros((num_states, num_actions, num_states))  # R[s, a, s']: rewards

# One row per possible transition:
#   (state, action, next_state, probability, reward)
# Any (state, action, next_state) triple not listed keeps probability 0 and reward 0.
transitions = [
    # State 0
    (0, 0, 0, 1.0, 0.0),  # left: stays at state 0, no reward
    (0, 1, 1, 1.0, 1.0),  # right: moves to state 1
    # State 1
    (1, 0, 0, 0.8, 0.8),  # left: reaches state 0 with probability 0.8 ...
    (1, 0, 1, 0.2, 0.2),  # ... otherwise stays at state 1
    (1, 1, 2, 1.0, 1.0),  # right: moves to state 2
    # State 2
    (2, 0, 1, 1.0, 1.0),  # left: moves to state 1
    (2, 1, 3, 1.0, 1.0),  # right: moves to state 3
    # State 3
    (3, 0, 2, 1.0, 1.0),  # left: moves to state 2
    (3, 1, 3, 1.0, 1.0),  # right: stays at state 3
]
for state, action, next_state, probability, reward in transitions:
    P[state, action, next_state] = probability
    R[state, action, next_state] = reward


# Value iteration function
def value_iteration(P, R, gamma=0.9, epsilon=1e-6, max_iterations=1_000_000):
    """Solve a tabular MDP by synchronous value iteration.

    Every sweep applies the Bellman optimality backup to all states at once,
    using the values of the previous sweep:

        V[s] <- max_a  sum_s' P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])

    Sweeps stop as soon as the largest per-state change is below ``epsilon``.

    Parameters
    ----------
    P : array_like, shape (num_states, num_actions, num_states)
        ``P[s, a, s']`` is the probability of reaching ``s'`` when taking
        action ``a`` in state ``s``.
    R : array_like, same shape as ``P``
        ``R[s, a, s']`` is the reward attached to that transition.
    gamma : float, optional
        Discount factor applied to next-state values.
    epsilon : float, optional
        Convergence threshold on the max-norm change between two sweeps.
    max_iterations : int, optional
        Upper bound on the number of sweeps. Without it the loop never ends
        when the backup is not a contraction (for example ``gamma >= 1`` on
        an MDP with a rewarding cycle).

    Returns
    -------
    V : numpy.ndarray, shape (num_states,)
        Value estimate from the final sweep.
    policy : numpy.ndarray of int, shape (num_states,)
        Greedy action per state with respect to ``V``; ties go to the lowest
        action index.

    Raises
    ------
    RuntimeError
        If the values have not converged after ``max_iterations`` sweeps.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    num_states, _, _ = P.shape  # also rejects inputs that are not 3-D
    V = np.zeros(num_states)

    for _ in range(max_iterations):
        V_prev = V
        # Q[s, a]; V_prev broadcasts along the last (next-state) axis.
        Q = np.sum(P * (R + gamma * V_prev), axis=2)
        V = Q.max(axis=1)
        # A NaN delta (diverged values) compares False and so keeps sweeping
        # until the guard below fires.
        if np.max(np.abs(V - V_prev)) < epsilon:
            break
    else:
        raise RuntimeError(
            f"value_iteration did not converge within {max_iterations} sweeps "
            f"(gamma={gamma}, epsilon={epsilon})"
        )

    # Extract the greedy policy from one more backup of the final values.
    Q = np.sum(P * (R + gamma * V), axis=2)
    policy = np.argmax(Q, axis=1).astype(int, copy=False)
    return V, policy

# Solve the MDP defined above with value iteration.
optimal_V, optimal_policy = value_iteration(P, R)

# Report each result as a label line followed by the array on the next line.
print("Optimal Value Function:", optimal_V, sep="\n")
print("Optimal Policy (0: Left, 1: Right):", optimal_policy, sep="\n")


Optimal Value Function:
[9.99999179 9.99999179 9.99999179 9.99999179]
Optimal Policy (0: Left, 1: Right):
[1 1 0 0]


In [5]:
def policy_iteration(P, R, gamma=0.9, epsilon=1e-6, max_iterations=1_000_000, seed=None):
    """Solve a tabular MDP by policy iteration with iterative policy evaluation.

    Starting from a random deterministic policy, the function alternates

    1. policy evaluation: repeated synchronous backups
       ``V[s] <- sum_s' P[s, pi(s), s'] * (R[s, pi(s), s'] + gamma * V[s'])``
       from ``V = 0`` until the largest change is below ``epsilon``;
    2. policy improvement: the policy becomes greedy with respect to ``V``.

    The loop ends when switching to the greedy action improves no state's
    Q-value by more than ``epsilon``. Comparing values rather than action
    indices is deliberate: evaluation is only approximate, so two equally
    good actions can swap places through floating-point noise, and a test on
    "did any action change" may then never succeed.

    Parameters
    ----------
    P : array_like, shape (num_states, num_actions, num_states)
        ``P[s, a, s']`` is the probability of reaching ``s'`` when taking
        action ``a`` in state ``s``.
    R : array_like, same shape as ``P``
        ``R[s, a, s']`` is the reward attached to that transition.
    gamma : float, optional
        Discount factor applied to next-state values.
    epsilon : float, optional
        Convergence threshold for policy evaluation, also used as the
        minimum Q-value gain that counts as a policy improvement.
    max_iterations : int, optional
        Upper bound on evaluation sweeps per policy and on improvement
        rounds. Prevents an endless loop when evaluation cannot converge
        (for example ``gamma >= 1`` on a rewarding cycle).
    seed : int or None, optional
        Seed for the random initial policy. ``None`` draws from NumPy's
        global random state, exactly as ``np.random.randint`` would.

    Returns
    -------
    V : numpy.ndarray, shape (num_states,)
        Values of the last policy that was evaluated.
    policy : numpy.ndarray of int, shape (num_states,)
        Greedy action per state with respect to ``V`` (ties go to the lowest
        action index). It can differ from the evaluated policy only in
        actions whose Q-values are within ``epsilon`` of each other.

    Raises
    ------
    RuntimeError
        If policy evaluation does not converge, or the policy does not
        stabilise, within ``max_iterations`` iterations.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    num_states, num_actions, _ = P.shape
    states = np.arange(num_states)

    # A dedicated RandomState leaves the global stream untouched when seeded.
    rng = np.random if seed is None else np.random.RandomState(seed)
    policy = rng.randint(num_actions, size=num_states)  # Initialize policy randomly

    for _round in range(max_iterations):
        # Policy Evaluation
        # Rows of P and R selected by the current policy, shape (S, S').
        P_pi = P[states, policy]
        R_pi = R[states, policy]
        V = np.zeros(num_states)
        for _sweep in range(max_iterations):
            V_prev = V
            V = np.sum(P_pi * (R_pi + gamma * V_prev), axis=1)
            if np.max(np.abs(V - V_prev)) < epsilon:
                break
        else:
            raise RuntimeError(
                f"policy evaluation did not converge within {max_iterations} sweeps "
                f"(gamma={gamma}, epsilon={epsilon})"
            )

        # Policy Improvement
        Q = np.sum(P * (R + gamma * V), axis=2)
        greedy = np.argmax(Q, axis=1)
        # Gain of the greedy action over the action the policy currently takes;
        # exactly 0 wherever the policy is already greedy.
        gain = Q[states, greedy] - Q[states, policy]
        policy = greedy
        if np.all(gain <= epsilon):
            break
    else:
        raise RuntimeError(
            f"policy_iteration did not stabilise within {max_iterations} improvement rounds"
        )
    return V, policy.astype(int, copy=False)

# Solve the same MDP with policy iteration.
optimal_V_policy, optimal_policy_policy = policy_iteration(P, R)

# Report each result as a label line followed by the array on the next line.
print("Optimal Value Function (Policy Iteration):", optimal_V_policy, sep="\n")
print(
    "Optimal Policy (Policy Iteration, 0: Left, 1: Right):",
    optimal_policy_policy,
    sep="\n",
)


Optimal Value Function (Policy Iteration):
[9.99999179 9.99999179 9.99999179 9.99999179]
Optimal Policy (Policy Iteration, 0: Left, 1: Right):
[1 1 0 0]


In [6]:
import numpy as np

# Second example MDP: 5 states and 2 actions (0 = left, 1 = right).
num_states = 5
num_actions = 2
P = np.zeros((num_states, num_actions, num_states))  # P[s, a, s']: transition probabilities
R = np.zeros((num_states, num_actions, num_states))  # R[s, a, s']: rewards

# One row per possible transition:
#   (state, action, next_state, probability, reward)
# Any (state, action, next_state) triple not listed keeps probability 0 and reward 0.
transitions = [
    # State 0
    (0, 0, 0, 1.0, 0.0),  # left: stay
    (0, 1, 1, 1.0, 1.0),  # right: to state 1
    # State 1
    (1, 0, 0, 0.5, 0.5),  # left: state 0 or ...
    (1, 0, 1, 0.5, 0.5),  # ... stay, with equal probability
    (1, 1, 2, 1.0, 2.0),  # right: to state 2
    # State 2
    (2, 0, 1, 1.0, 1.0),  # left: to state 1
    (2, 1, 3, 1.0, 2.0),  # right: to state 3
    # State 3
    (3, 0, 2, 0.6, 1.0),  # left: to state 2 with probability 0.6 ...
    (3, 0, 3, 0.4, 0.5),  # ... otherwise stay
    (3, 1, 4, 1.0, 3.0),  # right: to state 4
    # State 4
    (4, 0, 3, 1.0, 1.0),  # left: to state 3
    (4, 1, 4, 1.0, 0.0),  # right: stay, no reward
]
for state, action, next_state, probability, reward in transitions:
    P[state, action, next_state] = probability
    R[state, action, next_state] = reward

# Value iteration function
def value_iteration(P, R, gamma=0.9, epsilon=1e-6, max_iterations=1_000_000):
    """Solve a tabular MDP by synchronous value iteration.

    Every sweep applies the Bellman optimality backup to all states at once,
    using the values of the previous sweep:

        V[s] <- max_a  sum_s' P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])

    Sweeps stop as soon as the largest per-state change is below ``epsilon``.

    NOTE(review): a function named ``value_iteration`` is also defined in an
    earlier cell of this notebook; this later definition shadows it. Consider
    keeping a single definition and calling it for both MDPs.

    Parameters
    ----------
    P : array_like, shape (num_states, num_actions, num_states)
        ``P[s, a, s']`` is the probability of reaching ``s'`` when taking
        action ``a`` in state ``s``.
    R : array_like, same shape as ``P``
        ``R[s, a, s']`` is the reward attached to that transition.
    gamma : float, optional
        Discount factor applied to next-state values.
    epsilon : float, optional
        Convergence threshold on the max-norm change between two sweeps.
    max_iterations : int, optional
        Upper bound on the number of sweeps. Without it the loop never ends
        when the backup is not a contraction (for example ``gamma >= 1`` on
        an MDP with a rewarding cycle).

    Returns
    -------
    V : numpy.ndarray, shape (num_states,)
        Value estimate from the final sweep.
    policy : numpy.ndarray of int, shape (num_states,)
        Greedy action per state with respect to ``V``; ties go to the lowest
        action index.

    Raises
    ------
    RuntimeError
        If the values have not converged after ``max_iterations`` sweeps.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    num_states, _, _ = P.shape  # also rejects inputs that are not 3-D
    V = np.zeros(num_states)

    for _ in range(max_iterations):
        V_prev = V
        # Q[s, a]; V_prev broadcasts along the last (next-state) axis.
        Q = np.sum(P * (R + gamma * V_prev), axis=2)
        V = Q.max(axis=1)
        # A NaN delta (diverged values) compares False and so keeps sweeping
        # until the guard below fires.
        if np.max(np.abs(V - V_prev)) < epsilon:
            break
    else:
        raise RuntimeError(
            f"value_iteration did not converge within {max_iterations} sweeps "
            f"(gamma={gamma}, epsilon={epsilon})"
        )

    # Extract the greedy policy from one more backup of the final values.
    Q = np.sum(P * (R + gamma * V), axis=2)
    policy = np.argmax(Q, axis=1).astype(int, copy=False)
    return V, policy

# Solve the 5-state MDP with value iteration.
optimal_V, optimal_policy = value_iteration(P, R)

# Report each result as a label line followed by the array on the next line.
print("Optimal Value Function:", optimal_V, sep="\n")
print("Optimal Policy (0: Left, 1: Right):", optimal_policy, sep="\n")


Optimal Value Function:
[19.38367834 20.42631022 20.47367834 20.52631022 19.47367834]
Optimal Policy (0: Left, 1: Right):
[1 1 1 1 0]
