Demonstrate a Markov Decision Process (MDP) in Python: a random-policy agent on a 3x3 grid world

In [None]:
import numpy as np

In [None]:
# Define the grid world
grid_size = 3  # side length of the square grid
# The goal is the bottom-right corner; deriving it from grid_size keeps the
# two settings consistent when the grid is resized.
goal_state = (grid_size - 1, grid_size - 1)

In [None]:
# Enumerate every (row, col) cell of the square grid in row-major order.
# divmod(k, grid_size) maps the flat index k to the tuple (k // grid_size, k % grid_size).
states = [divmod(flat_index, grid_size) for flat_index in range(grid_size * grid_size)]

In [None]:
# Moves available to the agent in every cell (see next_state for their effect)
actions = [
    'up',     # row - 1
    'down',   # row + 1
    'left',   # col - 1
    'right',  # col + 1
]

In [None]:
# Define the reward attached to each state (keyed by state, not by
# state-action pair): +10 for the goal cell, -1 for every other cell.
# A comprehension is used so no loop variable leaks into the notebook namespace.
rewards = {
    cell: (10 if cell == goal_state else -1)
    for cell in states
}

In [None]:
# Define the transition dynamics
def next_state(state, action, size=None):
    """Return the cell reached by taking `action` from `state`.

    Moves that would leave the grid are clamped, so the agent stays where it
    is when it walks into a wall.

    Args:
        state: (row, col) tuple of the current cell.
        action: One of 'up', 'down', 'left', 'right'.
        size: Side length of the square grid. When omitted, the module-level
            `grid_size` is used.

    Returns:
        The (row, col) tuple of the resulting cell.

    Raises:
        ValueError: If `action` is not one of the four known moves. Without
            this check the function would fall through and return None.
    """
    if size is None:
        size = grid_size
    i, j = state
    last = size - 1  # largest valid row/column index
    if action == 'up':
        return (max(i - 1, 0), j)
    if action == 'down':
        return (min(i + 1, last), j)
    if action == 'left':
        return (i, max(j - 1, 0))
    if action == 'right':
        return (i, min(j + 1, last))
    raise ValueError(f"Unknown action: {action!r}")

In [None]:
# Define the transition probabilities (deterministic)
def transition_probability(state, action, next_state_):
    """Return P(next_state_ | state, action) for the deterministic grid.

    The result is 1.0 when `next_state(state, action)` equals `next_state_`
    and 0.0 for any other candidate cell.
    """
    reached = next_state(state, action)
    return 1.0 if reached == next_state_ else 0.0

In [None]:
# Define the MDP model
class MDP:
    """Container for the grid-world MDP: states, actions, rewards and goal.

    Attributes:
        states: Collection of states passed in by the caller.
        actions: Collection of actions passed in by the caller.
        rewards: Mapping indexed by state; `step` looks up the reward of the
            state that is entered.
        goal_state: The state stored as the goal.
    """

    def __init__(self, states, actions, rewards, goal_state):
        self.states, self.actions = states, actions
        self.rewards, self.goal_state = rewards, goal_state

    def step(self, state, action):
        """Apply `action` in `state` and return `(next_state, reward)`.

        The successor comes from the module-level `next_state` function; the
        reward is the entry of `self.rewards` for that successor.
        """
        successor = next_state(state, action)
        return successor, self.rewards[successor]

In [None]:
# Build the MDP from the grid-world pieces defined in the cells above
mdp = MDP(
    states=states,
    actions=actions,
    rewards=rewards,
    goal_state=goal_state,
)

In [None]:
# Define a policy (random for simplicity)
def random_policy(state):
    """Pick one entry of the module-level `actions` uniformly at random.

    `state` is accepted so the function matches the policy call signature
    used by the simulator, but it does not influence the choice.
    """
    chosen_action = np.random.choice(actions)
    return chosen_action

In [None]:
# Simulate the agent's journey through the grid
def simulate_mdp(mdp, start_state, policy, max_steps=10):
    """Run one episode and print every transition plus the total reward.

    The episode ends when the agent is in `mdp.goal_state` or after
    `max_steps` transitions, whichever comes first.

    Args:
        mdp: Object exposing `goal_state` and `step(state, action)`, where
            `step` returns a `(next_state, reward)` pair.
        start_state: State the episode starts from.
        policy: Callable mapping a state to an action.
        max_steps: Upper bound on the number of transitions.

    Returns:
        None. All results are written to stdout.
    """
    state = start_state
    total_reward = 0
    steps_taken = 0
    print(f"Starting state: {state}")

    while state != mdp.goal_state and steps_taken < max_steps:
        action = policy(state)
        next_state_, reward = mdp.step(state, action)
        total_reward += reward
        steps_taken += 1

        print(f"Step {steps_taken}: State: {state}, Action: {action}, Next State: {next_state_}, Reward: {reward}")
        state = next_state_

    # Check the goal after the loop so that reaching it on the very last
    # permitted step is still reported.
    if state == mdp.goal_state:
        print(f"Reached goal state {mdp.goal_state} at step {steps_taken}")

    print(f"Total reward: {total_reward}")

In [None]:
# Start the simulation from the top-left corner (0, 0).
# max_steps is left at its default of 10, and the policy is random and
# unseeded, so the printed trajectory differs between executions.
simulate_mdp(mdp, start_state=(0, 0), policy=random_policy)

Starting state: (0, 0)
Step 1: State: (0, 0), Action: right, Next State: (0, 1), Reward: -1
Step 2: State: (0, 1), Action: left, Next State: (0, 0), Reward: -1
Step 3: State: (0, 0), Action: down, Next State: (1, 0), Reward: -1
Step 4: State: (1, 0), Action: right, Next State: (1, 1), Reward: -1
Step 5: State: (1, 1), Action: right, Next State: (1, 2), Reward: -1
Step 6: State: (1, 2), Action: left, Next State: (1, 1), Reward: -1
Step 7: State: (1, 1), Action: down, Next State: (2, 1), Reward: -1
Step 8: State: (2, 1), Action: right, Next State: (2, 2), Reward: 10
Reached goal state (2, 2) at step 8
Total reward: 3


In [None]:
# Run another episode from (0, 0); the random policy is unseeded, so the
# trajectory is not expected to match any other run.
simulate_mdp(mdp, start_state=(0, 0), policy=random_policy)

Starting state: (0, 0)
Step 1: State: (0, 0), Action: down, Next State: (1, 0), Reward: -1
Step 2: State: (1, 0), Action: down, Next State: (2, 0), Reward: -1
Step 3: State: (2, 0), Action: down, Next State: (2, 0), Reward: -1
Step 4: State: (2, 0), Action: down, Next State: (2, 0), Reward: -1
Step 5: State: (2, 0), Action: left, Next State: (2, 0), Reward: -1
Step 6: State: (2, 0), Action: down, Next State: (2, 0), Reward: -1
Step 7: State: (2, 0), Action: down, Next State: (2, 0), Reward: -1
Step 8: State: (2, 0), Action: down, Next State: (2, 0), Reward: -1
Step 9: State: (2, 0), Action: right, Next State: (2, 1), Reward: -1
Step 10: State: (2, 1), Action: up, Next State: (1, 1), Reward: -1
Total reward: -10


In [None]:
# Run one more episode from (0, 0) with the same unseeded random policy;
# the episode stops after the default 10 steps if the goal is not reached.
simulate_mdp(mdp, start_state=(0, 0), policy=random_policy)

Starting state: (0, 0)
Step 1: State: (0, 0), Action: down, Next State: (1, 0), Reward: -1
Step 2: State: (1, 0), Action: up, Next State: (0, 0), Reward: -1
Step 3: State: (0, 0), Action: left, Next State: (0, 0), Reward: -1
Step 4: State: (0, 0), Action: left, Next State: (0, 0), Reward: -1
Step 5: State: (0, 0), Action: left, Next State: (0, 0), Reward: -1
Step 6: State: (0, 0), Action: up, Next State: (0, 0), Reward: -1
Step 7: State: (0, 0), Action: up, Next State: (0, 0), Reward: -1
Step 8: State: (0, 0), Action: down, Next State: (1, 0), Reward: -1
Step 9: State: (1, 0), Action: right, Next State: (1, 1), Reward: -1
Step 10: State: (1, 1), Action: up, Next State: (0, 1), Reward: -1
Total reward: -10
