# Markov Decision Process

In [1]:
#Markov Decision Process

from typing import Tuple

class Environment:
    def __init__(self):
        """
        Constructor of the Environment class.
        """
        self._initial_state = 1
        self._allowed_actions = [0, 1]  # 0: A, 1: B
        self._states = [1, 2, 3]
        self._current_state = self._initial_state

    def step(self, action: int) -> Tuple[int, int]:
        """
        Step function: compute the one-step dynamic from the given action.

        Args:
            action (int): the action taken by the agent.

        Returns:
            Tuple[int, int]: The tuple (current_state, reward).
        """

        # check if the action is allowed
        if action not in self._allowed_actions:
            raise ValueError("Action is not allowed")

        reward = 0
        if action == 0 and self._current_state == 1:
            self._current_state = 2
            reward = 1
        elif action == 1 and self._current_state == 1:
            self._current_state = 3
            reward = 10
        elif action == 0 and self._current_state == 2:
            self._current_state = 1
            reward = 0
        elif action == 1 and self._current_state == 2:
            self._current_state = 3
            reward = 1
        elif action == 0 and self._current_state == 3:
            self._current_state = 2
            reward = 0
        elif action == 1 and self._current_state == 3:
            self._current_state = 3
            reward = 10

        return self._current_state, reward

    def reset(self) -> int:
        """
        Reset the environment starting from the initial state.

        Returns:
            int: The environment state after reset (initial state).
        """
        self._current_state = self._initial_state
        return self._current_state

# Instantiate the environment and run some actions
env = Environment()
state = env.reset()

actions = [0, 0, 1, 1, 0, 1]

print(f"Initial state is {state}")

for action in actions:
    next_state, reward = env.step(action)
    print(f"From state {state} to state {next_state} with action {action}, reward: {reward}")
    state = next_state


Initial state is 1
From state 1 to state 2 with action 0, reward: 1
From state 2 to state 1 with action 0, reward: 0
From state 1 to state 3 with action 1, reward: 10
From state 3 to state 3 with action 1, reward: 10
From state 3 to state 2 with action 0, reward: 0
From state 2 to state 3 with action 1, reward: 1


In [2]:
#Markov Decision Process

import numpy as np

p = np.array([[0.3, 0.7],[0.2, 0.8]])
print("Transition Matrix:\n", p)
s = np.array([0.5, 0.5])
for i in range(10):
    s = np.dot(s, p)
    print("\nIter {0}. Probability vector s = {1}".format(i,s))
print("\nFinal Vector s={0}".format(s))

Transition Matrix:
 [[0.3 0.7]
 [0.2 0.8]]

Iter 0. Probability vector s = [0.25 0.75]

Iter 1. Probability vector s = [0.225 0.775]

Iter 2. Probability vector s = [0.2225 0.7775]

Iter 3. Probability vector s = [0.22225 0.77775]

Iter 4. Probability vector s = [0.222225 0.777775]

Iter 5. Probability vector s = [0.2222225 0.7777775]

Iter 6. Probability vector s = [0.22222225 0.77777775]

Iter 7. Probability vector s = [0.22222223 0.77777778]

Iter 8. Probability vector s = [0.22222222 0.77777778]

Iter 9. Probability vector s = [0.22222222 0.77777778]

Final Vector s=[0.22222222 0.77777778]


# Markov Reward Process

In [3]:
# Define the states
states = ["Distracted", "Study", "Take Exam", "Obtain Certificate"]

# Define the transitions with probabilities and rewards
transitions = {
    "Distracted": [("Distracted", 0.9, 0), ("Study", 0.1, -3)],
    "Study": [("Distracted", 0.5, 0), ("Study", 0.3, -3), ("Take Exam", 0.2, -2)],
    "Take Exam": [("Study", 0.4, -2), ("Obtain Certificate", 0.6, 10)],
    "Obtain Certificate": []
}

# Print the state variables
for state, transitions_list in transitions.items():
    print(f"State: {state}")
    for transition in transitions_list:
        print(f"  Transition to {transition[0]} with probability {transition[1]} and reward {transition[2]}")

State: Distracted
  Transition to Distracted with probability 0.9 and reward 0
  Transition to Study with probability 0.1 and reward -3
State: Study
  Transition to Distracted with probability 0.5 and reward 0
  Transition to Study with probability 0.3 and reward -3
  Transition to Take Exam with probability 0.2 and reward -2
State: Take Exam
  Transition to Study with probability 0.4 and reward -2
  Transition to Obtain Certificate with probability 0.6 and reward 10
State: Obtain Certificate


In [4]:
import numpy as np

# Define the states
states = ["Distracted", "Study", "Take Exam", "Obtain Certificate"]

# Define the transitions with probabilities and rewards
transitions = {
    "Distracted": [("Distracted", 0.9, 0), ("Study", 0.1, -3)],
    "Study": [("Distracted", 0.5, 0), ("Study", 0.3, -3), ("Take Exam", 0.2, -2)],
    "Take Exam": [("Study", 0.4, -2), ("Obtain Certificate", 0.6, 10)],
    "Obtain Certificate": []
}

# Create dictionaries for indexing
state_index = {state: i for i, state in enumerate(states)}

# Initialize matrices
transition_matrix = np.zeros((len(states), len(states)))
reward_matrix = np.zeros((len(states), len(states)))

# Populate the matrices
for state, transitions_list in transitions.items():
    i = state_index[state]
    for next_state, prob, reward in transitions_list:
        j = state_index[next_state]
        transition_matrix[i, j] = prob
        reward_matrix[i, j] = reward

# Print the transition matrix
print("Transition Matrix:")
print("\t\t\t " + " ".join(f"{state:>20}" for state in states))
for i, state in enumerate(states):
    print(f"{state:>20}", end=" ")
    for j in range(len(states)):
        print(f"{transition_matrix[i, j]:>20.2f}", end=" ")
    print()

# Print the reward matrix
print("\nReward Matrix:")
print("\t\t\t " + " ".join(f"{state:>20}" for state in states))
for i, state in enumerate(states):
    print(f"{state:>20}", end=" ")
    for j in range(len(states)):
        print(f"{reward_matrix[i, j]:>20.2f}", end=" ")
    print()


Transition Matrix:
			           Distracted                Study            Take Exam   Obtain Certificate
          Distracted                 0.90                 0.10                 0.00                 0.00 
               Study                 0.50                 0.30                 0.20                 0.00 
           Take Exam                 0.00                 0.40                 0.00                 0.60 
  Obtain Certificate                 0.00                 0.00                 0.00                 0.00 

Reward Matrix:
			           Distracted                Study            Take Exam   Obtain Certificate
          Distracted                 0.00                -3.00                 0.00                 0.00 
               Study                 0.00                -3.00                -2.00                 0.00 
           Take Exam                 0.00                -2.00                 0.00                10.00 
  Obtain Certificate                 0.00      

In [4]:

# Define the states and transitions
states = ["Distracted", "Study", "Take Exam", "Obtain Certificate"]

transitions = {
    "Distracted": [("Distracted", 0.9, 0), ("Study", 0.1, -3)],
    "Study": [("Distracted", 0.5, 0), ("Study", 0.3, -3), ("Take Exam", 0.2, -2)],
    "Take Exam": [("Study", 0.4, -2), ("Obtain Certificate", 0.6, 10)],
    "Obtain Certificate": []
}

def get_transition_reward(current_state, next_state):
    transitions_list = transitions.get(current_state, [])
    for state, prob, reward in transitions_list:
        if state == next_state:
            return reward
    return 0  # Return 0 if no transition exists (shouldn't happen if path is valid)

def calculate_total_reward(path):
    total_reward = 0
    for i in range(len(path) - 1):
        current_state = path[i]
        next_state = path[i + 1]
        reward = get_transition_reward(current_state, next_state)
        total_reward += reward
    
    return total_reward

# Example path
path = ["Study", "Distracted","Take Exam", "Obtain Certificate"]

# Calculate and print the total reward for the path
total_reward = calculate_total_reward(path)
print(f"The total reward for the path {path} is: {total_reward}")


The total reward for the path ['Study', 'Distracted', 'Take Exam', 'Obtain Certificate'] is: 10


In [24]:

# Define the states and transitions
states = ["Distracted", "Study", "Take Exam", "Obtain Certificate"]

transitions = {
    "Distracted": [("Distracted", 0.9, 0), ("Study", 0.1, -3)],
    "Study": [("Distracted", 0.5, 0), ("Study", 0.3, -3), ("Take Exam", 0.2, -2)],
    "Take Exam": [("Study", 0.4, -2), ("Obtain Certificate", 0.6, 10)],
    "Obtain Certificate": []
}

def get_transition_reward(current_state, next_state):
    transitions_list = transitions.get(current_state, [])
    for state, prob, reward in transitions_list:
        if state == next_state:
            return reward
    return 0  # Return 0 if no transition exists (shouldn't happen if path is valid)

def calculate_total_reward(path, gamma):
    total_reward = 0
    for i in range(len(path) - 1):
        current_state = path[i]
        next_state = path[i + 1]
        reward = get_transition_reward(current_state, next_state)
        total_reward += ((0.9)**i)*reward
    
    return total_reward

# Example path
path = ["Distracted", "Study", "Take Exam", "Obtain Certificate"]
gamma = 0.9
    
# Calculate and print the total reward for the path
total_reward = calculate_total_reward(path, gamma)
print(f"The total reward for the path {path} is: {total_reward}")


The total reward for the path ['Distracted', 'Study', 'Take Exam', 'Obtain Certificate'] is: 3.3000000000000016


In [29]:
import numpy as np

# Define the states
states = ["Distracted", "Study", "Take exam", "Obtain Certificate"]

# Define the transition probabilities matrix
P = np.array([
    [0.9, 0.1, 0.0, 0.0],  # Distracted -> [Distracted, Study, Take Exam, Obtain Certificate]
    [0.3, 0.5, 0.2, 0.0],  # Study -> [Distracted, Study, Take Exam, Obtain Certificate]
    [0.0, 0.4, 0.0, 0.6],  # Take Exam -> [Distracted, Study, Take Exam, Obtain Certificate]
    [0.0, 0.0, 0.0, 1.0]   # Obtain Certificate -> [Distracted, Study, Take Exam, Obtain Certificate]
])

# Define the reward matrix corresponding to each transition
R= np.array([
    [0, -3, 0, 0],    # Rewards for transitions from Distracted
    [-3, -2, -2, 0],    # Rewards for transitions from Study
    [0, 0, 0, 10],    # Rewards for transitions from Take Exam
    [0, 0, 0, 0]      # Rewards for transitions from Obtain Certificate
])
def simulate_markov(initial_state, P, R, steps=4):
    current_state = initial_state
    total_reward = 0
    state_history = [states[current_state]]

    for _ in range(steps):
        next_state = np.random.choice(len(states), p=P[current_state])
        reward = R[current_state, next_state]
        total_reward += reward
        state_history.append(states[next_state])
        current_state = next_state

        # If reached terminal state
        if current_state == len(states) - 1:
            break

    return state_history, total_reward

# Initial state: Distracted (index 0)
initial_state = 0
state_history, total_reward = simulate_markov(initial_state, P, R)

print("State history:", state_history)
print("Total reward:", total_reward)

State history: ['Distracted', 'Distracted', 'Distracted', 'Study', 'Distracted']
Total reward: -6
