# In [2]:
import numpy as np

class RTDP:
    """Tabular dynamic-programming solver for a deterministic MDP.

    NOTE: despite the class name, ``run`` does not sample trajectories; each
    iteration sweeps every state in ``state_space`` and applies a greedy
    one-step backup in place (later states in a sweep already see the
    values updated earlier in the same sweep).

    States are used directly as indices into ``value_function`` and
    ``policy``, so they must be the integers ``0 .. len(state_space) - 1``.
    ``policy[s]`` stores an index into ``action_space``, not the action
    object itself.

    Args:
        state_space: Iterable of integer states (must support ``len``).
        action_space: Sequence of actions passed to the two model callables.
        transition_model: ``f(state, action) -> next_state``.
        reward_model: ``f(state, action, next_state) -> reward``.
        gamma: Discount factor applied to the successor state's value.
        max_iterations: Upper bound on the number of full sweeps.
        tolerance: A sweep whose largest value change is ``<= tolerance``
            ends ``run`` early. With the default ``0.0`` the loop stops only
            once a sweep changes nothing, which yields the same values and
            policy as running every remaining sweep.
        terminal_states: States that are never backed up; their value stays
            0 and their policy entry stays 0. Empty by default.
    """

    def __init__(self, state_space, action_space, transition_model, reward_model,
                 gamma=0.9, max_iterations=1000, tolerance=0.0, terminal_states=()):
        self.state_space = state_space
        self.action_space = action_space
        self.transition_model = transition_model
        self.reward_model = reward_model
        self.gamma = gamma
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        # A frozenset gives O(1) membership tests inside the sweep and
        # protects against later mutation of the caller's container.
        self.terminal_states = frozenset(terminal_states)

        self.value_function = np.zeros(len(state_space))
        self.policy = np.zeros(len(state_space), dtype=int)

    def _action_value(self, state, action):
        """Return reward plus discounted successor value for one action."""
        next_state = self.transition_model(state, action)
        reward = self.reward_model(state, action, next_state)
        return reward + self.gamma * self.value_function[next_state]

    def run(self):
        """Sweep all states until the values settle or the budget runs out.

        Updates ``value_function`` and ``policy`` in place; returns ``None``.
        """
        for _ in range(self.max_iterations):
            largest_change = 0.0
            for state in self.state_space:
                if state in self.terminal_states:
                    continue
                action_values = [
                    self._action_value(state, action)
                    for action in self.action_space
                ]
                # np.argmax returns the first maximum, so exact ties resolve
                # to the earliest entry of action_space.
                best_action = int(np.argmax(action_values))
                best_value = action_values[best_action]
                largest_change = max(
                    largest_change,
                    abs(best_value - self.value_function[state]),
                )
                self.value_function[state] = best_value
                self.policy[state] = best_action
            # Once a whole sweep changes (almost) nothing, further sweeps
            # would recompute the same values and greedy actions.
            if largest_change <= self.tolerance:
                break

def transition_model(state, action, width=3, height=3):
    """Return the successor of ``state`` in a row-major grid world.

    Cells are numbered left to right, top to bottom, so ``state`` sits at
    row ``state // width`` and column ``state % width``. A move that would
    leave the grid keeps the agent in place.

    Args:
        state: Integer cell index.
        action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
        width: Number of columns (default 3, the original grid).
        height: Number of rows (default 3, the original grid).

    Returns:
        The integer index of the resulting cell.

    Raises:
        ValueError: If ``action`` is not one of the four moves. Falling
            through and returning ``None`` would otherwise be used as an
            array index by the caller without any error.
    """
    row, col = divmod(state, width)
    if action == 'up':
        return state - width if row > 0 else state
    if action == 'down':
        return state + width if row < height - 1 else state
    if action == 'left':
        return state - 1 if col > 0 else state
    if action == 'right':
        return state + 1 if col < width - 1 else state
    raise ValueError(f"unknown action: {action!r}")

def reward_model(state, action, next_state):
    """Return the reward for moving from ``state`` to ``next_state``.

    Every transition incurs the same flat penalty; none of the arguments
    influence the result.
    """
    step_penalty = -1
    return step_penalty

# Problem definition: nine integer states (cells 0-8) and four named moves.
state_space = np.arange(9)
action_space = ['up', 'down', 'left', 'right']

# Build the solver with its default discount factor and iteration budget,
# then compute the value function and greedy policy.
rtdp = RTDP(
    state_space=state_space,
    action_space=action_space,
    transition_model=transition_model,
    reward_model=reward_model,
)
rtdp.run()

# NOTE(review): reward_model returns -1 for every transition and no state is
# terminal, so the action values presumably end up tied and the reported
# action is just argmax's first-index tie-break -- confirm whether a goal
# state was intended.
# The policy holds indices into action_space; translate them for display.
print("Optimal Policy:")
for i, action in enumerate(rtdp.policy):
    print(f"State {i}: {action_space[action]}")

# Output of the cell above:
# Optimal Policy:
# State 0: up
# State 1: up
# State 2: up
# State 3: up
# State 4: up
# State 5: up
# State 6: up
# State 7: up
# State 8: up
