# In [1]:
import numpy as np
import random

class RolloutAgent:
    """Pick actions by Monte Carlo rollouts with a uniformly random playout policy.

    The environment passed in must provide:

    * ``clone()`` -- return an environment object the agent may mutate freely,
    * ``set_state(state)`` -- put the environment into ``state``,
    * ``get_possible_actions()`` -- return a sequence of actions,
    * ``step(action)`` -- apply ``action`` and return a 4-tuple
      ``(state, reward, done, info)``.
    """

    def __init__(self, env, max_rollouts=100):
        """Store the environment and the number of rollouts per action.

        Args:
            env: Environment implementing the interface described on the class.
            max_rollouts: Number of random playouts averaged for each
                candidate action. Must be at least 1.

        Raises:
            ValueError: If ``max_rollouts`` is smaller than 1 (the average in
                ``rollout`` would otherwise divide by zero).
        """
        if max_rollouts < 1:
            raise ValueError("max_rollouts must be at least 1")
        self.env = env
        self.max_rollouts = max_rollouts

    def rollout(self, state, action):
        """Estimate the return of taking ``action`` in ``state``.

        Each playout starts from a clone of the environment set to ``state``,
        applies ``action``, then follows uniformly random actions until the
        environment reports ``done``. Rewards are summed undiscounted.

        Args:
            state: State to evaluate from.
            action: First action of every playout.

        Returns:
            The mean total reward over ``max_rollouts`` playouts.
        """
        total_reward = 0
        for _ in range(self.max_rollouts):
            # Work on a clone so the agent's own environment is never mutated.
            rollout_env = self.env.clone()
            rollout_env.set_state(state)

            # The evaluated action's own outcome is part of the return, and it
            # may already end the episode -- in that case no playout follows.
            _, reward, done, _ = rollout_env.step(action)
            rollout_reward = reward

            while not done:
                rollout_action = random.choice(rollout_env.get_possible_actions())
                _, reward, done, _ = rollout_env.step(rollout_action)
                rollout_reward += reward

            total_reward += rollout_reward
        return total_reward / self.max_rollouts

    def choose_action(self, state):
        """Return the action with the highest estimated return in ``state``.

        Ties are resolved in favour of the action listed first by the
        environment (``np.argmax`` returns the first maximum).

        Args:
            state: State for which an action is requested.

        Returns:
            One element of the environment's possible actions.

        Raises:
            ValueError: If the environment offers no actions in ``state``.
        """
        # Query the actions on a clone placed in ``state`` so the answer does
        # not depend on whatever state ``self.env`` currently holds.
        probe_env = self.env.clone()
        probe_env.set_state(state)
        possible_actions = probe_env.get_possible_actions()
        if not possible_actions:
            raise ValueError("no possible actions available in the given state")

        action_values = [self.rollout(state, action) for action in possible_actions]
        best_action = possible_actions[np.argmax(action_values)]
        return best_action

# Example Usage
class GridWorld:
    """Square grid world with the goal in the bottom-right corner.

    States are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and the
    episode ends when it reaches ``(grid_size - 1, grid_size - 1)``.
    """

    def __init__(self, grid_size=5):
        """Create a grid of ``grid_size`` x ``grid_size`` cells, agent at ``(0, 0)``.

        Args:
            grid_size: Number of rows and columns. Defaults to 5.
        """
        self.state = (0, 0)
        self.grid_size = grid_size

    def set_state(self, state):
        """Place the agent at ``state``, a ``(row, col)`` tuple."""
        self.state = state

    def get_possible_actions(self):
        """Return the four move names; the list is the same in every state."""
        return ['up', 'down', 'left', 'right']

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        A move that would leave the grid, or an unrecognised action name,
        leaves the position unchanged but is still charged as a step.

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.

        Returns:
            A 4-tuple of the new ``(row, col)`` state, the reward (``-1`` for
            every step that does not end on the goal, ``0`` on the goal), a
            ``done`` flag that is true on the goal, and an empty info dict.
        """
        row, col = self.state
        last = self.grid_size - 1

        if action == 'up' and row > 0:
            self.state = (row - 1, col)
        elif action == 'down' and row < last:
            self.state = (row + 1, col)
        elif action == 'left' and col > 0:
            self.state = (row, col - 1)
        elif action == 'right' and col < last:
            self.state = (row, col + 1)

        done = self.state == (last, last)
        reward = 0 if done else -1  # -1 for each step, 0 at goal
        return self.state, reward, done, {}

    def clone(self):
        """Return an independent copy with the same grid size and position."""
        duplicate = GridWorld(self.grid_size)
        # The state is an immutable tuple, so sharing the reference is safe.
        duplicate.state = self.state
        return duplicate

# Demo: ask the rollout agent for a move from the top-left corner of the grid.
initial_state = (0, 0)

env = GridWorld()
rollout_agent = RolloutAgent(env)

# Evaluate every available action with rollouts and keep the best-scoring one.
best_action = rollout_agent.choose_action(initial_state)
print("Best action to take from state", initial_state, ":", best_action)

# Example output (rollouts are random, so the chosen action can vary between runs):
# Best action to take from state (0, 0) : down
