### Exercise 8.4 (programming) 

**Q**

The exploration bonus described above actually changes the estimated values of states and actions. Is this necessary? Suppose the bonus $\kappa \sqrt{\tau}$ was used not in updates, but solely in action selection. That is, suppose the action selected was always that for which $Q(S_t, a) + \kappa \sqrt{\tau(S_t, a)}$ was maximal. Carry out a gridworld experiment that tests and illustrates the strengths and weaknesses of this alternate approach.

**A**

In [10]:
class BaseEnv:
    """Minimal environment interface: state/action counts plus a step counter.

    ``step`` is abstract and must be overridden. ``reset`` only zeroes the
    step counter here.
    """

    def __init__(self, n_states: int, n_actions: int):
        """Store the sizes of the state and action spaces.

        Args:
            n_states: Number of states.
            n_actions: Number of actions.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        # Define the counter up front so it can be read before the first
        # reset() instead of raising AttributeError.
        self.steps = 0

    def step(self, action: int) -> tuple[int, float, bool, bool]:
        """Apply ``action`` to the environment; not implemented in the base class.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def reset(self, seed: int = 0) -> int:
        """Zero the step counter.

        ``seed`` is accepted but not used by this base implementation.

        NOTE: despite the ``int`` annotation, this base implementation has no
        return statement and therefore returns ``None``.
        """
        self.steps = 0

class BaseAgent:
    """Skeleton for an agent bound to a single environment.

    ``act`` and ``train`` are abstract; ``reset`` is a no-op hook.
    """

    def __init__(self, env: BaseEnv):
        """Keep a reference to the environment the agent interacts with."""
        self.env = env

    def act(self, state: int) -> int:
        """Return the action for ``state``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def train(self, steps: int):
        """Train the agent; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def reset(self):
        """Do nothing and return ``None``."""
        return None

from numpy.random import MT19937, Generator

def random_generator(seed: int | None = None):
    bg = MT19937(seed)
    rg = Generator(bg)
    return rg

In [11]:
import numpy as np

class BranchingEnv(BaseEnv):
    """Randomly generated episodic task with two actions and branching factor ``b``.

    ``reset`` samples, for every (state, action) pair, ``b`` candidate
    transitions, each a ``(next_state, reward)`` pair. ``step`` picks one
    candidate uniformly at random and, independently, ends the episode with
    probability 0.1.
    """

    def __init__(self, n_states: int, b=1, mean=0, deviation=1):
        """Configure the environment; no transitions exist until ``reset``.

        Args:
            n_states: Number of states; must be at least ``b + 1``.
            b: Number of candidate transitions per (state, action) pair;
                must be at least 1.
            mean: Mean of the normal distribution rewards are drawn from.
            deviation: Standard deviation of that distribution.

        Raises:
            AssertionError: If either size constraint is violated.
        """
        assert n_states >= b + 1, (
            f"The number of different states ({n_states}) must be at least b + 1 ({b + 1})"
        )
        assert b >= 1, f"The branching ({b}) must be 1 or higher"

        actions = [0, 1]
        n_actions = len(actions)

        super().__init__(
            n_states=n_states,
            n_actions=n_actions)

        self.b = b
        self.steps = 0  # steps taken since the last reset()
        self.all_steps = 0  # steps taken over the object's lifetime; never reset
        self.state: int | None = None  # None until reset() and after termination
        self.rg = random_generator()
        self.mean = mean
        self.deviation = deviation
        # Indexed as transitions[state][action][branch] -> (next_state, reward).
        self.transitions: list[list[list[tuple[int, float]]]] | None = None

    def reset(self, seed: int = 0) -> int:
        """Rebuild the transition table from ``seed`` and start a new episode.

        The table is regenerated on every call. The generator used to build
        it is kept and reused by ``step`` for the rest of the episode.

        Args:
            seed: Seed for the random generator.

        Returns:
            The initial state, which is always 0.
        """
        state = 0
        rg = random_generator(seed)
        n_states = self.n_states
        self.steps = 0
        self.state = state
        self.transitions = [
            [
                [
                    (
                        # An offset in [1, n_states - 1] guarantees that the
                        # successor differs from s. Drawing it with integers()
                        # avoids materialising the whole range as an array for
                        # every single draw.
                        (s + rg.integers(1, n_states)) % n_states,
                        rg.normal(loc=self.mean, scale=self.deviation),
                    )
                    for _ in range(self.b)
                ]
                for _ in range(self.n_actions)
            ]
            for s in range(n_states)
        ]
        self.rg = rg
        return state

    def step(self, action: int) -> tuple[int | None, float, bool, bool]:
        """Take ``action`` in the current state.

        Args:
            action: Index into the per-state action list.

        Returns:
            ``(next_state, reward, terminated, truncated)``. ``next_state`` is
            ``None`` when ``terminated`` is true; ``truncated`` is always
            ``False``. The sampled reward is returned on termination as well.

        Raises:
            AssertionError: If there is no current state, i.e. ``reset`` was
                never called or the previous step terminated the episode.
        """
        steps = self.steps + 1
        state = self.state

        assert state is not None, "The environment was not initialized"

        # Uniform pick among the b candidate transitions.
        b_chosen = self.rg.integers(self.b)
        next_state, reward = self.transitions[state][action][b_chosen]
        terminated = False
        truncated = False

        # Independent 10% chance of ending the episode on every step.
        if self.rg.random() < 0.1:
            terminated = True
            next_state = None

        self.steps = steps
        self.state = next_state
        self.all_steps += 1

        return next_state, reward, terminated, truncated

In [None]:
# Smoke test: drive the environment with uniformly random actions for 1000
# steps, printing every transition and the summed reward of each finished
# episode.
seed = 1
env = BranchingEnv(n_states=1000, b=1)
state = env.reset(seed)
rg = random_generator(seed)
rewards = 0
terminated = False
for _ in range(1000):
    if terminated:
        # The previous step ended the episode: report its total and start over.
        print('-' * 80)
        print(f'Cumulative Rewards: {rewards}')
        print('-' * 80)
        rewards = 0
        # Re-seeding with the same value rebuilds the same transition table.
        state = env.reset(seed)
    next_state, reward, terminated, _ = env.step(rg.choice(range(env.n_actions)))
    rewards += reward
    print(f'next_state={next_state}, reward={reward}, terminated={terminated}')
