Copyright **`(c)`** 2024 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free under certain conditions — see the [`license`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [76]:
from tqdm.auto import tqdm
from icecream import ic

In [78]:
# Corridor world: states 0 .. LENGTH-1 laid out on a line.
LENGTH = 10

STATES = tuple(range(LENGTH))
# Interior states may step left (-1) or right (+1); the two end states have no
# actions. `set()` is used (not `{}`, which is an empty dict) so that every
# value stored in ACTIONS has the same type.
ACTIONS = {s: {-1, +1} for s in range(1, LENGTH - 1)}
ACTIONS[0] = set()
ACTIONS[LENGTH - 1] = set()
# Every (state, action) pair costs 1, except the two moves that step onto an
# end state, which are overridden below.
REWARD = {(s, a): -1 for s in STATES for a in ACTIONS[s]}
REWARD[1, -1] = 7
REWARD[LENGTH - 2, 1] = 15

In [79]:
# Random policy
def random_policy(s):
    """Return the uniform policy for state ``s``.

    The result is a set of ``(probability, action)`` pairs, one per action in
    ``ACTIONS[s]``, each with probability ``1 / len(ACTIONS[s])``. A state
    without actions yields an empty set (the division is never evaluated).
    """
    moves = ACTIONS[s]
    return {(1 / len(moves), move) for move in moves}


# Greedy policy
def create_greedy_policy(value):
    """Build the policy that is greedy with respect to ``value``.

    For every state in STATES the one-step return of moving left
    (``REWARD[s, -1] + value[s - 1]``) is compared with that of moving right
    (``REWARD[s, +1] + value[s + 1]``).

    Returns a dict mapping each state to a collection of
    ``(probability, action)`` pairs: the better move with probability 1, or
    both moves with probability 0.5 on a tie. The two end states (0 and
    LENGTH - 1) map to an empty ``{}``.
    """
    last = LENGTH - 1
    policy = {}
    for s in STATES:
        # End states have nothing to choose from.
        if s in (0, last):
            policy[s] = {}
            continue
        go_left = REWARD[s, -1] + value[s - 1]
        go_right = REWARD[s, +1] + value[s + 1]
        if go_left > go_right:
            policy[s] = {(1, -1)}
        elif go_left < go_right:
            policy[s] = {(1, 1)}
        else:
            policy[s] = {(0.5, -1), (0.5, 1)}
    return policy

In [83]:
# Iterative evaluation of the uniform random policy: each sweep builds a
# complete new table from the previous one, then swaps it in.
value = dict.fromkeys(STATES, 0)

for _ in tqdm(range(20)):
    new_value = {
        s: sum(p * (REWARD[s, a] + value[s + a]) for p, a in random_policy(s))
        for s in STATES
    }
    # Largest per-state change in this sweep. ic echoes the expression text in
    # its output, so this line is kept exactly as written.
    ic(max(abs(value[s]-new_value[s]) for s in STATES))
    value = new_value.copy()

  0%|          | 0/20 [00:00<?, ?it/s]

ic| max(abs(value[s]-new_value[s]) for s in STATES): 7.0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 3.0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 1.5
ic| max(abs(value[s]-new_value[s]) for s in STATES): 1.25
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.6875
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.65625
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.53125
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.5078125
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.4375
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.40625
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.37109375
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.333984375
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.319580078125
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.2803955078125
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.27764892578125
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0.24603271484375
ic| max(abs(value[s]-new_value[s]) for s in STATE

In [81]:
# Synchronous sweeps under the greedy policy: the policy is re-derived from the
# current table at the start of each sweep, and a complete new table is built
# from the previous one before it is swapped in.
value = dict.fromkeys(STATES, 0)

for _ in tqdm(range(10)):
    policy = create_greedy_policy(value)
    new_value = {
        s: sum(p * (REWARD[s, a] + value[s + a]) for p, a in policy[s])
        for s in STATES
    }
    # Largest per-state change in this sweep. ic echoes the expression text in
    # its output, so this line is kept exactly as written.
    ic(max(abs(value[s]-new_value[s]) for s in STATES))
    value = new_value.copy()

  0%|          | 0/10 [00:00<?, ?it/s]

ic| max(abs(value[s]-new_value[s]) for s in STATES): 15
ic| max(abs(value[s]-new_value[s]) for s in STATES): 15.0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 15.0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 15.0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 7
ic| max(abs(value[s]-new_value[s]) for s in STATES): 5
ic| max(abs(value[s]-new_value[s]) for s in STATES): 3
ic| max(abs(value[s]-new_value[s]) for s in STATES): 1
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0
ic| max(abs(value[s]-new_value[s]) for s in STATES): 0


In [None]:
# In-place sweeps under the greedy policy: each state's entry is overwritten
# immediately, so states visited later in the same sweep already read the
# updated values of their left-hand neighbours.
value = {s: 0 for s in STATES}

for _ in tqdm(range(100)):
    policy = create_greedy_policy(value)
    # The table must NOT be re-created here: resetting it to an empty dict
    # makes value[s + a] raise KeyError for any neighbour not yet revisited.
    for s in STATES:
        value[s] = sum(p * (REWARD[s, a] + value[s + a]) for p, a in policy[s])

{0: 0,
 1: 33,
 2: 34,
 3: 35,
 4: 36,
 5: 37,
 6: 38,
 7: 39,
 8: 40,
 9: 41,
 10: 42,
 11: 43,
 12: 44,
 13: 45,
 14: 46,
 15: 47,
 16: 48,
 17: 49,
 18: 50,
 19: 0}