## Reprezentacija Markovljevog procesa odlučivanja

In [1]:
import numpy as np

In [2]:
# Problem dimensions: states s0..s2 and actions a0..a2.
n_states, n_actions = 3, 3

In [3]:
# Transition probabilities, indexed as T[state][action][next_state].
# None stands in for a (state, action) pair that has no transition row.
T = [
    [  # s0
        [0.7, 0.3, 0],  # a0
        [1.0, 0, 0],    # a1
        [0.8, 0.2, 0],  # a2
    ],
    [  # s1
        [0, 1.0, 0],    # a0
        None,           # a1
        [0, 0, 1.0],    # a2
    ],
    [  # s2
        None,             # a0
        [0.8, 0.1, 0.1],  # a1
        None,             # a2
    ],
]
T

[[[0.7, 0.3, 0], [1.0, 0, 0], [0.8, 0.2, 0]],
 [[0, 1.0, 0], None, [0, 0, 1.0]],
 [None, [0.8, 0.1, 0.1], None]]

In [4]:
# First index selects the current state: all per-action rows for state s0.
T_s0 = T[0]
T_s0

[[0.7, 0.3, 0], [1.0, 0, 0], [0.8, 0.2, 0]]

In [5]:
# Second index selects the action: next-state probabilities for action a2 taken in s0.
T_s0_a2 = T_s0[2]
T_s0_a2

[0.8, 0.2, 0]

In [6]:
# Third index selects the next state: probability of reaching s1 after taking a2 in s0.
T_s0_a2_s1 = T_s0_a2[1]
T_s0_a2_s1

0.2

In [7]:
# Rewards, indexed as R[state, action, next_state]. Every transition pays 0
# except the three set explicitly below.
R = np.zeros((n_states, n_actions, n_states))
R[0, 0, 0] = 10.0   # s0 --a0--> s0
R[1, 2, 2] = -50.0  # s1 --a2--> s2
R[2, 1, 0] = 40.0   # s2 --a1--> s0
R

array([[[ 10.,   0.,   0.],
        [  0.,   0.,   0.],
        [  0.,   0.,   0.]],

       [[  0.,   0.,   0.],
        [  0.,   0.,   0.],
        [  0.,   0., -50.]],

       [[  0.,   0.,   0.],
        [ 40.,   0.,   0.],
        [  0.,   0.,   0.]]])

## Q-iteracija

In [8]:
# Action indices that may be taken in each state (row i belongs to state i).
s_posible_actions = [
    [0, 1, 2],  # s0
    [0, 2],     # s1
    [1],        # s2
]

# Start every entry at -inf so that a (state, action) pair which is not
# available can never be picked by max/argmax, then set the available pairs
# to 0.  The shape is taken from n_states / n_actions (rather than a literal
# 3x3) so it stays in step with the other arrays if those constants change.
Q_values = np.full(shape=(n_states, n_actions), fill_value=-np.inf)
for state, actions in enumerate(s_posible_actions):
    # Integer + list index in a single step: assigns all listed columns of the row.
    Q_values[state, actions] = 0.0

Q_values

array([[  0.,   0.,   0.],
       [  0., -inf,   0.],
       [-inf,   0., -inf]])

In [9]:
# Discount factor: multiplies the best next-state Q-value in the Q-iteration update.
gamma = 0.98

In [10]:
# Q-value iteration.  Each sweep applies, for every available (s, a) pair,
#   Q(s, a) <- sum over sp of T[s][a][sp] * (R[s, a, sp] + gamma * max_a' Q_prev(sp, a'))
# where Q_prev holds the values from the previous sweep.
n_iterations = 50

for iteration in range(n_iterations):

    # Snapshot, so all updates in this sweep read the previous sweep's values
    # even though Q_values is overwritten in place.
    Q_prev = Q_values.copy()

    # Best Q-value per state.  It depends only on Q_prev, so it is computed
    # once per sweep instead of once for every (s, a, sp) term.
    best_next = np.max(Q_prev, axis=1)

    for s in range(n_states):
        # Only available actions are updated; the rest keep their -inf marker.
        for a in s_posible_actions[s]:
            Q_values[s, a] = np.sum([
                T[s][a][sp] * (R[s, a, sp] + gamma * best_next[sp])
                for sp in range(n_states)
            ])

# argmax along axis 1 gives, for each state, the action with the highest Q-value.
print("Najbolji niz akcija: " + str(np.argmax(Q_values , axis = 1)))
print("Matrica Q-vrednosti:\n" + str(Q_values))

Najbolji niz akcija: [0 2 1]
Matrica Q-vrednosti:
[[42.88147586 41.70278262 37.82191144]
 [22.29842674        -inf 23.08111272]
 [       -inf 74.90018004        -inf]]
