In [17]:
from mdp import generate_cliffworld, value_iteration, v_star
import numpy as np
# Unpack the five objects returned by generate_cliffworld(). Judging by the names
# these are the state set, action set, initial-state distribution, transition
# tensor and reward — TODO confirm against mdp.py.
state_space, action_space, rho_cf, P_cf, r_cf = generate_cliffworld()

# Matrix formulation of policy iteration and density iteration

Now `P_cf` is a $21\times4\times21$ array indexed as (state, action, next state). According to basic policy evaluation,
$$Q^\pi = r + \gamma P V^\pi$$
where $Q^\pi\in\mathbb R^{|\mathcal S||\mathcal A|}$ and $V^\pi\in\mathbb R^{|\mathcal S|}$. In this formulation $P$ must therefore be a matrix
$$P\in\mathbb R^{|\mathcal S||\mathcal A|\times |\mathcal S|},$$
where $P_{ij}$ is the probability of transitioning to next state $s'_j$ after taking state-action pair $i$ (out of $n\times m$ pairs in total, with $n=|\mathcal S|$ and $m=|\mathcal A|$).
Explicitly, the rows of $P$ are stacked state by state, with the actions of each state in order:
$$
P = \begin{bmatrix}
    P(s'\mid s_1, a_1)\\
    P(s'\mid s_1, a_2)\\
    \vdots\\
    P(s'\mid s_n, a_m)
\end{bmatrix}
$$

For the Q-version of the policy evaluation, we have

$$
Q^\pi = r + \gamma P\Pi Q^\pi
$$
Here $\Pi\in\mathbb R^{|\mathcal S|\times|\mathcal S||\mathcal A|}$. We write $\Pi$ as a block-diagonal matrix whose $i$-th diagonal block is the row vector $\pi(\cdot\mid s_i)\in\mathbb R^{1\times|\mathcal A|}$:
$$
\Pi = \begin{bmatrix}
\pi(a\mid s = s_1) & 0 & \dots & 0\\
0 & \pi(a\mid s = s_2) & \dots & 0 \\
\vdots & \vdots & \ddots & \vdots\\
0 & 0 & \dots & \pi(a\mid s = s_n)
\end{bmatrix}
$$

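Then the matrix product of $P$ and $\Pi$ is square:
$$
P\Pi\in\mathbb R^{|\mathcal S||\mathcal A|\times|\mathcal S||\mathcal A|}
$$
Because $P\Pi$ is row-stochastic and $\gamma<1$, the matrix $I-\gamma P\Pi$ is invertible, so the evaluation equation has the closed-form solution $Q^\pi = (I-\gamma P\Pi)^{-1}r$, and the state values are recovered as $V^\pi = \Pi Q^\pi$.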

In [18]:
# optimal policy
gamma = 0.9  # discount factor (the gamma of the evaluation equations above)
K = 100  # fourth positional argument of value_iteration — presumably the iteration cap; confirm in mdp.py
# v_star() is passed through to value_iteration; the returned triple is unpacked as
# (values, policy, gaps) — names only, verify the exact semantics in mdp.py.
v_vi, pi_vi, gaps_vi = value_iteration(P_cf, r_cf, gamma, K, v_star(), theta=1e-4)
pi_vi

Converged at iteration 6


array([3., 0., 0., 0., 0., 1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 2., 0.])

In [25]:
def make_p_mat(P):
    """Flatten a transition tensor into the stacked state-action matrix.

    The dimensions are taken from ``P`` itself rather than from the notebook
    globals ``state_space`` / ``action_space``, so the function works for any
    MDP and does not depend on which cells were run before it.

    Parameters
    ----------
    P : array-like of shape (n_states, n_actions, n_next_states)
        ``P[s, a]`` is the vector of next-state probabilities for taking
        action ``a`` in state ``s``.

    Returns
    -------
    numpy.ndarray of shape (n_states * n_actions, n_next_states)
        A new float array in which row ``s * n_actions + a`` equals ``P[s, a]``.

    Raises
    ------
    ValueError
        If ``P`` is not 3-dimensional.
    """
    # np.array always copies, so the caller's tensor is never aliased
    # (the loop-based version also returned a freshly allocated float array).
    tensor = np.array(P, dtype=float)
    if tensor.ndim != 3:
        raise ValueError(
            f"P must be 3-dimensional (state, action, next state), got shape {tensor.shape}"
        )
    n_states, n_actions, n_next = tensor.shape
    # Row-major reshape places the pair (s, a) on row s * n_actions + a.
    return tensor.reshape(n_states * n_actions, n_next)
# Stacked transition matrix for the cliffworld MDP: one row per (state, action) pair.
Pmat = make_p_mat(P_cf)

In [26]:
# Build the policy matrix Pi (one row per state) from a deterministic policy
def make_pi_mat(pi, n_actions=None):
    """Build the policy-selection matrix Pi for a deterministic policy.

    Row ``i`` of the result is one-hot at the flattened state-action index
    ``i * n_actions + pi[i]``, so ``Pi @ q`` picks ``q(s_i, pi[i])`` for each
    state ``s_i``.

    Parameters
    ----------
    pi : sequence of length n_states
        Action index chosen in each state. Entries are converted with
        ``int()``, so float-valued action indices are accepted.
    n_actions : int, optional
        Number of actions per state. When omitted, ``len(action_space)`` from
        the notebook namespace is used, which matches the previous behaviour.

    Returns
    -------
    numpy.ndarray of shape (n_states, n_states * n_actions)
        The number of states is taken from ``len(pi)``.

    Raises
    ------
    ValueError
        If an action index lies outside ``[0, n_actions)``. Without this check
        such a value would silently set a column that belongs to a different
        state's block.
    """
    if n_actions is None:
        n_actions = len(action_space)
    actions = [int(a) for a in pi]
    n_states = len(actions)
    for state, action in enumerate(actions):
        if not 0 <= action < n_actions:
            raise ValueError(
                f"action {action} for state {state} is outside [0, {n_actions})"
            )
    pi_mat = np.zeros([n_states, n_states * n_actions])
    rows = np.arange(n_states)
    # Column offset i * n_actions selects state i's block; the action picks within it.
    pi_mat[rows, rows * n_actions + np.array(actions, dtype=int)] = 1
    return pi_mat
# Selection matrix for the policy pi_vi obtained from value iteration.
Pimat = make_pi_mat(pi_vi)

In [46]:
# Closed-form policy evaluation: solve (I - gamma * P @ Pi) q = r for q.
# Despite its name, `v_star_mat` holds one value per (state, action) pair
# (it has Pmat.shape[0] rows); per-state values are recovered via `Pimat @ ...`.
# np.linalg.solve is used instead of forming the explicit inverse, which is
# cheaper and numerically more accurate for a single right-hand side.
# Assumes r_cf flattens in the same state-major order as Pmat's rows — TODO confirm.
v_star_mat = np.linalg.solve(
    np.eye(Pmat.shape[0]) - gamma * Pmat @ Pimat,
    r_cf.reshape([-1, 1]),
)
# Element-wise comparison of the recovered state values with v_star().
np.isclose(Pimat @ v_star_mat.squeeze(), v_star())

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])