In [None]:
import gymnasium as gym
import numpy as np
import time

In [None]:
# Build the FrozenLake environment with deterministic moves (is_slippery=False)
# and on-screen rendering (render_mode="human").
env = gym.make(
    "FrozenLake-v1",
    render_mode="human",
    is_slippery=False,
)

In [None]:
# Number of discrete states and actions, read from the environment's spaces.
n_states, n_actions = env.observation_space.n, env.action_space.n

In [None]:
# P[state][action] is a list of transition tuples:
#   (transition probability, next state, reward, terminated)
# Read it from the unwrapped environment: gym.make() returns the environment
# inside wrappers, and reaching P through them relies on deprecated forwarding.
env.unwrapped.P[0][0]

[(1.0, 0, 0.0, False)]

In [None]:
# gamma: discount factor applied to the value of successor states.
# theta: convergence threshold on the largest per-sweep change in V.
gamma, theta = 0.99, 1e-6

In [None]:
def argmax(env, V, pi, action, s, gamma):
    """Record the greedy action for state ``s`` under the value function ``V``.

    For every action ``a`` the action value
    ``q(s, a) = sum_{s'} p(s'|s, a) * (r + gamma * V[s'])`` is computed from the
    environment's transition model, and the first action with the largest
    value is selected.

    Parameters
    ----------
    env : object exposing the transition model as ``P`` (directly, or through
        an ``unwrapped`` attribute as Gymnasium wrappers do).
        ``P[s][a]`` is an iterable of ``(probability, next_state, reward, ...)``.
    V : indexable of state values.
    pi : 2-D policy table indexed ``pi[s][a]``; row ``s`` is overwritten with a
        one-hot encoding of the chosen action.
    action : indexable by state; ``action[s]`` is set to the chosen action.
    s : state whose greedy action is wanted.
    gamma : discount factor.

    Returns
    -------
    The same ``pi`` object, after the in-place update.
    """
    # Go straight to the base environment instead of assuming a fixed number
    # of wrapper layers (``env.env``); plain objects with ``P`` work as well.
    model = getattr(env, "unwrapped", env).P
    # Take the action count from the model itself rather than from a notebook
    # global, so the function still works when another environment is passed.
    num_actions = len(model[s])

    q_values = [
        sum(
            prob * (reward + gamma * V[int(next_state)])
            for prob, next_state, reward, *_ in model[s][a]
        )
        for a in range(num_actions)
    ]

    best = int(np.argmax(q_values))        # first maximum wins on ties
    action[s] = best
    # Rewrite the whole row so a reused ``pi`` never keeps a stale 1.
    for a in range(num_actions):
        pi[s][a] = 1 if a == best else 0

    return pi

In [None]:
def bellman_optimality_update(env, V, s, gamma):
    """Apply one Bellman optimality backup to state ``s``, in place.

    Computes ``q(s, a) = sum_{s'} p(s'|s, a) * (r + gamma * V[s'])`` for every
    action and overwrites ``V[s]`` with the largest of those values.

    Parameters
    ----------
    env : object exposing the transition model as ``P`` (directly, or through
        an ``unwrapped`` attribute as Gymnasium wrappers do).
        ``P[s][a]`` is an iterable of ``(probability, next_state, reward, ...)``.
    V : mutable indexable of state values; ``V[s]`` is updated.
    s : state to back up.
    gamma : discount factor.

    Returns
    -------
    The new value stored in ``V[s]``.
    """
    # Go straight to the base environment instead of assuming a fixed number
    # of wrapper layers (``env.env``); plain objects with ``P`` work as well.
    model = getattr(env, "unwrapped", env).P

    # One action value per action. The greedy policy puts all its weight on
    # the best action, so the backed-up value is simply the maximum; no
    # temporary policy table or second pass over the model is needed.
    q_values = [
        sum(
            prob * (reward + gamma * V[int(next_state)])
            for prob, next_state, reward, *_ in model[s][a]
        )
        for a in range(len(model[s]))
    ]

    V[s] = max(q_values)
    print("From Bellman Equality Fn ",V[s])
    return V[s]

In [None]:
def value_iteration(env, gamma, theta):
    """Run value iteration on ``env`` and extract the greedy policy.

    State values start at zero and are swept in place with
    ``bellman_optimality_update`` until the largest change in a full sweep is
    below ``theta``. The greedy action for every state is then obtained with
    ``argmax``.

    Parameters
    ----------
    env : environment with discrete ``observation_space`` and ``action_space``
        (both exposing ``.n``) and the transition model used by the helpers.
    gamma : discount factor.
    theta : convergence threshold on the per-sweep maximum value change.

    Returns
    -------
    (V, pi, action) where ``V`` holds one value per state, ``pi`` is a
    states-by-actions one-hot policy table, and ``action`` holds the chosen
    action index per state (stored as floats).
    """
    # Size everything from the environment that was passed in, not from
    # notebook globals that may belong to a different environment.
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    V = np.zeros(num_states)                 # arbitrary starting values: zeros
    while True:
        delta = 0
        for s in range(num_states):
            print("Iteration for state {}: =========================================".format(s))
            print("Before ",V[s])
            v = V[s]
            bellman_optimality_update(env, V, s, gamma)   # updates V[s] in place
            print("After ",V[s])
            delta = max(delta, abs(v - V[s]))             # largest change this sweep
        if delta < theta:
            break                                         # change is negligible: converged

    pi = np.zeros((num_states, num_actions))
    action = np.zeros(num_states)
    for s in range(num_states):
        pi = argmax(env, V, pi, action, s, gamma)         # greedy action for state s

    return V, pi, action

In [None]:
# Solve the environment: V = state values, pi = one-hot policy table,
# action = chosen action index per state.
V, pi, action = value_iteration(env, gamma, theta)

Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  1.0
After  1.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.0
After  0.0
Before  0.0
From Fn  0.99
After  0.99
Before  0.0
From Fn  0.0
A

In [None]:
# State values returned by value_iteration, one entry per state.
V

array([0.95099005, 0.96059601, 0.970299  , 0.96059601, 0.96059601,
       0.        , 0.9801    , 0.        , 0.970299  , 0.9801    ,
       0.99      , 0.        , 0.        , 0.99      , 1.        ,
       0.        ])

In [None]:
# One-hot policy table: row = state, column = action, 1 marks the chosen action.
pi

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [None]:
# Chosen action index for each state (float array filled in by argmax).
action

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [None]:
# Lookup table for acting: the list index is the state, the value is the
# action to take in that state (converted from the float `action` array to ints).
stateActionLookupArray = [int(a) for a in action]
# Show the integer lookup list itself, not the float array it was built from.
print(stateActionLookupArray)

[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]


In [None]:
# TODO: write the deployment code (run the agent in `env`, choosing each action from `stateActionLookupArray`).
