# In [None]:
import numpy as np
import random

# TODO: In "for a in actions[s]" add handling for the wall
# TODO: Plot delta over the iterations to see how it shrinks with every sweep

### Enumerate the grid states
# States are (row, column) tuples, both 1-indexed, listed row by row
# for a grid of 3 rows and 4 columns.
states = [(i + 1, j + 1) for i in range(3) for j in range(4)]

### Create reward dictionary
# Reward attached to each terminal state:
#   (1, 4) -> green terminal state, (2, 4) -> red terminal state.
# Every other state has reward 0.
TERMINAL_REWARDS = {(1, 4): 1, (2, 4): -1}

rewards = {state: TERMINAL_REWARDS.get(state, 0) for state in states}

### Define actions
# U = Up, D = Down, L = Left, R = Right
# Terminal states (1, 4) and (2, 4) have no entry, so they never get a policy.
actions = {
    (3, 1): ["U", "R"],             # Start state
    (1, 1): ["D", "R"],
    (1, 2): ["L", "R"],
    (1, 3): ["L", "R", "D"],
    (2, 1): ["U", "D"],             # "R" would run into the wall at (2, 2)
    (2, 2): ["U", "D", "L", "R"],   # (2, 2) is the wall itself
    (2, 3): ["U", "R", "D"],
    (3, 2): ["L", "R"],
    (3, 3): ["L", "U", "R"],
    (3, 4): ["U", "L"],
}

### Define initial policy, here: Random
# One uniformly drawn action per state that has an action list.
policy = {state: np.random.choice(moves) for state, moves in actions.items()}

### Define Transition Probabilities
# Per-action weight that the update loop multiplies onto an action's value.
p_action = {"U": 0.5, "D": 0.1, "L": 0.1, "R": 0.3}

### Value Iteration Presetting ###
# 0th Step: Set Hyperparams
GAMMA = 0.9     # discount applied to the value of the next state
THETA = 0.005   # stop once the largest value change in a sweep is below this
NOISE = 0.1     # noise level used by the update loop

# 1st Step: Initialize all V(s)
# Terminal states start at their own reward, all other states at 0.
# The red terminal state previously started at +1 although its reward is -1;
# it now starts at -1, mirroring the green terminal state.
V = {s: TERMINAL_REWARDS.get(s, 0) for s in states}

# For Plotting: one delta value is appended per sweep of the update loop
delta_records = []


def getStateFromRandomAction(a: str, s: tuple) -> tuple:
    """Return the state reached from ``s`` when action ``a`` is taken.

    States are ``(row, column)`` tuples. "U" and "D" decrease and increase
    the row, "L" and "R" decrease and increase the column. A move that would
    land on the wall cell ``(2, 2)`` leaves the state unchanged. Grid borders
    are not checked, so an action that leaves the grid yields an off-grid
    tuple.

    Args:
        a: Action label, one of "U", "D", "L", "R".
        s: Current state as a ``(row, column)`` tuple.

    Returns:
        The resulting ``(row, column)`` state.

    Raises:
        ValueError: If ``a`` is not one of the four known action labels.
    """
    wall = (2, 2)
    offsets = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1)}

    if a not in offsets:
        # Fail loudly instead of reaching the return with nothing assigned.
        raise ValueError("Unknown action: {!r}".format(a))

    d_row, d_col = offsets[a]
    next_state = (s[0] + d_row, s[1] + d_col)

    # Bumping into the wall means staying where we are.
    if next_state == wall:
        return (s[0], s[1])
    return next_state
    


# Value Iteration
#
# Sweeps over all states that have a policy entry, stores the best action
# value in V[s] and the matching action in policy[s], and stops once the
# largest value change within a sweep (delta) is below THETA.

# Row/column offset of every action; row 1 is the top row, so "U" lowers the row.
_MOVES = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1)}
_WALL = (2, 2)


def _step(state, action):
    """Return the state reached from `state` by `action`; the wall bounces back."""
    d_row, d_col = _MOVES[action]
    target = (state[0] + d_row, state[1] + d_col)
    return state if target == _WALL else target


def _backup(state, action):
    """Return reward plus discounted value of the state `action` leads to."""
    target = _step(state, action)
    return rewards[target] + GAMMA * V[target]


# Counts the sweeps that did NOT converge; the final, converging sweep is
# not included (the counter is only incremented after the THETA check).
numb_of_iteration = 0

while True:
    delta = 0
    for s in states:
        # Terminal states have no policy entry and keep their initial value.
        if s not in policy or not actions.get(s):
            continue

        v_init = V[s]
        # Start below every possible value so that states whose action values
        # are all negative still get their true maximum (starting at 0 would
        # clamp them to 0 and leave the policy untouched).
        v_post = float("-inf")

        for a in actions[s]:
            next_state = _step(s, a)
            v = p_action[a] * (rewards[next_state] + GAMMA * V[next_state])

            # NOTE(review): the original noise term was left unfinished and
            # only existed for "U", gated by a random draw. It is completed
            # here as an expectation: with weight NOISE one of the other
            # allowed actions is taken instead, each equally likely. Using
            # the expectation (not a sample) keeps every sweep deterministic
            # so delta is a reliable stopping criterion. Please confirm that
            # this matches the intended model.
            other_actions = [b for b in actions[s] if b != a]
            if other_actions:
                slip_value = sum(_backup(s, b) for b in other_actions) / len(other_actions)
                v = (1 - NOISE) * v + NOISE * slip_value

            # Keep the best action value seen so far and remember its action
            if v > v_post:
                v_post = v
                policy[s] = a

            # Print information about the variables
            print("s: {}, s': {}, a: {}, v: {}, v[s]: {}, V[s']: {}".format(s, next_state, a, v, V[s], V[next_state]))

        # Save highest state value v_post in V dictionary
        V[s] = v_post
        # Calculate delta, i.e. difference between the old value and the new value
        delta = max(delta, abs(v_init - V[s]))

    delta_records.append(delta)                                 # Optional (for plotting)
    if delta < THETA:
        break
    numb_of_iteration += 1


#########################

print("Number of Iteration: {}".format(numb_of_iteration))
print(policy)
print(V)

