<div align="center"><h3>HW4, Question 4</h3></div>
<div align="center"><h5>Mohammadreza Ghofrani, 400131076</h5></div>

In [1]:
from enum import Enum
from itertools import product

import gym
import numpy as np
from io import StringIO
from contextlib import closing
from amalearn.reward import RewardBase
from amalearn.agent import AgentBase
from amalearn.environment import EnvironmentBase




In [2]:
class Actions(Enum):
    """The nine moves available on the grid: stay put or step to one of the
    eight neighbouring cells.

    Member values are consecutive integers starting at 0, in declaration order.
    ``str(member)`` gives a one-character glyph used when printing a policy.
    """

    STAY = 0
    UP = 1
    DOWN = 2
    LEFT = 3
    RIGHT = 4
    UP_LEFT = 5
    UP_RIGHT = 6
    DOWN_LEFT = 7
    DOWN_RIGHT = 8

    def __str__(self):
        """Return the arrow (or '.' for STAY) that depicts this move."""
        # Glyphs are listed in member-value order, so the value is the index.
        glyphs = ('.', '↑', '↓', '←', '→', '↖', '↗', '↙', '↘')
        return glyphs[self.value]

# Environment's Implementation

In [52]:
class Environment(EnvironmentBase):
    """Stochastic grid world with the goal in the top-left corner.

    The grid has ``i_limit`` rows and ``j_limit`` columns and is stored in
    ``self.map`` as single characters: 'G' at ``(0, 0)`` (goal), 'S' at the
    bottom-right cell (start), 'X' for obstacles and '_' elsewhere.

    ``self.probablity[state][action]`` is the transition model: a list of
    ``(probability, next_state, reward, done)`` tuples.  For an ordinary cell
    the chosen move succeeds with probability ``p``; the remaining ``1 - p``
    is shared equally among the other moves that lead to a free cell.  A move
    into a wall or obstacle leaves the agent where it is.

    NOTE(review): ``EnvironmentBase.__init__`` is never called here - confirm
    the base class does not need it.
    """

    def __init__(
        self,
        # GYM library arguments
        id = 0,
        container=None,
        # Environment arguments
        obstacle = None,
        j_limit = 10,
        i_limit = 10,
        # Reward arguments
        p = 0.8,
        punish=-10,
        goalReward = 100,
        actionPrice = -1,
        # Other arguments
        action_count=9,
    ):
        """Build the grid and pre-compute the full transition model.

        Args:
            id, container, action_count: accepted for call compatibility but
                not used by this implementation.
            obstacle: iterable of ``(row, col)`` cells to block; ``None``
                (the default) means no obstacles.
            j_limit: number of columns.
            i_limit: number of rows.
            p: probability that the chosen move is the one executed.
            punish: reward component for obstacle cells.
            goalReward: reward for reaching (or sitting in) the goal.
            actionPrice: reward for an ordinary move.
        """
        if obstacle is None:
            obstacle = ()

        # General
        self.state = None
        self.nrow, self.ncol = i_limit, j_limit
        self.goal_reward = goalReward
        self.action_punishment = actionPrice
        self.obstacle_punishment = punish
        self.action_prob = p
        self.actions = gym.spaces.Discrete(len(Actions)) # Stay: 0,
                                                         # Up: 1, Down: 2, Left: 3, Right: 4,
                                                         # Up-Left: 5, Up-Right: 6, Down-Left: 7, Down-Right: 8
        # Every reward value this class can hand out (0 is the STAY reward).
        reward_values = (
            self.goal_reward,
            self.action_punishment,
            self.obstacle_punishment,
            self.obstacle_punishment + self.action_punishment,
            0,
        )
        self.reward_range = (min(reward_values), max(reward_values))
        self.observation_space = gym.spaces.Tuple((gym.spaces.Discrete(i_limit),
                                                   gym.spaces.Discrete(j_limit)))

        # Map
        self.map = np.chararray((i_limit, j_limit))
        self.map[:] = '_'
        self.map[-1,-1], self.map[0,0] = 'S', 'G'
        for obs_location in obstacle:
            # tuple() so a [row, col] list addresses one cell, not two rows.
            self.map[tuple(obs_location)] = 'X'

        # Probability matrix (attribute name kept as-is: other code reads it).
        self.probablity = self._build_transition_model()

    def _build_transition_model(self):
        """Return ``{state: {action: [(prob, next_state, reward, done), ...]}}``."""
        return {
            state: {action: self._transitions_for(state, action) for action in Actions}
            for state in np.ndindex(self.map.shape)
        }

    def _outcome(self, state, action):
        """Return ``(next_state, reward, done)`` if ``action`` is executed from ``state``."""
        newstate = self._seek_next_expected_state(state, action)
        done = self.is_state_final_state(newstate)
        reward = self.getReward(state, action, newstate)
        return newstate, reward, done

    def _transitions_for(self, state, action_done):
        """List the weighted outcomes of choosing ``action_done`` in ``state``."""
        if self.is_state_final_state(state):
            # The goal is absorbing and keeps paying the goal reward.
            return [(1, state, self.goal_reward, True)]
        if self.is_state_obstacle(state):
            return [(1, state, self.obstacle_punishment, False)]
        if action_done == Actions.STAY:
            # Staying never slips.
            return [(1, *self._outcome(state, action_done))]

        transitions = [(self.action_prob, *self._outcome(state, action_done))]
        # Slip targets: every other real move that lands on a free cell.
        slips = [
            self._outcome(state, action)
            for action in Actions
            if action != Actions.STAY and action != action_done
            and self.isStatePossible(self.__seek_next_state(state, action))
        ]
        if slips:
            slip_prob = (1 - self.action_prob) / len(slips)
            transitions.extend((slip_prob, *outcome) for outcome in slips)
        return transitions

    def isInEnvironment(self, state):
        """Return whether ``state`` is a ``(row, col)`` pair inside the grid."""
        try:
            row, col = state
        except (TypeError, ValueError):
            return False
        nrow, ncol = self.map.shape
        return 0 <= row < nrow and 0 <= col < ncol

    def isStateObstacle(self, state):
        """Return whether the cell at ``state`` is an obstacle ('X')."""
        return self.map[state] == b'X'

    def isStatePossible(self, state):
        """Return whether ``state`` is inside the grid and not an obstacle."""
        return self.isInEnvironment(state) and not self.isStateObstacle(state)

    def isAccessible(self, state, state_p):
        """Return whether some single move from ``state`` lands on ``state_p``."""
        for action in Actions:
            candidate = self.__seek_next_state(state, action)
            if candidate == state_p and self.isStatePossible(candidate):
                return True
        return False

    def getTransitionStatesAndProbs(self, state, action, state_p):
        """Return the probability of the first model entry leading to ``state_p``.

        Returns ``None`` when ``state_p`` is not an outcome of ``(state, action)``.
        """
        for prob, new_state, reward, done in self.probablity[state][action]:
            if new_state == state_p:
                return prob
        return None

    def getReward(self, state, action, state_p):
        """Return the reward for arriving in ``state_p`` via ``action``.

        ``state`` is accepted but not used.
        """
        # Check validity first so an out-of-grid state never indexes the map.
        if not self.isStatePossible(state_p):
            return self.obstacle_punishment + self.action_punishment
        if self.map[state_p] == b'G':
            return self.goal_reward
        if action == Actions.STAY:
            return 0
        return self.action_punishment

    def calculate_reward(self, action):
        """Return the reward for taking ``action`` from the current state."""
        nstate = self._seek_next_expected_state(self.state, action)
        return self.getReward(self.state, action, nstate)

    def is_state_final_state(self, state):
        """Return whether the cell at ``state`` is the goal ('G')."""
        return self.map[state] == b'G'

    def is_state_obstacle(self, state):
        """Snake-case alias of :meth:`isStateObstacle`."""
        return self.isStateObstacle(state)

    def terminated(self):
        """Return whether the current state is the goal."""
        return self.is_state_final_state(self.state)

    def observe(self):
        """Return the current ``(row, col)`` state."""
        return self.state

    def available_actions(self):
        """Return the ``Actions`` members that lead to a free cell from the current state."""
        return [
            action for action in Actions
            if self.isStatePossible(self.__seek_next_state(self.state, action))
        ]

    def _seek_next_expected_state(self, state, action):
        """Return where ``action`` ends up from ``state``; blocked moves stay put."""
        next_state = self.__seek_next_state(state, action)
        if self.isStatePossible(next_state):
            return next_state
        return state

    def __seek_next_state(self, state, action):
        """Return the raw target cell of ``action``, with no bounds or obstacle check."""
        row, col = state
        if action in (Actions.UP, Actions.UP_LEFT, Actions.UP_RIGHT):
            row = row - 1
        if action in (Actions.DOWN, Actions.DOWN_LEFT, Actions.DOWN_RIGHT):
            row = row + 1
        if action in (Actions.LEFT, Actions.UP_LEFT, Actions.DOWN_LEFT):
            col = col - 1
        if action in (Actions.RIGHT, Actions.UP_RIGHT, Actions.DOWN_RIGHT):
            col = col + 1
        return (row, col)

    def _go_next_state(self, state, action):
        """Set the current state to the outcome of ``action`` from ``state``."""
        self.state = self._seek_next_expected_state(state, action)

    def next_state(self, state, action):
        """Move the environment deterministically: apply ``action`` from ``state``."""
        self._go_next_state(state, action)

    def reset(self):
        """Put the agent on the bottom-right (start) cell and clear the last action."""
        last_row, last_col = self.map.shape
        self.state = (last_row -1 , last_col - 1)
        self.lastaction = Actions.STAY

    def render(self, mode='human'):
        """Return a text picture of the grid for any mode other than 'human'.

        'human' mode does nothing and returns ``None``.
        """
        if mode == 'human':
            return None
        return self._render_text(self.map)

    def _render_text(self, map):
        """Return the grid as text with the current cell highlighted in red.

        The ``map`` argument is ignored; ``self.map`` is always what gets drawn.
        """
        row, col = self.state
        grid = [[c.decode("utf-8") for c in line] for line in self.map]
        grid[row][col] = gym.utils.colorize(grid[row][col], "red", highlight=True)

        with closing(StringIO()) as outfile:
            outfile.write(f"  ({self.lastaction})\n")
            outfile.write("\n".join(" ".join(line) for line in grid) + "\n")
            return outfile.getvalue()

    def close(self):
        """Not implemented."""
        raise NotImplementedError

    def sample_all_rewards(self):
        """Not implemented."""
        raise NotImplementedError


# Register the grid world with gym under the id 'Baseenv' so later cells can
# build it with gym.make('Baseenv', ...).  The entry point names the
# Environment class as found in the running notebook's __main__ module.
# NOTE(review): re-running this cell registers the same id again, which looks
# like the source of the "Overriding environment" warning in the output.
gym.envs.register(
    id='Baseenv',
    entry_point='__main__:Environment'
)

  logger.warn(f"Overriding environment {id}")


# Agent's Implementation

In [42]:
class Agent(AgentBase):
    """Dynamic-programming agent that plans on the environment's transition model.

    It reads ``environment.probablity[state][action]`` (a list of
    ``(probability, next_state, reward, done)`` tuples) and ``environment.map``,
    and maintains:

    * ``self.V``      - array of state values, one per grid cell;
    * ``self.policy`` - array of ``Actions`` members, one per grid cell.

    NOTE(review): the ``done`` flag of a transition is ignored, so a state
    whose only outcome is itself keeps bootstrapping on its own value
    (reward / (1 - discount) at convergence).  Confirm this is intended.
    """

    def __init__(self, id, environment, discount, theta):
        """
        Args:
            id: forwarded to ``AgentBase.__init__``.
            environment: object exposing ``map`` and ``probablity``.
            discount: discount factor applied to successor values.
            theta: sweep-to-sweep value change below which iteration stops.
        """
        super(Agent, self).__init__(id, environment)
        self.theta = theta
        self.discount = discount
        self.environment = environment
        self.mapp = {}
        self.reset()

    def reset(self):
        """Zero all state values and set every cell's policy to STAY."""
        self.V = np.zeros(self.environment.map.shape)
        self.policy = np.full(self.environment.map.shape, Actions.STAY)

    def sum_over_action(self, state, action):
        """Return the expected one-step return of ``action`` in ``state`` under ``self.V``."""
        return sum(
            prob * (reward + self.discount * self.V[next_state])
            for prob, next_state, reward, _ in self.environment.probablity[state][action]
        )

    def _greedy_action(self, state):
        """Return the action with the largest expected return in ``state``.

        Ties go to the action declared first in ``Actions``.
        """
        candidates = list(Actions)
        returns = [self.sum_over_action(state, action) for action in candidates]
        return candidates[int(np.argmax(returns))]

    def policy_iteration(self):
        """Alternate evaluation and greedy improvement until the policy stops changing."""
        policy_stable = False
        while not policy_stable:
            self.policy_evaluation()
            policy_stable = self.policy_improvement()

    def policy_evaluation(self):
        """Sweep in place over all states until the largest change drops below ``theta``."""
        map_shape = self.environment.map.shape
        while True:
            delta = 0
            for s in np.ndindex(map_shape):
                previous = self.V[s]
                self.V[s] = self.sum_over_action(s, self.policy[s])
                delta = max(delta, abs(previous - self.V[s]))
            if delta < self.theta:
                break

    def policy_improvement(self):
        """Make the policy greedy w.r.t. ``self.V``; return ``True`` if nothing changed."""
        policy_stable = True
        for s in np.ndindex(self.environment.map.shape):
            old_action = self.policy[s]
            self.policy[s] = self._greedy_action(s)
            if old_action != self.policy[s]:
                policy_stable = False
        return policy_stable

    def value_iteration(self):
        """Run value iteration, then extract the greedy policy.

        Returns:
            ``(self.V, self.policy)``.
        """
        map_shape = self.environment.map.shape
        while True:
            delta = 0
            for s in np.ndindex(map_shape):
                previous = self.V[s]
                self.V[s] = np.max([self.sum_over_action(s, a) for a in Actions])
                delta = max(delta, abs(previous - self.V[s]))
            if delta < self.theta:
                break
        for s in np.ndindex(map_shape):
            self.policy[s] = self._greedy_action(s)
        return self.V, self.policy

    def take_action(self):
        """Step the environment with the policy's action for its current state.

        Returns whatever ``environment.step`` returns.
        NOTE(review): assumes ``environment.observe()`` returns the current
        ``(row, col)`` cell and ``step`` accepts an ``Actions`` member - confirm
        against the environment base class.
        """
        state = self.environment.observe()
        return self.environment.step(self.policy[state])

    def render_policy(self):
        """Return the policy as text: one line per row, glyphs separated by spaces."""
        with closing(StringIO()) as outfile:
            for line in self.policy:
                outfile.write(' '.join([str(action) for action in line]) + '\n')
            return outfile.getvalue()

    def render_value(self):
        """Return the value table as tab-separated text.

        Values are rounded to two decimals; obstacle cells are shown as '####'.
        """
        with closing(StringIO()) as outfile:
            for i, line in enumerate(self.V):
                cells = [
                    '####' if self.environment.is_state_obstacle((i, j))
                    else str(np.around(value, 2))
                    for j, value in enumerate(line)
                ]
                outfile.write('\t'.join(cells) + '\n')
            return outfile.getvalue()

# Part A

In [53]:
# Part A: 15x15 grid with three rectangular walls, a small per-move cost and a
# large goal reward.
base_env = gym.make(
    'Baseenv',
    i_limit=15,
    j_limit=15,
    obstacle=[
        *product(range(0, 3), (6, 7)),      # rows 0-2,   cols 6-7
        *product((7, 8), (12, 13, 14)),     # rows 7-8,   cols 12-14
        *product(range(11, 15), (5, 6)),    # rows 11-14, cols 5-6
    ],
    p=0.8,
    punish=-1,
    goalReward=1000,
    actionPrice=-0.01,
)
agent = Agent(id=0, environment=base_env, discount=0.9, theta=0.1)

In [54]:
# Solve Part A from a clean slate with policy iteration, then print the
# policy arrows followed by the state-value table.
agent.reset()
agent.policy_iteration()
policy_output, value_output = agent.render_policy(), agent.render_value()
print(policy_output, value_output, sep='\n')

. ← ← ← ↙ ↙ . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ← ← . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ . . ↙ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ↙ ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ← ← ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ←
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ← ↖
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙
↗ ↗ ↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ←
↗ ↗ ↗ ↗ ↗ . . ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↗ ↑ . . ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖

10000.0	9618.23	8393.07	7329.0	6421.32	5725.37	####	####	2874.15	2857.0	2802.04	2510.6	2221.87	1954.06	1745.09
9618.22	9569.59	8379.82	7342.77	6433.79	5723.7	####	####	3232.56	3217.57	2874.27	2544.69	2234.32	1961.19	1746.59
8392.99	8379.76	8257.61	7300.06	6425.22	5680.94	####	####	3724.19	3299.31	2911.65	2553.49	2237.72	1962.8	1748.45
7328.08	7342.07	7299.6	7137.09	6360.65	5636.2	4982.54	4329.01	3789.2	3331.0	2918.06	2555.82	2238.77	1964.44	1753.2
6410.82	6425.88	6419.77	6358.01	6177.52	5542.41	4924.39	4353.36	3808.7	3334.06	2919.19	2556.4	224

# Part B

In [61]:
# Part B: same grid as Part A, but moves are free (actionPrice=0) and the
# obstacle penalty is only -0.01.
nocost_env = gym.make(
    'Baseenv',
    i_limit=15,
    j_limit=15,
    obstacle=[
        *product(range(0, 3), (6, 7)),      # rows 0-2,   cols 6-7
        *product((7, 8), (12, 13, 14)),     # rows 7-8,   cols 12-14
        *product(range(11, 15), (5, 6)),    # rows 11-14, cols 5-6
    ],
    p=0.8,
    punish=-0.01,
    goalReward=1000,
    actionPrice=0,
)
agent2 = Agent(id=0, environment=nocost_env, discount=0.9, theta=0.1)

In [62]:
# Solve Part B with policy iteration and print the policy, then the values.
agent2.reset()
agent2.policy_iteration()
policy_output, value_output = agent2.render_policy(), agent2.render_value()
print(policy_output, value_output, sep='\n')

. ← ← ← ↙ ↙ . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ← ← . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ . . ↙ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ↙ ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ← ← ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ←
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ← ↖
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙
↗ ↗ ↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ←
↗ ↗ ↗ ↗ ↗ . . ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↗ ↑ . . ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖

10000.0	9618.23	8393.08	7329.02	6421.36	5725.41	####	####	2874.22	2857.07	2802.11	2510.67	2221.95	1954.14	1745.18
9618.23	9569.59	8379.84	7342.8	6433.82	5723.74	####	####	3232.63	3217.64	2874.34	2544.76	2234.4	1961.28	1746.67
8393.0	8379.78	8257.63	7300.09	6425.26	5680.98	####	####	3724.25	3299.37	2911.72	2553.57	2237.79	1962.88	1748.53
7328.1	7342.09	7299.63	7137.12	6360.69	5636.25	4982.59	4329.07	3789.26	3331.07	2918.13	2555.9	2238.85	1964.52	1753.28
6410.85	6425.92	6419.81	6358.05	6177.56	5542.46	4924.44	4353.41	3808.77	3334.12	2919.26	2556

# Part C

In [63]:
# Part C: same grid, but with a smaller goal reward (100) and larger
# penalties (-1 per move, -10 for obstacles).
highcost_env = gym.make(
    'Baseenv',
    i_limit=15,
    j_limit=15,
    obstacle=[
        *product(range(0, 3), (6, 7)),      # rows 0-2,   cols 6-7
        *product((7, 8), (12, 13, 14)),     # rows 7-8,   cols 12-14
        *product(range(11, 15), (5, 6)),    # rows 11-14, cols 5-6
    ],
    p=0.8,
    punish=-10,
    goalReward=100,
    actionPrice=-1,
)
agent3 = Agent(id=0, environment=highcost_env, discount=0.9, theta=0.1)

In [75]:
# Solve Part C with policy iteration and print the policy, then the values.
agent3.reset()
agent3.policy_iteration()
policy_output, value_output = agent3.render_policy(), agent3.render_value()
print(policy_output, value_output, sep='\n')

. ← ← ← ↙ ↙ . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ← ← . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ . . ↙ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ↙ ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ← ← ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ←
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ← ↖
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙
↗ ↗ ↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ←
↗ ↗ ↗ ↗ ↗ . . ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↗ ↑ . . ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖

1000.0	961.44	837.7	730.23	638.56	568.27	####	####	280.3	278.56	273.01	243.58	214.41	187.36	166.24
961.44	956.53	836.36	731.62	639.82	568.1	####	####	316.5	314.98	280.31	247.02	215.67	188.08	166.4
837.69	836.36	824.02	727.31	638.95	563.78	####	####	366.15	323.24	284.08	247.91	216.02	188.25	166.59
730.14	731.55	727.26	710.85	632.43	559.26	493.24	427.24	372.72	326.44	284.73	248.15	216.12	188.41	167.08
637.5	639.02	638.4	632.16	613.93	549.79	487.37	429.69	374.69	326.75	284.85	248.2	216.26	189.0	167.92
556.7	558.13	558.02	556.7	549.31	530.66	477.7

# Part D

In [76]:
# Part D (base environment): re-solve with a fresh agent for each discount
# factor and print the resulting policy and values.
for discount in (0.1, 0.5, 0.75, 0.9):
    agent41 = Agent(id=0, environment=base_env, discount=discount, theta=0.1)
    agent41.policy_iteration()
    policy_output, value_output = agent41.render_policy(), agent41.render_value()
    print("discount: ", discount)
    print(policy_output, value_output, sep='\n')

discount:  0.1
. ← ← ← ↙ ↙ . . . . . . . . .
↑ ↖ ↖ ↖ ← ← . . . . . . . . .
↑ ↖ ↖ ↖ ↖ ↖ . . . . . . . . .
↑ ↖ ↖ ↖ ↖ ↖ . . . . . . . . .
↗ ↑ ↖ ↖ ↖ ↖ . . . . . . . . .
↗ ↑ ↖ ↖ ↖ ↖ . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .
. . . . . . . . . . . . . . .

1111.11	898.62	76.8	6.55	0.55	0.04	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
898.62	895.1	75.13	6.6	0.56	0.04	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
76.8	75.13	72.12	6.26	0.56	0.04	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
6.55	6.6	6.26	5.8	0.51	0.04	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.55	0.56	0.56	0.51	0.46	0.03	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.04	0.04	0.04	0.04	0.03	0.03	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	####	####	####
0

In [70]:
# Part D (no-cost environment): same discount sweep as for the base
# environment, one fresh agent per discount factor.
for discount in (0.1, 0.5, 0.75, 0.9):
    agent42 = Agent(id=0, environment=nocost_env, discount=discount, theta=0.1)
    agent42.policy_iteration()
    policy_output, value_output = agent42.render_policy(), agent42.render_value()
    print("discount: ", discount)
    print(policy_output, value_output, sep='\n')

discount:  0.1
. ← ← ← ↙ ↙ . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ← ← . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ . . ↙ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ↙ ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ← ← ← ← ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ↙ ←
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ← ↖
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ←
↗ ↗ ↗ ↗ ↗ . . ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↗ ↑ . . ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖

1111.11	898.62	76.81	6.56	0.56	0.05	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
898.62	895.1	75.14	6.61	0.57	0.05	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
76.81	75.14	72.13	6.28	0.57	0.05	####	####	0.0	0.0	0.0	0.0	0.0	0.0	0.0
6.56	6.61	6.28	5.81	0.52	0.05	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.56	0.57	0.57	0.52	0.47	0.04	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.05	0.05	0.05	0.05	0.04	0.04	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	####	####	#

# Part E

In [78]:
# Part E: solve each of the three environments with value iteration and print
# the policy and values under the environment's name.
for env_name, env in (('base', base_env), ('nocost', nocost_env), ('highcost', highcost_env)):
    print(env_name)
    agent5 = Agent(id=0, environment=env, discount=0.9, theta=0.1)
    agent5.value_iteration()
    policy_output, value_output = agent5.render_policy(), agent5.render_value()
    print(policy_output, value_output, sep='\n')

base
. ← ← ← ↙ ↙ . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ← ← . . ↓ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ . . ↙ ↙ ↙ ↙ ↙ ↙ ↙
↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ↙ ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ← ← ← ↙ ↙ ↙ ↙
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙ ↙ ←
↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↙ ← ↖
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ . . .
↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ← ↙
↗ ↗ ↗ ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖ ←
↗ ↗ ↗ ↗ ↗ . . ↖ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↗ ↑ . . ↑ ↖ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖
↗ ↗ ↗ ↑ ↖ . . ↗ ↑ ↖ ↖ ↖ ↖ ↖ ↖

9999.15	9617.48	8392.4	7328.39	6420.74	5724.84	####	####	2873.78	2856.63	2801.68	2510.26	2221.56	1953.79	1744.85
9617.48	9568.85	8379.16	7342.18	6433.26	5723.21	####	####	3232.2	3217.21	2873.92	2544.37	2234.03	1960.94	1746.35
8392.33	8379.1	8256.96	7299.48	6424.69	5680.46	####	####	3723.81	3298.95	2911.33	2553.2	2237.45	1962.55	1748.22
7327.49	7341.48	7299.02	7136.52	6360.14	5635.73	4982.11	4328.63	3788.83	3330.67	2917.76	2555.56	2238.51	1964.2	1752.97
6410.3	6425.36	6419.25	6357.5	6177.02	5541.95	4923.98	4352.98	3808.37	3333.76	2918.92	2