<a href="https://colab.research.google.com/github/matanaaa14/AI_task2_matan_and_gal/blob/main/Boltzmann_R-Max_QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random
import time
from tabulate import tabulate

class CellType:
    """Integer tags describing what kind of square a grid cell is."""

    # Unpacked from a range so the three tags are the consecutive ints 0, 1, 2.
    EMPTY, WALL, REWARD = range(3)

class Cell:
    """A single grid square holding a reward, a ``CellType`` tag and a step cost."""

    def __init__(self, reward=0, cell_type=CellType.EMPTY, step_cost=-1):
        """Store the three properties exactly as given."""
        self.reward, self.cell_type, self.step_cost = reward, cell_type, step_cost

    def get_reward(self):
        """Return the reward stored for this cell."""
        stored_reward = self.reward
        return stored_reward

    def get_cell_type(self):
        """Return the ``CellType`` tag stored for this cell."""
        kind = self.cell_type
        return kind

    def get_step_cost(self):
        """Return the step cost stored for this cell."""
        cost = self.step_cost
        return cost

class Grid:
    """Rectangular board of ``Cell`` objects addressed as (row, col)."""

    def __init__(self, height, width):
        """Create a ``height`` x ``width`` board filled with default cells."""
        self.height, self.width = height, width
        board = []
        for _ in range(height):
            # A fresh Cell per position so no two squares share an object.
            board.append([Cell() for _ in range(width)])
        self.grid = board

    def set_cell(self, row, col, reward=0, cell_type=CellType.EMPTY, step_cost=-1):
        """Replace the cell at (row, col) with a newly built ``Cell``."""
        replacement = Cell(reward, cell_type, step_cost)
        self.grid[row][col] = replacement

    def get_cell(self, row, col):
        """Return the ``Cell`` stored at (row, col)."""
        board_row = self.grid[row]
        return board_row[col]

    def get_height(self):
        """Return the number of rows."""
        rows = self.height
        return rows

    def get_width(self):
        """Return the number of columns."""
        columns = self.width
        return columns

class TransitionModel:
    """Empirical transition counts indexed by state, then action, then successor."""

    def __init__(self, grid):
        """Keep a reference to ``grid`` and start with no recorded transitions."""
        self.transitions = {}
        self.grid = grid

    def record_transition(self, state, action, next_state):
        """Add one observation of ``state --action--> next_state``.

        Only the first two components of ``state`` and ``next_state`` are used
        as the lookup keys.
        """
        origin = (state[0], state[1])
        landing = (next_state[0], next_state[1])

        per_action = self.transitions.setdefault(origin, {})
        outcome_counts = per_action.setdefault(action, {})
        outcome_counts[landing] = outcome_counts.get(landing, 0) + 1

    def get_transition_probability(self, state, action, next_state):
        """Return the observed relative frequency of landing in ``next_state``.

        Returns 0 when the state, the action, or the successor has never been
        recorded.
        """
        origin = (state[0], state[1])
        landing = (next_state[0], next_state[1])

        per_action = self.transitions.get(origin)
        if per_action is None:
            return 0
        outcome_counts = per_action.get(action)
        if outcome_counts is None or landing not in outcome_counts:
            return 0

        observed_total = sum(outcome_counts.values())
        return outcome_counts[landing] / observed_total

class PolicyIterationRMax:
    """Plan over a ``Grid`` with repeated Bellman sweeps and log the greedy moves.

    Utilities are undiscounted: the value of a non-terminal cell is its step
    cost plus the best expected utility over the four actions.  A move goes in
    the chosen direction with probability 0.8 and slips to each of the two
    perpendicular directions with probability 0.1; a move that would leave the
    grid or enter a wall leaves the agent where it is.

    After every sweep that has not converged, the deterministic successor of
    each cell's greedy action is recorded in ``transition_model``.

    NOTE(review): no R-max style optimistic reward is applied anywhere in this
    class, and the recorded transition counts are not read back here.
    """

    def __init__(self, grid, transition_model, epsilon=0.01, max_iterations=1000):
        """Set up zero utilities and an empty policy for ``grid``.

        Args:
            grid: board exposing ``get_height``, ``get_width`` and ``get_cell``.
            transition_model: object exposing ``record_transition``.
            epsilon: sweeps stop once the largest utility change is below this.
            max_iterations: upper bound on the number of sweeps.
        """
        self.grid = grid
        self.transition_model = transition_model
        self.utilities = np.zeros((grid.get_height(), grid.get_width()))
        self.policy = np.full((grid.get_height(), grid.get_width()), None)
        self.epsilon = epsilon
        self.max_iterations = max_iterations
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Right, Left, Down, Up

    @staticmethod
    def _outcomes(action):
        """Return ``(probability, move)`` pairs for one chosen ``action``.

        Swapping the two components of a unit move yields a perpendicular
        move; negating that yields the other perpendicular one.
        """
        d_row, d_col = action
        return [(0.8, action), (0.1, (d_col, d_row)), (0.1, (-d_col, -d_row))]

    def _expected_utility(self, row, col, action):
        """Expected current utility of the cell reached by ``action`` from (row, col)."""
        expected = 0
        for prob, (d_row, d_col) in self._outcomes(action):
            target = (row + d_row, col + d_col)
            if not self.is_valid_location(target):
                # Bumping into a wall or the border keeps the agent in place.
                target = (row, col)
            expected += prob * self.utilities[target]
        return expected

    def run(self):
        """Sweep the grid until convergence or ``max_iterations``.

        Returns:
            The utilities array; ``self.policy`` holds the greedy action index
            for every non-terminal, non-wall cell and ``None`` elsewhere.
        """
        width = self.grid.get_width()
        height = self.grid.get_height()
        new_utilities = np.zeros((height, width))

        for _ in range(self.max_iterations):
            delta = 0
            for i in range(height):
                for j in range(width):
                    cell = self.grid.get_cell(i, j)
                    reward = cell.get_reward()
                    if cell.get_cell_type() == CellType.WALL:
                        new_utilities[i, j] = 0
                        self.policy[i, j] = None
                    elif reward != 0:
                        # Any cell with a non-zero reward is treated as terminal.
                        new_utilities[i, j] = reward
                        self.policy[i, j] = None
                    else:
                        max_utility = float('-inf')
                        best_action = None
                        for action_idx, action in enumerate(self.actions):
                            utility = self._expected_utility(i, j, action)
                            if utility > max_utility:
                                max_utility = utility
                                best_action = action_idx
                        new_utilities[i, j] = cell.get_step_cost() + max_utility
                        self.policy[i, j] = best_action

                        delta = max(delta, abs(new_utilities[i, j] - self.utilities[i, j]))

            self.utilities = np.copy(new_utilities)

            if delta < self.epsilon:
                break

            # Update transition model with observed transitions
            self.update_transition_model()

        return self.utilities

    def update_transition_model(self):
        """Record the deterministic successor of every cell's greedy action."""
        for i in range(self.grid.get_height()):
            for j in range(self.grid.get_width()):
                if self.policy[i, j] is not None:
                    action = self.policy[i, j]
                    next_state = self.get_next_state([i, j], action)
                    self.transition_model.record_transition([i, j], action, next_state)

    def get_utilities(self):
        """Return the current utilities array."""
        return self.utilities

    def get_policy(self):
        """Return the current policy array of action indices (or ``None``)."""
        return self.policy

    def evaluate_policy(self, episodes=1000):
        """Average the return of following the policy from cell (0, 0).

        Moves are applied deterministically (no slipping) and each episode is
        cut off after 1000 steps.  An episode also ends when the current cell
        has no action assigned (for example when the start cell is a wall),
        instead of failing on the ``None`` action.
        """
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            steps = 0
            while self.grid.get_cell(state[0], state[1]).get_reward() == 0 and steps < 1000:
                action = self.policy[state[0], state[1]]
                if action is None:
                    break
                next_state = self.get_next_state(state, action)
                reward = self.grid.get_cell(next_state[0], next_state[1]).get_reward() + self.grid.get_cell(state[0], state[1]).get_step_cost()
                episode_return += reward
                state = next_state
                steps += 1
            total_return += episode_return
        return total_return / episodes

    def get_next_state(self, state, action_idx):
        """Return the cell reached by ``action_idx``, or ``state`` if blocked."""
        move = self.actions[action_idx]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state
        return next_state

    def is_valid_location(self, location):
        """Return True when ``location`` is inside the grid and not a wall."""
        return (0 <= location[0] < self.grid.get_height() and
                0 <= location[1] < self.grid.get_width() and
                self.grid.get_cell(location[0], location[1]).get_cell_type() != CellType.WALL)

    def print_policy(self):
        """Print the policy as arrows followed by a table of utilities.

        Walls print as ``W``, positive-reward cells as ``P`` and
        negative-reward cells as ``N``.
        """
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [['' for _ in range(self.grid.get_width())] for _ in range(self.grid.get_height())]

        for row in range(self.grid.get_height()):
            for col in range(self.grid.get_width()):
                cell = self.grid.get_cell(row, col)
                if cell.get_cell_type() == CellType.WALL:
                    policy[row][col] = 'W'  # Wall
                elif cell.get_reward() > 0:
                    policy[row][col] = 'P'  # Positive reward
                elif cell.get_reward() < 0:
                    policy[row][col] = 'N'  # Negative reward
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action] if best_action is not None else ' '

        # Print Policy
        print("Policy:")
        for row in policy:
            print(" ".join(row))

        # Print Utilities using tabulate for prettier printing
        utilities_table = [["{:.5f}".format(value) for value in row] for row in self.utilities]
        print("\nUtilities:")
        print(tabulate(utilities_table, tablefmt="fancy_grid", numalign="center", stralign="center"))


# Example usage with the grid setup
def main_policy_iteration_rmax(w, h, L, p, r):
    """Build a grid from one test case, run the planner and print the results.

    Args:
        w: grid width (number of columns).
        h: grid height (number of rows).
        L: iterable of ``(x, y, value)`` triples with ``y`` counted from the
            bottom row; ``value == 0`` marks a wall, anything else a reward cell.
        p: success probability of a move.  NOTE(review): currently unused here;
            the planner is constructed without it.
        r: step cost assigned to every empty cell.
    """
    grid = Grid(h, w)

    for x, y, value in L:
        # Flip y so that (0, 0) in the test case is the bottom-left corner.
        grid.set_cell(h - y - 1, x, reward=value, cell_type=CellType.REWARD if value != 0 else CellType.WALL)

    for i in range(h):
        for j in range(w):
            if grid.get_cell(i, j).get_cell_type() == CellType.EMPTY:
                grid.set_cell(i, j, step_cost=r)

    transition_model = TransitionModel(grid)
    pir = PolicyIterationRMax(grid, transition_model)

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    pir.run()
    end_time = time.perf_counter()

    policy_score = pir.evaluate_policy(episodes=1000)
    print("Policy Score:", policy_score)
    print(f"Policy Iteration R-max Time: {end_time - start_time:.4f} seconds")

    # print_policy() emits its own "Policy:" header; only a blank separator
    # line is printed here so the header is not shown twice.
    print()
    pir.print_policy()
    print("=======================================")

# Test each case
# Each entry is (w, h, L, p, r): grid width, grid height, a list of
# (x, y, value) special cells, the move success probability, and the step cost.
# The driver function below treats value == 0 as a wall and any other value as
# a reward cell.
test_cases = [
    # Test case 1
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, -0.04),
    # Test case 2
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, 0.04),
    # Test case 3
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, -1),
    # Test case 4
    (12, 4, [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,1)], 1, -1),
    # Test case 5
    (12, 6, [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,1)], 0.9, -1),
    # Test case 6
    (5, 5, [(4,0,-10),(0,4,-10),(1,1,1),(3,3,2)], 0.9, -0.5),
    # Test case 7
    (5, 5, [(2,2,-2),(4,4,-1),(1,1,1),(3,3,2)], 0.9, -0.25),
    # Test case 8
    (7, 7, [(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.5),
    # Test case 9
    # NOTE(review): identical to test case 8 - confirm whether a different
    # configuration was intended.
    (7, 7, [(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.5),
    # Test case 10
    (7, 7, [(3,1,0),(3,5,0),(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.25)
]

# Run every configuration in order, numbering them from 1 in the output.
for i, (w, h, L, p, r) in enumerate(test_cases, start=1):
    print(f"\nRunning test case {i}")
    main_policy_iteration_rmax(w, h, L, p, r)



Running test case 1
Policy Score: 0.8799999999999959
Policy Iteration R-max Time: 0.0059 seconds

Policy:
Policy:
→ → → P
↑ W ↑ N
→ → ↑ ←

Utilities:
╒═════════╤═════════╤═════════╤═════════╕
│ 0.85568 │ 0.90041 │ 0.95002 │    1    │
├─────────┼─────────┼─────────┼─────────┤
│ 0.80401 │    0    │ 0.90041 │   -1    │
├─────────┼─────────┼─────────┼─────────┤
│ 0.75641 │ 0.80401 │ 0.85568 │ 0.81011 │
╘═════════╧═════════╧═════════╧═════════╛

Running test case 2
Policy Score: 39.999999999999844
Policy Iteration R-max Time: 0.6046 seconds

Policy:
Policy:
→ → ↓ P
→ W ↓ N
→ → → →

Utilities:
╒═════════╤═════════╤═════════╤═════════╕
│ 40.8556 │ 40.8556 │ 40.8556 │    1    │
├─────────┼─────────┼─────────┼─────────┤
│ 40.8556 │    0    │ 40.8556 │   -1    │
├─────────┼─────────┼─────────┼─────────┤
│ 40.8556 │ 40.8556 │ 40.8556 │ 40.8556 │
╘═════════╧═════════╧═════════╧═════════╛

Running test case 3
Policy Score: -2.0
Policy Iteration R-max Time: 0.0023 seconds

Policy:
Policy:
→ → → P
↑

In [None]:
#-----------------
#Model-based RL (Boltzmann)
#-----------------

import numpy as np
import random
import time
from tabulate import tabulate

class CellType:
    """Integer tags describing what kind of square a grid cell is."""

    # Assigned in one statement; the values are 0, 1 and 2 respectively.
    EMPTY, WALL, REWARD = 0, 1, 2

class Cell:
    """A single grid square holding a reward, a ``CellType`` tag and a step cost."""

    def __init__(self, reward=0, cell_type=CellType.EMPTY, step_cost=-1):
        """Store the three properties exactly as given."""
        self.reward, self.cell_type, self.step_cost = reward, cell_type, step_cost

    def get_reward(self):
        """Return the reward stored for this cell."""
        stored_reward = self.reward
        return stored_reward

    def get_cell_type(self):
        """Return the ``CellType`` tag stored for this cell."""
        kind = self.cell_type
        return kind

    def get_step_cost(self):
        """Return the step cost stored for this cell."""
        cost = self.step_cost
        return cost

class Grid:
    """Rectangular board of ``Cell`` objects addressed as (row, col)."""

    def __init__(self, height, width):
        """Create a ``height`` x ``width`` board filled with default cells."""
        self.height, self.width = height, width
        board = []
        for _ in range(height):
            # A fresh Cell per position so no two squares share an object.
            board.append([Cell() for _ in range(width)])
        self.grid = board

    def set_cell(self, row, col, reward=0, cell_type=CellType.EMPTY, step_cost=-1):
        """Replace the cell at (row, col) with a newly built ``Cell``."""
        replacement = Cell(reward, cell_type, step_cost)
        self.grid[row][col] = replacement

    def get_cell(self, row, col):
        """Return the ``Cell`` stored at (row, col)."""
        board_row = self.grid[row]
        return board_row[col]

    def get_height(self):
        """Return the number of rows."""
        rows = self.height
        return rows

    def get_width(self):
        """Return the number of columns."""
        columns = self.width
        return columns

class PolicyIterationBoltzmann:
    """Plan over a ``Grid`` with Bellman sweeps scaled by a cooling temperature.

    In each sweep the value of a non-terminal cell is its step cost plus
    ``temperature`` times the best expected utility over the four actions, and
    the temperature is multiplied by ``cooling_rate`` after every sweep.  A
    move goes in the chosen direction with probability 0.8 and slips to each
    of the two perpendicular directions with probability 0.1; a move that would
    leave the grid or enter a wall leaves the agent where it is.

    NOTE(review): the temperature is used as a multiplier on the best expected
    utility; no softmax/Boltzmann action sampling happens in this class.
    """

    def __init__(self, grid, temperature=1.0, cooling_rate=0.995):
        """Set up zero utilities and an empty policy for ``grid``.

        Args:
            grid: board exposing ``get_height``, ``get_width`` and ``get_cell``.
            temperature: initial multiplier applied to the best expected utility.
            cooling_rate: factor applied to the temperature after every sweep.
        """
        self.grid = grid
        self.utilities = np.zeros((grid.get_height(), grid.get_width()))
        self.policy = np.full((grid.get_height(), grid.get_width()), None)
        self.temperature = temperature
        self.cooling_rate = cooling_rate
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Right, Left, Down, Up

    @staticmethod
    def _outcomes(action):
        """Return ``(probability, move)`` pairs for one chosen ``action``.

        Swapping the two components of a unit move yields a perpendicular
        move; negating that yields the other perpendicular one.
        """
        d_row, d_col = action
        return [(0.8, action), (0.1, (d_col, d_row)), (0.1, (-d_col, -d_row))]

    def _expected_utility(self, row, col, action):
        """Expected current utility of the cell reached by ``action`` from (row, col)."""
        expected = 0
        for prob, (d_row, d_col) in self._outcomes(action):
            target = (row + d_row, col + d_col)
            if not self.is_valid_location(target):
                # Bumping into a wall or the border keeps the agent in place.
                target = (row, col)
            expected += prob * self.utilities[target]
        return expected

    def run(self):
        """Sweep the grid until no cell changes by more than 0.01.

        The stored temperature keeps its cooled value after the call.

        Returns:
            The utilities array; ``self.policy`` holds the greedy action index
            for every non-terminal, non-wall cell and ``None`` elsewhere.
        """
        width = self.grid.get_width()
        height = self.grid.get_height()
        new_utilities = np.zeros((height, width))

        converged = False
        while not converged:
            converged = True
            for i in range(height):
                for j in range(width):
                    cell = self.grid.get_cell(i, j)
                    reward = cell.get_reward()
                    if cell.get_cell_type() == CellType.WALL:
                        new_utilities[i, j] = 0
                        self.policy[i, j] = None
                    elif reward != 0:
                        # Any cell with a non-zero reward is treated as terminal.
                        new_utilities[i, j] = reward
                        self.policy[i, j] = None
                    else:
                        max_utility = float('-inf')
                        best_action = None
                        for action_idx, action in enumerate(self.actions):
                            utility = self._expected_utility(i, j, action)
                            if utility > max_utility:
                                max_utility = utility
                                best_action = action_idx
                        new_utilities[i, j] = cell.get_step_cost() + self.temperature * max_utility
                        self.policy[i, j] = best_action

                    if abs(new_utilities[i, j] - self.utilities[i, j]) > 0.01:
                        converged = False

            # Swap buffers: every entry of the old array is overwritten in the
            # next sweep, so no copy is needed.
            self.utilities, new_utilities = new_utilities, self.utilities

            # Decrease the temperature
            self.temperature *= self.cooling_rate

        return self.utilities

    def get_utilities(self):
        """Return the current utilities array."""
        return self.utilities

    def get_policy(self):
        """Return the current policy array of action indices (or ``None``)."""
        return self.policy

    def evaluate_policy(self, episodes=1000):
        """Average the return of following the policy from cell (0, 0).

        Moves are applied deterministically (no slipping) and each episode is
        cut off after 1000 steps.  An episode also ends when the current cell
        has no action assigned (for example when the start cell is a wall),
        instead of failing on the ``None`` action.
        """
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            steps = 0
            while self.grid.get_cell(state[0], state[1]).get_reward() == 0 and steps < 1000:
                action = self.policy[state[0], state[1]]
                if action is None:
                    break
                next_state = self.get_next_state(state, action)
                reward = self.grid.get_cell(next_state[0], next_state[1]).get_reward() + self.grid.get_cell(state[0], state[1]).get_step_cost()
                episode_return += reward
                state = next_state
                steps += 1
            total_return += episode_return
        return total_return / episodes

    def get_next_state(self, state, action_idx):
        """Return the cell reached by ``action_idx``, or ``state`` if blocked."""
        move = self.actions[action_idx]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state
        return next_state

    def is_valid_location(self, location):
        """Return True when ``location`` is inside the grid and not a wall."""
        return (0 <= location[0] < self.grid.get_height() and
                0 <= location[1] < self.grid.get_width() and
                self.grid.get_cell(location[0], location[1]).get_cell_type() != CellType.WALL)

    def print_policy(self):
        """Print the policy as arrows followed by a table of utilities.

        Walls print as ``W``, positive-reward cells as ``P`` and
        negative-reward cells as ``N``.
        """
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [['' for _ in range(self.grid.get_width())] for _ in range(self.grid.get_height())]

        for row in range(self.grid.get_height()):
            for col in range(self.grid.get_width()):
                cell = self.grid.get_cell(row, col)
                if cell.get_cell_type() == CellType.WALL:
                    policy[row][col] = 'W'  # Wall
                elif cell.get_reward() > 0:
                    policy[row][col] = 'P'  # Positive reward
                elif cell.get_reward() < 0:
                    policy[row][col] = 'N'  # Negative reward
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action] if best_action is not None else ' '

        # Print Policy
        print("Policy:")
        for row in policy:
            print(" ".join(row))

        # Print Utilities using tabulate for prettier printing
        utilities_table = [["{:.5f}".format(value) for value in row] for row in self.utilities]
        print("\nUtilities:")
        print(tabulate(utilities_table, tablefmt="fancy_grid", numalign="center", stralign="center"))


# Example usage with the grid setup
def main_policy_iteration_boltzmann(w, h, L, p, r):
    """Build a grid from one test case, run the planner and print the results.

    Args:
        w: grid width (number of columns).
        h: grid height (number of rows).
        L: iterable of ``(x, y, value)`` triples with ``y`` counted from the
            bottom row; ``value == 0`` marks a wall, anything else a reward cell.
        p: success probability of a move.  NOTE(review): currently unused here;
            the planner is constructed without it.
        r: step cost assigned to every empty cell.
    """
    grid = Grid(h, w)

    for x, y, value in L:
        # Flip y so that (0, 0) in the test case is the bottom-left corner.
        grid.set_cell(h - y - 1, x, reward=value, cell_type=CellType.REWARD if value != 0 else CellType.WALL)

    for i in range(h):
        for j in range(w):
            if grid.get_cell(i, j).get_cell_type() == CellType.EMPTY:
                grid.set_cell(i, j, step_cost=r)

    pib = PolicyIterationBoltzmann(grid, temperature=1.0, cooling_rate=0.995)

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    pib.run()
    end_time = time.perf_counter()

    policy_score = pib.evaluate_policy(episodes=1000)
    print("Policy Score:", policy_score)
    print(f"Policy Iteration Boltzmann Time: {end_time - start_time:.4f} seconds")

    # print_policy() emits its own "Policy:" header; only a blank separator
    # line is printed here so the header is not shown twice.
    print()
    pib.print_policy()
    print("=======================================")

# Test each case
# Each entry is (w, h, L, p, r): grid width, grid height, a list of
# (x, y, value) special cells, the move success probability, and the step cost.
# The driver function below treats value == 0 as a wall and any other value as
# a reward cell.
test_cases = [
    # Test case 1
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, -0.04),
    # Test case 2
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, 0.04),
    # Test case 3
    (4, 3, [(1,1,0),(3,2,1),(3,1,-1)], 0.8, -1),
    # Test case 4
    (12, 4, [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,1)], 1, -1),
    # Test case 5
    (12, 6, [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,1)], 0.9, -1),
    # Test case 6
    (5, 5, [(4,0,-10),(0,4,-10),(1,1,1),(3,3,2)], 0.9, -0.5),
    # Test case 7
    (5, 5, [(2,2,-2),(4,4,-1),(1,1,1),(3,3,2)], 0.9, -0.25),
    # Test case 8
    (7, 7, [(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.5),
    # Test case 9
    # NOTE(review): identical to test case 8 - confirm whether a different
    # configuration was intended.
    (7, 7, [(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.5),
    # Test case 10
    (7, 7, [(3,1,0),(3,5,0),(1,1,-4),(1,5,-6),(5,1,1),(5,5,4)], 0.8, -0.25)
]

# Run every configuration in order, numbering them from 1 in the output.
for i, (w, h, L, p, r) in enumerate(test_cases, start=1):
    print(f"\nRunning test case {i}")
    main_policy_iteration_boltzmann(w, h, L, p, r)



Running test case 1
Policy Score: 0.8799999999999959
Policy Iteration Boltzmann Time: 0.0044 seconds

Policy:
Policy:
→ → → P
↑ W ↑ N
↑ → ↑ ←

Utilities:
╒═════════╤═════════╤═════════╤═════════╕
│ 0.44903 │ 0.58588 │ 0.76902 │    1    │
├─────────┼─────────┼─────────┼─────────┤
│ 0.32858 │    0    │ 0.58588 │   -1    │
├─────────┼─────────┼─────────┼─────────┤
│ 0.23768 │ 0.32858 │ 0.44903 │ 0.33694 │
╘═════════╧═════════╧═════════╧═════════╛

Running test case 2
Policy Score: 39.999999999999844
Policy Iteration Boltzmann Time: 0.0018 seconds

Policy:
Policy:
→ ↓ ↑ P
↑ W ↑ N
↑ → ↑ ←

Utilities:
╒═════════╤═════════╤═════════╤═════════╕
│ 1.04005 │ 1.04006 │ 1.04001 │    1    │
├─────────┼─────────┼─────────┼─────────┤
│ 1.03791 │    0    │ 1.03994 │   -1    │
├─────────┼─────────┼─────────┼─────────┤
│ 1.03367 │ 1.03724 │ 1.03991 │ 1.03909 │
╘═════════╧═════════╧═════════╧═════════╛

Running test case 3
Policy Score: -2.0
Policy Iteration Boltzmann Time: 0.0162 seconds

Policy:
Polic

In [None]:
#----------------------
#Model-free RL (Q-Learning)
#----------------------
import numpy as np
import random
from tabulate import tabulate

class GridGameQLearning:
    """Tabular Q-learning agent on a rectangular grid world.

    States are ``[row, col]`` lists.  Actions are indices into ``self.actions``
    (0 Right, 1 Left, 2 Down, 3 Up).  A cell whose entry in ``rewards`` is
    non-zero ends the episode.
    """

    # For each action index, the (right-hand, left-hand) action indices tried,
    # in that order, when the intended move fails.
    _SLIP_TURNS = ((2, 3), (3, 2), (1, 0), (0, 1))

    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9):
        """Store the world description and start with an all-zero Q-table.

        Args:
            rows: number of grid rows.
            cols: number of grid columns.
            rewards: ``rows`` x ``cols`` nested sequence; non-zero marks a terminal cell.
            success_probabilities: dict mapping a move tuple such as ``(0, 1)``
                to the probability that the move goes as intended.
            costs: ``rows`` x ``cols`` nested sequence of step costs.
            walls: list of ``[row, col]`` lists that cannot be entered.
            gamma: discount factor used in the Q-learning update.
        """
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities
        self.costs = costs
        self.walls = walls
        self.gamma = gamma
        self.q_table = np.zeros((rows, cols, 4))
        self.policy = np.zeros((rows, cols), dtype=int)
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]

    def is_valid_location(self, location):
        """Return True when ``location`` is inside the grid and not in ``walls``."""
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def get_next_state(self, state, action):
        """Sample the state reached by taking ``action`` in ``state``.

        If the intended cell is blocked the agent stays put.  Otherwise the
        move succeeds with the configured probability; on failure the agent
        goes to the right-hand perpendicular cell if it is free, else the
        left-hand one, else it stays put.
        """
        move = self.actions[action]
        next_state = [state[0] + move[0], state[1] + move[1]]

        # Check if the move results in hitting a wall
        if not self.is_valid_location(next_state):
            return state

        # Determine if the move is successful based on probability
        success_prob = self.success_probabilities.get(tuple(move), 0)
        if random.uniform(0, 1) > success_prob:
            for turn in self._SLIP_TURNS[action]:
                d_row, d_col = self.actions[turn]
                candidate = [state[0] + d_row, state[1] + d_col]
                if self.is_valid_location(candidate):
                    return candidate
            return state

        return next_state

    def choose_action(self, state, epsilon):
        """Pick a random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() < epsilon:
            return np.random.choice(range(4))
        else:
            row, col = state
            return np.argmax(self.q_table[row, col])

    def q_learning(self, num_episodes=1000, alpha=0.1, epsilon=0.1):
        """Train the Q-table and derive the greedy policy from it.

        Each episode starts in a uniformly random cell and runs until a cell
        with a non-zero reward is reached.
        NOTE(review): the random start may be a wall cell, and there is no
        per-episode step limit.
        """
        for episode in range(num_episodes):
            state = [random.randint(0, self.rows - 1), random.randint(0, self.cols - 1)]
            while self.rewards[state[0]][state[1]] == 0:
                action = self.choose_action(state, epsilon)
                next_state = self.get_next_state(state, action)

                # Calculate reward based on next state
                reward = self.rewards[next_state[0]][next_state[1]] + self.costs[state[0]][state[1]]

                row, col = state
                next_row, next_col = next_state

                # Adjust reward based on success probability and step cost:
                # an extra (1 - success_prob) share of the step cost is added.
                success_prob = self.success_probabilities.get(tuple(self.actions[action]), 0)
                effective_reward = reward + (1 - success_prob) * self.costs[state[0]][state[1]]

                self.q_table[row, col, action] += alpha * (
                    effective_reward + self.gamma * np.max(self.q_table[next_row, next_col]) - self.q_table[row, col, action])

                state = next_state

        for row in range(self.rows):
            for col in range(self.cols):
                self.policy[row][col] = np.argmax(self.q_table[row, col])

    def evaluate_policy(self, episodes=10000, epsilon=0.1):
        """Average the return of epsilon-greedy rollouts starting at cell (0, 0).

        Each step earns the reward of the cell that is entered plus the step
        cost of the cell that is left, matching the reward used in
        ``q_learning``.  (Reading the reward of the current cell instead would
        always yield 0 inside the loop and drop every terminal reward.)
        """
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while self.rewards[state[0]][state[1]] == 0:
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(4))
                else:
                    action = self.policy[state[0], state[1]]
                next_state = self.get_next_state(state, action)
                reward = self.rewards[next_state[0]][next_state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        """Print the greedy policy as arrows; ``W`` wall, ``P``/``N`` reward cells."""
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'  # Wall
                elif self.rewards[row][col] > 0:
                    policy[row][col] = 'P'  # Positive reward
                elif self.rewards[row][col] < 0:
                    policy[row][col] = 'N'  # Negative reward
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

    def print_utilities(self):
        """Print a table of the best Q-value per cell.

        Walls show as ``W`` and reward cells show their reward value.
        """
        utilities_table = [["{:.5f}".format(np.max(self.q_table[row, col])) for col in range(self.cols)] for row in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    utilities_table[row][col] = 'W'  # Wall
                elif self.rewards[row][col] != 0:
                    # Store the plain value; wrapping it in braces would build
                    # a set and render as "{1}" in the table.
                    utilities_table[row][col] = self.rewards[row][col]
        headers = [f"Col {col}" for col in range(self.cols)]
        print("\nUtilities:")
        print(tabulate(utilities_table, headers=headers, tablefmt="grid"))


def main_q_learning_multiple():
    """Train and report a Q-learning agent on each built-in test case.

    For every case the grid is built from ``L`` (``(x, y, value)`` triples with
    ``y`` counted from the bottom row; ``value == 0`` is a wall, anything else
    a reward), the agent is trained for 30000 episodes, and the policy score,
    training time, policy and utilities are printed.
    """
    # Imported locally because this cell's own import list does not include it.
    import time

    test_cases = [
        {
            "w": 4,
            "h": 3,
            "L": [(1, 1, 0), (3, 2, 1), (3, 1, -1)],
            "p": 0.8,
            "r": -0.04
        },
        {
            "w": 4,
            "h": 3,
            "L": [(1, 1, 0), (3, 2, 1), (3, 1, -1)],
            "p": 0.8,
            "r": 0.04
        },
        {
            "w": 4,
            "h": 3,
            "L": [(1, 1, 0), (3, 2, 1), (3, 1, -1)],
            "p": 0.8,
            "r": -1
        },
        {
            "w": 12,
            "h": 4,
            "L": [(1, 0, -100), (2, 0, -100), (3, 0, -100), (4, 0, -100), (5, 0, -100), (6, 0, -100), (7, 0, -100), (8, 0, -100), (9, 0, -100), (10, 0, -100), (11, 0, 1)],
            "p": 1,
            "r": -1
        },
        {
            "w": 12,
            "h": 6,
            "L": [(1, 0, -100), (2, 0, -100), (3, 0, -100), (4, 0, -100), (5, 0, -100), (6, 0, -100), (7, 0, -100), (8, 0, -100), (9, 0, -100), (10, 0, -100), (11, 0, 1)],
            "p": 0.9,
            "r": -1
        },
        {
            "w": 5,
            "h": 5,
            "L": [(4, 0, -10), (0, 4, -10), (1, 1, 1), (3, 3, 2)],
            "p": 0.9,
            "r": -0.5
        },
        {
            "w": 5,
            "h": 5,
            "L": [(2, 2, -2), (4, 4, -1), (1, 1, 1), (3, 3, 2)],
            "p": 0.9,
            "r": -0.25
        },
        {
            "w": 7,
            "h": 7,
            "L": [(1, 1, -4), (1, 5, -6), (5, 1, 1), (5, 5, 4)],
            "p": 0.8,
            "r": -0.5
        },
        # NOTE(review): identical to the previous case - confirm whether a
        # different configuration was intended.
        {
            "w": 7,
            "h": 7,
            "L": [(1, 1, -4), (1, 5, -6), (5, 1, 1), (5, 5, 4)],
            "p": 0.8,
            "r": -0.5
        },
        {
            "w": 7,
            "h": 7,
            "L": [(3, 1, 0), (3, 5, 0), (1, 1, -4), (1, 5, -6), (5, 1, 1), (5, 5, 4)],
            "p": 0.8,
            "r": -0.25
        }
    ]

    for idx, test_case in enumerate(test_cases):
        w = test_case["w"]
        h = test_case["h"]
        L = test_case["L"]
        p = test_case["p"]
        r = test_case["r"]

        rewards = [[0 for _ in range(w)] for _ in range(h)]
        costs = [[r for _ in range(w)] for _ in range(h)]
        walls = []

        for x, y, value in L:
            if value == 0:
                walls.append([h - y - 1, x])  # Convert (x, y) to (row, col) with (0,0) at lower-left
            else:
                rewards[h - y - 1][x] = value

        # The same success probability applies to all four directions.
        success_probabilities = {
            (0, 1): p,
            (0, -1): p,
            (1, 0): p,
            (-1, 0): p
        }

        print(f"Running test case {idx + 1}...")
        game_q_learning = GridGameQLearning(h, w, rewards, success_probabilities, costs, walls)
        # perf_counter is monotonic, so the interval is unaffected by clock changes.
        start_time = time.perf_counter()
        game_q_learning.q_learning(num_episodes=30000, alpha=0.1, epsilon=0.1)
        end_time = time.perf_counter()
        policy_score = game_q_learning.evaluate_policy(episodes=1000, epsilon=0.1)
        print(f"Policy Score for test case {idx + 1}: {policy_score}")
        print(f"Q-learning Time for test case {idx + 1}: {end_time - start_time:.4f} seconds")
        # These two headers need the f prefix; without it "{idx + 1}" is
        # printed literally.
        print(f"\nPolicy for test case {idx + 1}:")
        game_q_learning.print_policy()
        print(f"\nUtilities for test case {idx + 1}:")
        game_q_learning.print_utilities()
        print("-----------------------------------\n")

# Run the Q-learning experiments only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main_q_learning_multiple()

Running test case 1...
Policy Score for test case 1: -0.19200000000000048
Q-learning Time for test case 1: 2.4034 seconds

Policy for test case {idx + 1}:
→ → → P
↑ W ↑ N
↑ ← ← ←

Utilities for test case {idx + 1}:

Utilities:
+---------+---------+---------+----------+
|   Col 0 | Col 1   |   Col 2 | Col 3    |
| 0.51105 | 0.67108 | 0.84133 | {1}      |
+---------+---------+---------+----------+
| 0.39674 | W       | 0.28232 | {-1}     |
+---------+---------+---------+----------+
| 0.24967 | 0.16743 | 0.1092  | -0.04339 |
+---------+---------+---------+----------+
-----------------------------------

Running test case 2...
Policy Score for test case 2: 2.66228
Q-learning Time for test case 2: 5.7529 seconds

Policy for test case {idx + 1}:
→ ↑ → P
↑ W ← N
↑ ← ← →

Utilities for test case {idx + 1}:

Utilities:
+---------+---------+---------+---------+
|   Col 0 | Col 1   |   Col 2 | Col 3   |
| 0.80014 | 0.79198 | 0.81494 | {1}     |
+---------+---------+---------+---------+
| 0.77346 

In [None]:
import numpy as np


def is_number(value):
    """Report whether ``value`` can be converted with ``float()``."""
    try:
        float(value)
    except (ValueError, TypeError):
        # Non-numeric strings raise ValueError; None and containers raise TypeError.
        return False
    else:
        return True

def average_distance(matrix1, matrix2):
    """Return the mean absolute difference over positions numeric in both matrices.

    Positions where either entry is not accepted by ``is_number`` are skipped.
    Returns 0 when no position qualifies.

    Raises:
        ValueError: if the matrices differ in row count or in any row length.
    """
    shapes_match = len(matrix1) == len(matrix2) and all(
        len(left_row) == len(right_row) for left_row, right_row in zip(matrix1, matrix2)
    )
    if not shapes_match:
        raise ValueError("Matrices must have the same dimensions")

    running_sum = 0
    compared = 0
    for left_row, right_row in zip(matrix1, matrix2):
        for left, right in zip(left_row, right_row):
            if not (is_number(left) and is_number(right)):
                continue
            running_sum += abs(float(left) - float(right))
            compared += 1

    # Guard against dividing by zero when nothing was comparable.
    return running_sum / compared if compared else 0


# Example matrices
# Two 3x4 tables of per-cell values to compare.
# NOTE(review): matrix1 and matrix2 currently hold identical values, so the
# average distance printed below is 0.0; replace one of them with the other
# table to be compared.
matrix1 = [
    [0.5144, 0.66944, 0.87395, 1],
    [0.40442, 0, 0.14432, -1],
    [0.30061, 0.19045, 0.15729, -0.1208]
]

matrix2 = [
    [0.5144, 0.66944, 0.87395, 1],
    [0.40442, 0, 0.14432, -1],
    [0.30061, 0.19045, 0.15729, -0.1208]
]





# Calculate the average distance
avg_distance = average_distance(matrix1, matrix2)
print("Average Distance:", avg_distance)

# Function to calculate absolute difference or return '---' if any element is '--------------' or None
def calculate_difference(a, b):
    """Return ``abs(a - b)``, or ``'---'`` when either operand is a placeholder.

    An operand counts as a placeholder when it is ``None`` or equal to the
    string ``'--------------'``.
    """
    placeholder = '--------------'
    for operand in (a, b):
        if operand is None or operand == placeholder:
            return '---'
    return abs(a - b)

# Initialize a list to store results
# Each entry is a ("(row,col)" label, difference) pair.
differences = []

# Iterate through each element in matrix1 and matrix2
# (matrix2 is indexed with matrix1's row and column ranges).
for i in range(len(matrix1)):
    for j in range(len(matrix1[i])):
        diff = calculate_difference(matrix1[i][j], matrix2[i][j])
        differences.append((f"({i},{j})", diff))

# Print the results
# Labels and differences are each joined into one space-separated line, in the
# same order, so the two lines line up entry by entry.
index_values = " ".join([str(index) for index, diff in differences])
diff_values = " ".join([str(diff) for index, diff in differences])

# Print to console for copying
print("Index values:")
print(index_values)
print("\nDiff values:")
print(diff_values)

Average Distance: 0.0
Index values:
(0,0) (0,1) (0,2) (0,3) (1,0) (1,1) (1,2) (1,3) (2,0) (2,1) (2,2) (2,3)

Diff values:
0.0 0.0 0.0 0 0.0 0 0.0 0 0.0 0.0 0.0 0.0
