<a href="https://colab.research.google.com/github/matanaaa14/ai_task2/blob/main/Copy_of_planing_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

class GridGameRL:
    def __init__(self, rows, cols, rewards, move_probabilities, costs, goal, avoid, gamma=0.9, epsilon=0.1):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.probabilities = move_probabilities  # Dictionary of move: probability
        self.costs = costs  # Cost matrix
        self.goal = goal  # Goal location
        self.avoid = avoid  # Avoid location
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_values = np.zeros((rows, cols, 4))  # Q-values for each state-action pair
        self.returns = {}  # Dictionary to store returns for state-action pairs
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

        # Initialize returns dictionary
        for row in range(rows):
            for col in range(cols):
                for action in range(4):
                    self.returns[((row, col), action)] = []

    def is_valid_location(self, location):
        return 0 <= location[0] < self.rows and 0 <= location[1] < self.cols

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(4))  # Explore: choose a random action
        else:
            return np.argmax(self.q_values[state[0], state[1]])  # Exploit: choose the best action

    def take_action(self, state, action):
        move = self.actions[action]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state  # Invalid move: stay in the same state
        return next_state

    def generate_episode(self):
        episode = []
        state = [0, 0]  # Start at the initial position
        while state != self.goal and state != self.avoid:
            action = self.choose_action(state)
            next_state = self.take_action(state, action)
            reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
            episode.append((state, action, reward))
            state = next_state
        return episode

    def update_q_values(self, episode):
        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if (tuple(state), action) not in visited:
                self.returns[(tuple(state), action)].append(G)
                self.q_values[state[0], state[1], action] = np.mean(self.returns[(tuple(state), action)])
                visited.add((tuple(state), action))

    def train(self, episodes=1000):
        for episode_num in range(episodes):
            episode = self.generate_episode()
            self.update_q_values(episode)

    def evaluate_policy(self, episodes=100):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while state != self.goal and state != self.avoid:
                action = self.choose_action(state)
                next_state = self.take_action(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] == self.goal:
                    policy[row][col] = 'G'
                elif [row, col] == self.avoid:
                    policy[row][col] = 'X'
                else:
                    best_action = np.argmax(self.q_values[row, col])
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 5
cols = 5
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[4][4] = 10  # Goal reward
rewards[2][2] = -10  # Avoid penalty

probability = {
    (0, 1): 0.25,  # 25% chance to move right
    (0, -1): 0.25, # 25% chance to move left
    (1, 0): 0.25,  # 25% chance to move down
    (-1, 0): 0.25  # 25% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

goal = [4, 4]
avoid = [2, 2]

# Train and evaluate the first policy
game_rl_1 = GridGameRL(rows, cols, rewards, probability, costs, goal, avoid)
game_rl_1.train(episodes=10000)
policy_1_score = game_rl_1.evaluate_policy(episodes=1000)
print("Policy 1 Score:", policy_1_score)

# Train and evaluate the second policy
game_rl_2 = GridGameRL(rows, cols, rewards, probability, costs, goal, avoid)
game_rl_2.epsilon = 0.2  # Changing epsilon for the second policy as an example
game_rl_2.train(episodes=10000)
policy_2_score = game_rl_2.evaluate_policy(episodes=1000)
print("Policy 2 Score:", policy_2_score)

# Compare policies
if policy_1_score > policy_2_score:
    print("Policy 1 is better.")
elif policy_1_score < policy_2_score:
    print("Policy 2 is better.")
else:
    print("Both policies are equally good.")

# Print policies for visual comparison
print("\nPolicy 1:")
game_rl_1.print_policy()
print("\nPolicy 2:")
game_rl_2.print_policy()

Policy 1 Score: -4.394
Policy 2 Score: -4.913
Policy 1 is better.

Policy 1:
→ ↓ ↓ ↓ ↓
→ ↓ ↓ ← ↓
→ → X ← ↑
↑ → ↑ ← ↓
↑ ← ↑ → G

Policy 2:
↓ ↓ ↓ ← ↓
→ ↓ ↓ ← ↓
→ → X ← ↓
→ ↑ ↑ ↑ ↓
↑ ← → → G


In [None]:
import numpy as np
import random

class GridGameRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, goal, avoid, walls, gamma=0.9, epsilon=0.1):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.goal = goal  # Goal location
        self.avoid = avoid  # Avoid location
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_values = np.zeros((rows, cols, 4))  # Q-values for each state-action pair
        self.returns = {}  # Dictionary to store returns for state-action pairs
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

        # Initialize returns dictionary
        for row in range(rows):
            for col in range(cols):
                for action in range(4):
                    self.returns[((row, col), action)] = []

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(4))  # Explore: choose a random action
        else:
            return np.argmax(self.q_values[state[0], state[1]])  # Exploit: choose the best action

    def take_action(self, state, action):
        move = self.actions[action]
        success_prob = self.success_probabilities[move]
        if random.uniform(0, 1) < success_prob:
            next_state = [state[0] + move[0], state[1] + move[1]]
            if not self.is_valid_location(next_state):
                next_state = state  # Invalid move: stay in the same state
        else:
            next_state = state  # Move failed: stay in the same state
        return next_state

    def generate_episode(self):
        episode = []
        state = [0, 0]  # Start at the initial position
        while state != self.goal and state != self.avoid:
            action = self.choose_action(state)
            next_state = self.take_action(state, action)
            reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
            episode.append((state, action, reward))
            state = next_state
        return episode

    def update_q_values(self, episode):
        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if (tuple(state), action) not in visited:
                self.returns[(tuple(state), action)].append(G)
                self.q_values[state[0], state[1], action] = np.mean(self.returns[(tuple(state), action)])
                visited.add((tuple(state), action))

    def train(self, episodes=1000):
        for episode_num in range(episodes):
            episode = self.generate_episode()
            self.update_q_values(episode)

    def evaluate_policy(self, episodes=100):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while state != self.goal and state != self.avoid:
                action = self.choose_action(state)
                next_state = self.take_action(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] == self.goal:
                    policy[row][col] = 'G'
                elif [row, col] == self.avoid:
                    policy[row][col] = 'X'
                elif [row, col] in self.walls:
                    policy[row][col] = 'W'
                else:
                    best_action = np.argmax(self.q_values[row, col])
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 5
cols = 5
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[4][4] = 10  # Goal reward
rewards[2][2] = -10  # Avoid penalty

success_probabilities = {
    (0, 1): 0.8,  # 80% chance to move right
    (0, -1): 0.8, # 80% chance to move left
    (1, 0): 0.8,  # 80% chance to move down
    (-1, 0): 0.8  # 80% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

goal = [4, 4]
avoid = [2, 2]
walls = [[1, 1], [1, 2], [2, 1]]  # Example walls

# Train and evaluate the first policy
game_rl_1 = GridGameRL(rows, cols, rewards, success_probabilities, costs, goal, avoid, walls)
game_rl_1.train(episodes=10000)
policy_1_score = game_rl_1.evaluate_policy(episodes=1000)
print("Policy 1 Score:", policy_1_score)

# Train and evaluate the second policy
game_rl_2 = GridGameRL(rows, cols, rewards, success_probabilities, costs, goal, avoid, walls)
game_rl_2.epsilon = 0.2  # Changing epsilon for the second policy as an example
game_rl_2.train(episodes=10000)
policy_2_score = game_rl_2.evaluate_policy(episodes=1000)
print("Policy 2 Score:", policy_2_score)

# Compare policies
if policy_1_score > policy_2_score:
    print("Policy 1 is better.")
elif policy_1_score < policy_2_score:
    print("Policy 2 is better.")
else:
    print("Both policies are equally good.")

# Print policies for visual comparison
print("\nPolicy 1:")
game_rl_1.print_policy()
print("\nPolicy 2:")
game_rl_2.print_policy()


Policy 1 Score: -8.422
Policy 2 Score: -9.555
Policy 1 is better.

Policy 1:
↓ ← ← ↑ ↓
↓ W W ↓ ↓
↓ W X ← ←
→ → ↑ ↑ ↓
↑ ↑ → → G

Policy 2:
↓ ← ← ↓ ↓
↓ W W ↑ ↓
↓ W X ← ↓
→ → ↑ ← ↓
→ → ↑ → G


In [None]:
import numpy as np
import random

class GridGameRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9, epsilon=0.1):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_values = np.zeros((rows, cols, 4))  # Q-values for each state-action pair
        self.returns = {}  # Dictionary to store returns for state-action pairs
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

        # Initialize returns dictionary
        for row in range(rows):
            for col in range(cols):
                for action in range(4):
                    self.returns[((row, col), action)] = []

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(4))  # Explore: choose a random action
        else:
            return np.argmax(self.q_values[state[0], state[1]])  # Exploit: choose the best action

    def take_action(self, state, action):
        move = self.actions[action]
        success_prob = self.success_probabilities[move]
        if random.uniform(0, 1) < success_prob:
            next_state = [state[0] + move[0], state[1] + move[1]]
            if not self.is_valid_location(next_state):
                next_state = state  # Invalid move: stay in the same state
        else:
            next_state = state  # Move failed: stay in the same state
        return next_state

    def generate_episode(self):
        episode = []
        state = [0, 0]  # Start at the initial position
        while True:
            action = self.choose_action(state)
            next_state = self.take_action(state, action)
            reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
            episode.append((state, action, reward))
            if self.rewards[state[0]][state[1]] != 0:
                break
            state = next_state
        return episode

    def update_q_values(self, episode):
        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if (tuple(state), action) not in visited:
                self.returns[(tuple(state), action)].append(G)
                self.q_values[state[0], state[1], action] = np.mean(self.returns[(tuple(state), action)])
                visited.add((tuple(state), action))

    def train(self, episodes=1000):
        for episode_num in range(episodes):
            episode = self.generate_episode()
            self.update_q_values(episode)

    def evaluate_policy(self, episodes=100):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while True:
                action = self.choose_action(state)
                next_state = self.take_action(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                if self.rewards[state[0]][state[1]] != 0:
                    break
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'
                elif self.rewards[row][col] > 0:
                    policy[row][col] = '+{}'.format(int(self.rewards[row][col]))
                elif self.rewards[row][col] < 0:
                    policy[row][col] = '{}'.format(int(self.rewards[row][col]))
                else:
                    best_action = np.argmax(self.q_values[row, col])
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 4
cols = 3
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[3][2] = 1  # Negative reward
rewards[3][1] = -1  # Negative reward
success_probabilities = {
    (0, 1): 0.8,  # 80% chance to move right
    (0, -1): 0.8, # 80% chance to move left
    (1, 0): 0.8,  # 80% chance to move down
    (-1, 0): 0.8  # 80% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

walls = [[1, 1]]  # Example walls

# Train and evaluate the first policy
game_rl_1 = GridGameRL(rows, cols, rewards, success_probabilities, costs, walls)
game_rl_1.train(episodes=10000)
policy_1_score = game_rl_1.evaluate_policy(episodes=1000)
print("Policy 1 Score:", policy_1_score)

# Train and evaluate the second policy
game_rl_2 = GridGameRL(rows, cols, rewards, success_probabilities, costs, walls)
game_rl_2.epsilon = 0.01  # Changing epsilon for the second policy as an example
game_rl_2.train(episodes=10000)
policy_2_score = game_rl_2.evaluate_policy(episodes=1000)
print("Policy 2 Score:", policy_2_score)

# Compare policies
if policy_1_score > policy_2_score:
    print("Policy 1 is better.")
elif policy_1_score < policy_2_score:
    print("Policy 2 is better.")
else:
    print("Both policies are equally good.")

# Print policies for visual comparison
print("\nPolicy 1:")
game_rl_1.print_policy()
print("\nPolicy 2:")
game_rl_2.print_policy()


Policy 1 Score: -6.902
Policy 2 Score: -7.028
Policy 1 is better.

Policy 1:
→ → ↓
↓ W ↓
→ → ↓
→ -1 +1

Policy 2:
↓ ← →
↓ W →
↓ ↓ ↓
→ -1 +1


In [None]:
import numpy as np
import random

class GridGameRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, goal, avoid, walls, gamma=0.9, epsilon=0.1):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.goal = goal  # Goal location
        self.avoid = avoid  # Avoid location
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_values = np.zeros((rows, cols, 4))  # Q-values for each state-action pair
        self.returns = {}  # Dictionary to store returns for state-action pairs
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

        # Initialize returns dictionary
        for row in range(rows):
            for col in range(cols):
                for action in range(4):
                    self.returns[((row, col), action)] = []

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(4))  # Explore: choose a random action
        else:
            return np.argmax(self.q_values[state[0], state[1]])  # Exploit: choose the best action

    def take_action(self, state, action):
        move = self.actions[action]
        success_prob = self.success_probabilities[move]
        if random.uniform(0, 1) < success_prob:
            next_state = [state[0] + move[0], state[1] + move[1]]
            if not self.is_valid_location(next_state):
                next_state = state  # Invalid move: stay in the same state
        else:
            next_state = state  # Move failed: stay in the same state
        return next_state

    def generate_episode(self):
        episode = []
        state = [0, 0]  # Start at the initial position
        while state != self.goal and state != self.avoid:
            action = self.choose_action(state)
            next_state = self.take_action(state, action)
            reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
            episode.append((state, action, reward))
            state = next_state
        return episode

    def update_q_values(self, episode):
        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if (tuple(state), action) not in visited:
                self.returns[(tuple(state), action)].append(G)
                self.q_values[state[0], state[1], action] = np.mean(self.returns[(tuple(state), action)])
                visited.add((tuple(state), action))

    def train(self, episodes=1000):
        for episode_num in range(episodes):
            episode = self.generate_episode()
            self.update_q_values(episode)

    def evaluate_policy(self, episodes=100):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while state != self.goal and state != self.avoid:
                action = self.choose_action(state)
                next_state = self.take_action(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] == self.goal:
                    policy[row][col] = 'G'
                elif [row, col] == self.avoid:
                    policy[row][col] = 'X'
                elif [row, col] in self.walls:
                    policy[row][col] = 'W'
                else:
                    best_action = np.argmax(self.q_values[row, col])
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 5
cols = 5
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[4][4] = 10  # Goal reward
rewards[2][2] = -10  # Avoid penalty

success_probabilities = {
    (0, 1): 0.8,  # 80% chance to move right
    (0, -1): 0.8, # 80% chance to move left
    (1, 0): 0.8,  # 80% chance to move down
    (-1, 0): 0.8  # 80% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

goal = [4, 4]
avoid = [2, 2]
walls = [[1, 1], [1, 2], [2, 1]]  # Example walls

# Train and evaluate the first policy
game_rl_1 = GridGameRL(rows, cols, rewards, success_probabilities, costs, goal, avoid, walls)
game_rl_1.train(episodes=10000)
policy_1_score = game_rl_1.evaluate_policy(episodes=1000)
print("Policy 1 Score:", policy_1_score)

# Train and evaluate the second policy
game_rl_2 = GridGameRL(rows, cols, rewards, success_probabilities, costs, goal, avoid, walls)
game_rl_2.epsilon = 0.2  # Changing epsilon for the second policy as an example
game_rl_2.train(episodes=10000)
policy_2_score = game_rl_2.evaluate_policy(episodes=1000)
print("Policy 2 Score:", policy_2_score)

# Compare policies
if policy_1_score > policy_2_score:
    print("Policy 1 is better.")
elif policy_1_score < policy_2_score:
    print("Policy 2 is better.")
else:
    print("Both policies are equally good.")

# Print policies for visual comparison
print("\nPolicy 1:")
game_rl_1.print_policy()
print("\nPolicy 2:")
game_rl_2.print_policy()


Policy 1 Score: -8.585
Policy 2 Score: -9.632
Policy 1 is better.

Policy 1:
↓ ← ↓ ↓ ↑
↓ W W ↓ ↓
↓ W X ← ←
→ → ↑ ↑ ↓
→ → ↑ → G

Policy 2:
↓ ← → ↓ →
↓ W W ↓ ↓
↓ W X ← ←
→ → ↑ → ↓
→ → → → G


In [None]:
import numpy as np
import random
# first try of RL
class GridGameMBRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.value_table = np.zeros((rows, cols))  # Value function for each state
        self.policy = np.zeros((rows, cols), dtype=int)  # Policy for each state
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def get_next_state(self, state, action):
        move = self.actions[action]
        success_prob = self.success_probabilities[move]
        if random.uniform(0, 1) < success_prob:
            next_state = [state[0] + move[0], state[1] + move[1]]
            if not self.is_valid_location(next_state):
                next_state = state  # Invalid move: stay in the same state
        else:
            next_state = state  # Move failed: stay in the same state
        return next_state

    def value_iteration(self, theta=0.0001):
        while True:
            delta = 0
            for row in range(self.rows):
                for col in range(self.cols):
                    if [row, col] in self.walls:
                        continue
                    old_value = self.value_table[row, col]
                    new_value = float('-inf')
                    for action in range(4):
                        next_state = self.get_next_state([row, col], action)
                        reward = self.rewards[row][col] + self.costs[row][col]
                        value = reward + self.gamma * self.value_table[next_state[0], next_state[1]]
                        if value > new_value:
                            new_value = value
                            self.policy[row, col] = action
                    self.value_table[row, col] = new_value
                    delta = max(delta, abs(old_value - new_value))
            if delta < theta:
                break

    def evaluate_policy(self, episodes=100):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while self.rewards[state[0]][state[1]] == 0:  # Continue until reaching a terminal state
                action = self.policy[state[0], state[1]]
                next_state = self.get_next_state(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 5
cols = 5
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[2][3] = 1  # Positive reward
rewards[4][4] = -1  # Negative reward

success_probabilities = {
    (0, 1): 0.8,  # 80% chance to move right
    (0, -1): 0.8, # 80% chance to move left
    (1, 0): 0.8,  # 80% chance to move down
    (-1, 0): 0.8  # 80% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

walls = [[1, 1], [1, 2], [2, 1]]  # Example walls

# Train and evaluate the policy
game_mbrl = GridGameMBRL(rows, cols, rewards, success_probabilities, costs, walls)

import time

start_time = time.time()
game_mbrl.value_iteration()
end_time = time.time()

policy_score = game_mbrl.evaluate_policy(episodes=1000)
print("Policy Score:", policy_score)
print(f"Value Iteration Time: {end_time - start_time:.4f} seconds")

# Print policy for visual comparison
print("\nPolicy:")
game_mbrl.print_policy()


KeyboardInterrupt: 

In [4]:
import numpy as np
import random

class GridGameMBRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.value_table = np.zeros((rows, cols))  # Value function for each state
        self.policy = np.zeros((rows, cols), dtype=int)  # Policy for each state
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def get_next_state(self, state, action):
        move = self.actions[action]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state  # Invalid move: stay in the same state
        return next_state

    def value_iteration(self, theta=0.0001):
        while True:
            delta = 0
            new_value_table = np.copy(self.value_table)
            for row in range(self.rows):
                for col in range(self.cols):
                    if [row, col] in self.walls:
                        continue
                    old_value = self.value_table[row, col]
                    new_value = float('-inf')
                    for action in range(4):
                        next_state = self.get_next_state([row, col], action)
                        reward = self.rewards[row][col] + self.costs[row][col]
                        value = reward + self.gamma * self.value_table[next_state[0], next_state[1]]
                        if value > new_value:
                            new_value = value
                            self.policy[row, col] = action
                    new_value_table[row, col] = new_value
                    delta = max(delta, abs(old_value - new_value))
            self.value_table = new_value_table
            if delta < theta:
                break

    def evaluate_policy(self, episodes=10000):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while self.rewards[state[0]][state[1]] == 0:  # Continue until reaching a terminal state
                action = self.policy[state[0], state[1]]
                next_state = self.get_next_state(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

# Example setup:
rows = 5
cols = 5
rewards = [[0 for _ in range(cols)] for _ in range(rows)]  # Example reward matrix
rewards[2][3] = 1  # Positive reward
rewards[4][4] = -1  # Negative reward

success_probabilities = {
    (0, 1): 0.8,  # 80% chance to move right
    (0, -1): 0.8, # 80% chance to move left
    (1, 0): 0.8,  # 80% chance to move down
    (-1, 0): 0.8  # 80% chance to move up
}

costs = [[-1 for _ in range(cols)] for _ in range(rows)]  # Example cost matrix

walls = [[1, 1], [1, 2], [2, 1]]  # Example walls

# Train and evaluate the policy
game_mbrl = GridGameMBRL(rows, cols, rewards, success_probabilities, costs, walls)

import time

start_time = time.time()
game_mbrl.value_iteration()
end_time = time.time()

policy_score = game_mbrl.evaluate_policy(episodes=1000)
print("Policy Score:", policy_score)
print(f"Value Iteration Time: {end_time - start_time:.4f} seconds")

# Print policy for visual comparison
print("\nPolicy:")
game_mbrl.print_policy()


Policy Score: -5.0
Value Iteration Time: 0.0181 seconds

Policy:
→ → → ↓ ←
↓ W W ↓ ←
↓ W → → ←
→ → → ↑ ←
→ → → ↑ ←


In [9]:
import numpy as np
import random

class GridGameMBRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities  # Dictionary of move: success probability
        self.costs = costs  # Cost matrix
        self.walls = walls  # List of wall locations
        self.gamma = gamma  # Discount factor
        self.value_table = np.zeros((rows, cols))  # Value function for each state
        self.policy = np.zeros((rows, cols), dtype=int)  # Policy for each state
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Possible moves: right, left, down, up

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def get_next_state(self, state, action):
        move = self.actions[action]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state  # Invalid move: stay in the same state
        return next_state

    def value_iteration(self, theta=0.0001):
        while True:
            delta = 0
            new_value_table = np.copy(self.value_table)
            for row in range(self.rows):
                for col in range(self.cols):
                    if [row, col] in self.walls:
                        continue
                    old_value = self.value_table[row, col]
                    new_value = float('-inf')
                    for action in range(4):
                        next_state = self.get_next_state([row, col], action)
                        reward = self.rewards[row][col] + self.costs[row][col]
                        value = reward + self.gamma * self.value_table[next_state[0], next_state[1]]
                        if value > new_value:
                            new_value = value
                            self.policy[row, col] = action
                    new_value_table[row, col] = new_value
                    delta = max(delta, abs(old_value - new_value))
            self.value_table = new_value_table
            if delta < theta:
                break

    def evaluate_policy(self, episodes=10000):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while self.rewards[state[0]][state[1]] == 0:  # Continue until reaching a terminal state
                action = self.policy[state[0], state[1]]
                next_state = self.get_next_state(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

def main():
    # Set up the grid parameters
    w = 12
    h = 4
    L = [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,0)]
    p = 1
    r = -1

    # Initialize the rewards, costs, and walls
    rewards = [[0 for _ in range(w)] for _ in range(h)]
    costs = [[r for _ in range(w)] for _ in range(h)]
    walls = []

    for x, y, value in L:
        if value == 0:
            walls.append([y, x])  # Note the (x, y) to (row, col) conversion
        else:
            rewards[y][x] = value

    # Define success probabilities
    success_probabilities = {
        (0, 1): p,  # 80% chance to move right
        (0, -1): p, # 80% chance to move left
        (1, 0): p,  # 80% chance to move down
        (-1, 0): p  # 80% chance to move up
    }

    # Create the GridGameMBRL instance
    game_mbrl = GridGameMBRL(h, w, rewards, success_probabilities, costs, walls)

    # Perform value iteration to find the best policy
    import time
    start_time = time.time()
    game_mbrl.value_iteration()
    end_time = time.time()

    # Evaluate the policy
    policy_score = game_mbrl.evaluate_policy(episodes=1000)
    print("Policy Score:", policy_score)
    print(f"Value Iteration Time: {end_time - start_time:.4f} seconds")

    # Print the policy for visual comparison
    print("\nPolicy:")
    game_mbrl.print_policy()

if __name__ == "__main__":
    main()


KeyboardInterrupt: 

In [12]:
import numpy as np
import random

class GridGameMBRL:
    def __init__(self, rows, cols, rewards, success_probabilities, costs, walls, gamma=0.9):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.success_probabilities = success_probabilities
        self.costs = costs
        self.walls = walls
        self.gamma = gamma
        self.value_table = np.zeros((rows, cols))
        self.policy = np.zeros((rows, cols), dtype=int)
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]

    def is_valid_location(self, location):
        return (0 <= location[0] < self.rows and
                0 <= location[1] < self.cols and
                location not in self.walls)

    def get_next_state(self, state, action):
        move = self.actions[action]
        next_state = [state[0] + move[0], state[1] + move[1]]
        if not self.is_valid_location(next_state):
            next_state = state
        return next_state

    def value_iteration(self, theta=0.0001):
        while True:
            delta = 0
            new_value_table = np.copy(self.value_table)
            for row in range(self.rows):
                for col in range(self.cols):
                    if [row, col] in self.walls:
                        continue
                    old_value = self.value_table[row, col]
                    new_value = float('-inf')
                    for action in range(4):
                        next_state = self.get_next_state([row, col], action)
                        reward = self.rewards[row][col] + self.costs[row][col]
                        value = reward + self.gamma * self.value_table[next_state[0], next_state[1]]
                        if value > new_value:
                            new_value = value
                            self.policy[row, col] = action
                    new_value_table[row, col] = new_value
                    delta = max(delta, abs(old_value - new_value))
            self.value_table = new_value_table
            if delta < theta:
                break

    def evaluate_policy(self, episodes=10000, epsilon=0.1):
        total_return = 0
        for _ in range(episodes):
            state = [0, 0]
            episode_return = 0
            while self.rewards[state[0]][state[1]] == 0:
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(4))
                else:
                    action = self.policy[state[0], state[1]]
                next_state = self.get_next_state(state, action)
                reward = self.rewards[state[0]][state[1]] + self.costs[state[0]][state[1]]
                episode_return += reward
                state = next_state
            total_return += episode_return
        return total_return / episodes

    def print_policy(self):
        direction_mapping = ['→', '←', '↓', '↑']
        policy = [[None for _ in range(self.cols)] for _ in range(self.rows)]
        for row in range(self.rows):
            for col in range(self.cols):
                if [row, col] in self.walls:
                    policy[row][col] = 'W'  # Wall
                elif self.rewards[row][col] > 0:
                    policy[row][col] = 'P'  # Positive reward
                elif self.rewards[row][col] < 0:
                    policy[row][col] = 'N'  # Negative reward
                else:
                    best_action = self.policy[row, col]
                    policy[row][col] = direction_mapping[best_action]
        for row in policy:
            print(" ".join(row))

def main():
    # Set up the grid parameters
    w = 12
    h = 4
    L = [(1,0,-100),(2,0,-100),(3,0,-100),(4,0,-100),(5,0,-100),(6,0,-100),(7,0,-100),(8,0,-100),(9,0,-100),(10,0,-100),(11,0,0)]
    p = 1
    r = -1

    # Initialize the rewards, costs, and walls
    rewards = [[0 for _ in range(w)] for _ in range(h)]
    costs = [[r for _ in range(w)] for _ in range(h)]
    walls = []

    for x, y, value in L:
        if value == 0:
            walls.append([y, x])  # Note the (x, y) to (row, col) conversion
        else:
            rewards[y][x] = value

    # Define success probabilities
    success_probabilities = {
        (0, 1): p,
        (0, -1): p,
        (1, 0): p,
        (-1, 0): p
    }

    # Create the GridGameMBRL instance
    game_mbrl = GridGameMBRL(h, w, rewards, success_probabilities, costs, walls)

    # Perform value iteration to find the best policy
    import time
    start_time = time.time()
    game_mbrl.value_iteration()
    end_time = time.time()

    # Evaluate the policy
    policy_score = game_mbrl.evaluate_policy(episodes=1000, epsilon=0.1)
    print("Policy Score:", policy_score)
    print(f"Value Iteration Time: {end_time - start_time:.4f} seconds")

    # Print the policy for visual comparison
    print("\nPolicy:")
    game_mbrl.print_policy()

if __name__ == "__main__":
    main()


Policy Score: -1726.545
Value Iteration Time: 0.0305 seconds

Policy:
← N N N N N N N N N N W
→ → → → → → → → → → → →
→ → → → → → → → → → → →
→ → → → → → → → → → → →
