In [36]:
import numpy as np

# Base configuration
discount_factor = 0.9  # Weight applied to the value of the successor cell
grid_rows, grid_cols = 3, 4  # Grid dimensions
target_cell = (0, 3)
danger_cell = (1, 3)
blocked_cell = (1, 1)
target_reward = 1
danger_penalty = -1
step_cost = 0  # Cost per step

# Initialise the value table and the reward table
state_values = np.zeros((grid_rows, grid_cols))
reward_grid = np.full((grid_rows, grid_cols), step_cost, dtype=float)
reward_grid[target_cell] = target_reward
reward_grid[danger_cell] = danger_penalty
# Blocked cell: explicit NaN marker (assigning None to a float array stores
# NaN as well, but only through an implicit conversion).
reward_grid[blocked_cell] = np.nan

# Move definitions, as (row delta, column delta)
moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Up, Down, Left, Right
move_symbols = {(-1, 0): '↑', (1, 0): '↓', (0, -1): '←', (0, 1): '→'}

# Value iteration algorithm
def compute_values(state_values, reward_grid, discount_factor, max_iterations=100):
    """Run value iteration on the grid and return the resulting state values.

    Each sweep sets the value of every non-fixed cell to the best value
    reachable in one move, i.e. the maximum over valid neighbours of
    ``reward_grid[neighbour] + discount_factor * state_values[neighbour]``.
    A neighbour is valid when it lies inside the grid and is not
    ``blocked_cell``. The cells ``target_cell``, ``danger_cell`` and
    ``blocked_cell`` (module-level names, as is ``moves``) are never updated.

    Args:
        state_values: 2-D numpy array of initial values; it is not modified.
        reward_grid: Array of the same shape holding the reward received on
            entering each cell.
        discount_factor: Multiplier applied to the neighbour's current value.
        max_iterations: Upper bound on the number of sweeps.

    Returns:
        The array of state values after the last sweep. When
        ``max_iterations`` is 0 the input array itself is returned.
    """
    # Take the bounds from the array actually passed in, so the function
    # stays correct for any grid size.
    n_rows, n_cols = state_values.shape
    # Terminal states and the obstacle keep their initial value.
    fixed_cells = (target_cell, danger_cell, blocked_cell)

    for _ in range(max_iterations):
        updated_values = np.copy(state_values)
        for i in range(n_rows):
            for j in range(n_cols):
                if (i, j) in fixed_cells:
                    continue
                # Only the values are compared; the move that produced the
                # best value is not needed here.
                candidates = [
                    reward_grid[ni, nj] + discount_factor * state_values[ni, nj]
                    for ni, nj in ((i + di, j + dj) for di, dj in moves)
                    if 0 <= ni < n_rows
                    and 0 <= nj < n_cols
                    and (ni, nj) != blocked_cell
                ]
                if candidates:
                    updated_values[i, j] = max(candidates)
        # A sweep that changes nothing is an exact fixed point: every later
        # sweep would reproduce it, so stopping here cannot alter the result.
        converged = np.array_equal(updated_values, state_values)
        state_values = updated_values
        if converged:
            break
    return state_values

# Optimal policy extraction
def derive_policy(state_values, reward_grid):
    """Build a grid of display symbols describing the greedy policy.

    The target, danger and blocked cells receive the fixed markers 'G',
    '🔥' and '█'. Every other cell receives the arrow of the move whose
    destination maximises
    ``reward_grid[dest] + discount_factor * state_values[dest]``; the first
    move in ``moves`` wins ties. Destinations outside the grid or equal to
    ``blocked_cell`` are ignored, and a cell with no usable move keeps ' '.

    Grid size, special cells, ``moves``, ``move_symbols`` and
    ``discount_factor`` are read from module-level names.

    Args:
        state_values: Array of state values indexed by (row, column).
        reward_grid: Array of rewards received on entering each cell.

    Returns:
        A numpy string array of shape (grid_rows, grid_cols).
    """
    policy = np.full((grid_rows, grid_cols), ' ', dtype=str)

    for cell in np.ndindex(grid_rows, grid_cols):
        # Special cells get a fixed marker and no arrow.
        if cell == target_cell:
            policy[cell] = 'G'
            continue
        if cell == danger_cell:
            policy[cell] = '🔥'
            continue
        if cell == blocked_cell:
            policy[cell] = '█'
            continue

        row, col = cell
        chosen, best_score = None, float('-inf')
        for step in moves:
            dest = (row + step[0], col + step[1])
            inside = 0 <= dest[0] < grid_rows and 0 <= dest[1] < grid_cols
            if not inside or dest == blocked_cell:
                continue
            score = reward_grid[dest] + discount_factor * state_values[dest]
            # Strict comparison keeps the earliest move on ties.
            if score > best_score:
                chosen, best_score = step, score

        if chosen:
            policy[cell] = move_symbols[chosen]

    return policy

# Run value iteration, then read the greedy policy off the resulting values
state_values = compute_values(state_values, reward_grid, discount_factor)
optimal_policy = derive_policy(state_values, reward_grid)

# Display the results: the policy grid (one line per row, symbols separated
# by spaces) followed by the table of state values
for row in optimal_policy:
    print(*row)

print(state_values)

→ → → G
↑ █ ↑ 🔥
↑ → ↑ ←
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]
