In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
!pip install gym
!pip install pygame

In [7]:
import gym
import pygame
import numpy as np

# Initialize pygame (this must be done before using pygame functions)
pygame.init()

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)
env.reset()

# Define colors
WHITE = (255, 255, 255)  # Frozen tiles
BLUE = (0, 0, 255)       # Start tile
BLACK = (0, 0, 0)        # Hole tile
GREEN = (0, 255, 0)      # Goal tile
RED = (255, 0, 0)        # Player's position

# Set up window size
tile_size = 50  # Size of each tile
grid = env.unwrapped.desc  # Get the grid layout
env_size = grid.shape[0]  # Determine the grid size dynamically
window_size = env_size * tile_size

# Create pygame window (after initialization)
screen = pygame.display.set_mode((window_size, window_size))

# Function to draw the environment
def draw_env(env, screen):
    screen.fill(WHITE)  # Fill the screen with white
    grid = env.unwrapped.desc  # Get the grid layout of FrozenLake

    for i in range(grid.shape[0]):  # Use the shape of the grid to determine size
        for j in range(grid.shape[1]):
            tile = grid[i, j].decode('utf-8')  # Get tile type
            rect = pygame.Rect(j * tile_size, i * tile_size, tile_size, tile_size)

            # Color tiles based on type
            if tile == 'S':
                pygame.draw.rect(screen, BLUE, rect)  # Start
            elif tile == 'F':
                pygame.draw.rect(screen, WHITE, rect)  # Frozen
            elif tile == 'H':
                pygame.draw.rect(screen, BLACK, rect)  # Hole
            elif tile == 'G':
                pygame.draw.rect(screen, GREEN, rect)  # Goal

    # Draw the player's position
    player_row, player_col = np.unravel_index(env.s, grid.shape)
    player_rect = pygame.Rect(player_col * tile_size, player_row * tile_size, tile_size, tile_size)
    pygame.draw.rect(screen, RED, player_rect)

# Main loop to render the environment and play random actions
MAX_ITERATIONS = 10
rewards = []
done = False

# Reset environment before starting the game
env.reset()

# Start the rendering loop
for i in range(MAX_ITERATIONS):
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            done = True
            break

    # Take a random action
    random_action = env.action_space.sample()
    new_state, reward, done, info, probs = env.step(random_action)

    # Render the environment using pygame
    draw_env(env, screen)

    # Update the pygame display
    pygame.display.flip()

    # Add a delay (in milliseconds) to slow down the rendering
    pygame.time.delay(500)  # Delay for 500 milliseconds (0.5 seconds)

    rewards.append(reward)

    if done:
        print(f"Game ended in iteration {i+1}.")
        break

# Ensure pygame.quit() is called **after** the loop
pygame.quit()

# Output the accumulated rewards
print("Rewards:", rewards)


Game ended in iteration 2.
Rewards: [0.0, 0.0]


In [3]:
random.uniform(0,1)

0.6484833473059604

In [4]:
import numpy as np
import random

# Simple environment function y = f(x1, x2)
def environment(x1, x2):
    # Example of a simple quadratic function
    return -((x1 - 5) ** 2 + (x2 - 3) ** 2) + 10

# Initialize state-action Q-table
q_table = np.zeros((11, 11, 4))  # State space (for x1 and x2), action space (4 actions)

# Define action space
actions = ['increase_x1', 'decrease_x1', 'increase_x2', 'decrease_x2']

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.99  # Decay factor for exploration rate
min_epsilon = 0.01  # Minimum epsilon
episodes = 1000  # Number of episodes

# Track the agent's position in the environment
x1, x2 = 5, 5  # Start in the middle of the grid

# Q-learning algorithm
for episode in range(episodes):
    # Reset the environment for each episode
    x1, x2 = random.randint(0, 10), random.randint(0, 10)

    for step in range(100):  # Maximum steps per episode
        # Choose an action (epsilon-greedy policy)
        if random.uniform(0, 1) < epsilon:
            action_index = random.randint(0, 3)  # Explore: Random action
        else:
            action_index = np.argmax(q_table[x1, x2])  # Exploit: Best action

        # Take the action and update the state
        if actions[action_index] == 'increase_x1':
            x1_new = min(x1 + 1, 10)
            x2_new = x2
        elif actions[action_index] == 'decrease_x1':
            x1_new = max(x1 - 1, 0)
            x2_new = x2
        elif actions[action_index] == 'increase_x2':
            x1_new = x1
            x2_new = min(x2 + 1, 10)
        elif actions[action_index] == 'decrease_x2':
            x1_new = x1
            x2_new = max(x2 - 1, 0)

        # Calculate the reward (change in y)
        y_current = environment(x1, x2)
        y_new = environment(x1_new, x2_new)
        reward = y_new - y_current  # Positive if y increases, negative if it decreases

        # Update Q-table using the Bellman equation
        best_future_q = np.max(q_table[x1_new, x2_new])
        q_table[x1, x2, action_index] = (1 - alpha) * q_table[x1, x2, action_index] + \
            alpha * (reward + gamma * best_future_q)

        # Move to the new state
        x1, x2 = x1_new, x2_new

        # If we've reached an optimal point (based on reward), stop the episode early
        if reward == 0:
            break

    # Decay the exploration rate
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Display learned Q-table
print(q_table)


[[[ 7.64719439e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 4.95595760e+00  1.96028250e-01  0.00000000e+00 -5.00000000e-01]
  [ 1.06130199e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 9.00694455e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 9.24343632e+00  2.09846672e-01  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  2.70000000e-02  0.00000000e+00  3.46628235e+00]
  [ 8.04518268e+00  0.00000000e+00 -5.92011312e-01  0.00000000e+00]
  [ 7.42184217e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 1.39228448e+01  0.00000000e+00  0.00000000e+00  9.00000000e-01]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  7.55306689e+00]
  [ 9.96324932e+00  0.00000000e+00  2.25180000e-01  0.00000000e+00]]

 [[ 1.26019814e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 1.16902219e+01  0.00000000e+00  6.97395000e-01  0.00000000e+00]
  [ 1.23109720e+01 -9.00000000e-01  2.71000000e-01  3.63298133e-01]
  [ 1.05719915e+01  0.00000000e+00  0.00000000

In [5]:
import numpy as np
import random

# Known heuristic: If x1 increases, y generally increases, but only when x2 is in the range [3, 7].
# We can model this in the reward function and exploration strategy.

# Simple environment function y = f(x1, x2) with some known heuristic behavior
def environment(x1, x2):
    return -((x1 - 5) ** 2 + (x2 - 3) ** 2) + 10  # This is a simple quadratic function

# Known heuristic: A function that estimates whether an action is likely to be beneficial
def heuristic(x1, x2, action):
    if action == 'increase_x1' and 3 <= x2 <= 7:
        return 0.5  # Positive heuristic (more likely to be beneficial)
    elif action == 'decrease_x1' and (x2 < 3 or x2 > 7):
        return 0.2  # Minor positive heuristic
    elif action == 'increase_x2':
        return 0.3  # Some likelihood of improvement
    else:
        return -0.1  # Negative heuristic, not likely to improve

# Initialize state-action Q-table
q_table = np.zeros((11, 11, 4))  # State space (for x1 and x2), action space (4 actions)

# Define action space
actions = ['increase_x1', 'decrease_x1', 'increase_x2', 'decrease_x2']

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.99  # Decay factor for exploration rate
min_epsilon = 0.01  # Minimum epsilon
episodes = 1000  # Number of episodes

# Track the agent's position in the environment
x1, x2 = 5, 5  # Start in the middle of the grid

# Q-learning algorithm with heuristic guidance
for episode in range(episodes):
    # Reset the environment for each episode
    x1, x2 = random.randint(0, 10), random.randint(0, 10)

    for step in range(100):  # Maximum steps per episode
        # Choose an action (epsilon-greedy policy with heuristic bias)
        if random.uniform(0, 1) < epsilon:
            action_index = random.randint(0, 3)  # Explore: Random action
        else:
            # Heuristic-based bias: Adjust Q-values based on the heuristic before selecting an action
            heuristic_values = np.array([heuristic(x1, x2, action) for action in actions])
            q_values_with_heuristic = q_table[x1, x2] + heuristic_values
            action_index = np.argmax(q_values_with_heuristic)  # Exploit: Best action with heuristic bias

        # Take the action and update the state
        if actions[action_index] == 'increase_x1':
            x1_new = min(x1 + 1, 10)
            x2_new = x2
        elif actions[action_index] == 'decrease_x1':
            x1_new = max(x1 - 1, 0)
            x2_new = x2
        elif actions[action_index] == 'increase_x2':
            x1_new = x1
            x2_new = min(x2 + 1, 10)
        elif actions[action_index] == 'decrease_x2':
            x1_new = x1
            x2_new = max(x2 - 1, 0)

        # Calculate the reward (change in y)
        y_current = environment(x1, x2)
        y_new = environment(x1_new, x2_new)
        reward = y_new - y_current  # Positive if y increases, negative if it decreases

        # Update Q-table using the Bellman equation
        best_future_q = np.max(q_table[x1_new, x2_new])
        q_table[x1, x2, action_index] = (1 - alpha) * q_table[x1, x2, action_index] + \
            alpha * (reward + heuristic(x1, x2, actions[action_index]) + gamma * best_future_q)

        # Move to the new state
        x1, x2 = x1_new, x2_new

        # If we've reached an optimal point (based on reward), stop the episode early
        if reward == 0:
            break

    # Decay the exploration rate
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Display learned Q-table
print(q_table)


[[[ 0.00000000e+00  0.00000000e+00  4.45948533e+00 -1.00000000e-02]
  [ 1.14402491e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  2.00000000e-02  9.44601389e+00  0.00000000e+00]
  [ 2.22156941e+01  0.00000000e+00  2.94500000e-02 -1.76257654e-01]
  [ 1.19183954e+01  0.00000000e+00 -2.04345000e-01  9.00000000e-02]
  [ 8.31100423e+00  2.21705000e-01 -8.07500000e-01  3.75500000e-01]
  [ 8.31804012e+00  0.00000000e+00  0.00000000e+00  5.75500000e-01]
  [ 1.11237908e+01 -1.90000000e-02  0.00000000e+00  0.00000000e+00]
  [ 2.42898926e+01  0.00000000e+00 -1.06820000e+00  0.00000000e+00]
  [ 1.86096849e+00  2.00000000e-02 -1.18990000e+00  1.66820972e+01]
  [ 8.90000000e-01  0.00000000e+00  0.00000000e+00  1.11357407e+01]]

 [[ 5.96178024e+00  0.00000000e+00  5.65533191e-01  0.00000000e+00]
  [ 7.60209612e-01  0.00000000e+00  1.13911478e+01 -8.61091812e-01]
  [ 7.20146977e-01  0.00000000e+00  1.53389148e+01 -2.74466809e-01]
  [ 1.74042892e+01 -2.23585238e+00  7.06949911