In [11]:
# -*- coding: utf-8 -*-
"""
Get Q-learning to work on the FrozenLake environment
"""

import gymnasium as gym

# Build the 4x4 FrozenLake environment with slipping disabled.
env = gym.make("FrozenLake-v1", desc=None, map_name="4x4", is_slippery=False)

# Sizes of the discrete observation and action spaces.
num_states = env.observation_space.n
num_actions = env.action_space.n

# Index ranges used to enumerate every state and every action.
states = range(num_states)
actions = range(num_actions)

# Tabular Q-function: one entry per (state, action) pair, every entry starts at 0.
q_table = {(s, a): 0 for s in states for a in actions}

In [24]:
import numpy as np

# Q-learning hyperparameters used by the training loop below.
n_episodes = 500_000  # number of training episodes
alpha = 0.001         # learning rate: scales the TD error in each Q update
gamma = 0.9           # discount factor applied to the next state's best Q-value
epsilon = 0.1         # probability of picking a random action instead of the greedy one

def epsilon_greedy_policy(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a random action is drawn from
    ``env.action_space``; otherwise the action with the largest entry in the
    module-level ``q_table`` for this state is returned. Ties go to the first
    such action in ``actions``, because ``max`` keeps the first maximum.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: look up Q(state, candidate) for every candidate action.
    greedy_action = max(actions, key=lambda candidate: q_table[(state, candidate)])
    return greedy_action

# Tabular Q-learning: play n_episodes episodes and update q_table after every step.
for i in range(n_episodes):
    state, info = env.reset()  # Start a fresh episode
    done = False
    while not done:
        action = epsilon_greedy_policy(state, epsilon)  # Epsilon-greedy action choice
        next_state, reward, terminated, truncated, info = env.step(action)

        # env.step returns two separate end-of-episode flags. The episode must
        # stop (and the environment be reset) when EITHER is set; looking only
        # at the first flag would keep stepping after a time-limit truncation.
        done = terminated or truncated

        # Q-learning target: reward plus the discounted best value of the next
        # state. A terminal state has no future return, so its value is 0; a
        # merely truncated episode still bootstraps from the next state.
        if terminated:
            best_next_value = 0.0
        else:
            best_next_value = max(q_table[(next_state, a)] for a in actions)
        td_target = reward + gamma * best_next_value
        td_error = td_target - q_table[(state, action)]
        q_table[(state, action)] += alpha * td_error

        # Move to the next state
        state = next_state
print(q_table)

{(0, 0): 0.5314275733489845, (0, 1): 0.5904899999997402, (0, 2): 0.4781174946200239, (0, 3): 0.5314286714323224, (1, 0): 0.5314227471323519, (1, 1): 0.0, (1, 2): 0.02230779810453289, (1, 3): 0.1250973156618463, (2, 0): 0.18157031906607565, (2, 1): 0.01098609352469124, (2, 2): 1.402872263834947e-06, (2, 3): 0.0011662018590399765, (3, 0): 0.000726102759814892, (3, 1): 0.0, (3, 2): 1.7215660428138127e-09, (3, 3): 0.0, (4, 0): 0.5904684489563743, (4, 1): 0.656099999999773, (4, 2): 0.0, (4, 3): 0.5314268088133346, (5, 0): 0, (5, 1): 0, (5, 2): 0, (5, 3): 0, (6, 0): 0.0, (6, 1): 0.8098345956293571, (6, 2): 0.0, (6, 3): 0.017924931889547375, (7, 0): 0, (7, 1): 0, (7, 2): 0, (7, 3): 0, (8, 0): 0.6560713423042682, (8, 1): 0.0, (8, 2): 0.7289999999998094, (8, 3): 0.5904645924875154, (9, 0): 0.6560481079239304, (9, 1): 0.8098731374467368, (9, 2): 0.8099999999998497, (9, 3): 0.0, (10, 0): 0.7289378265156317, (10, 1): 0.8999999999998947, (10, 2): 0.0, (10, 3): 0.727938078933294, (11, 0): 0, (11, 1)

In [25]:
#Define Agent with a class

class Agent:
    """Epsilon-greedy tabular agent that owns its hyperparameters and Q-table.

    The class reads the notebook-level names ``states``, ``actions`` and
    ``env``; they must exist before an instance is created or used.
    """

    def __init__(self):
        # Training hyperparameters.
        self.n_episodes = 500000  # number of training episodes
        self.alpha = 0.001        # learning rate
        self.gamma = 0.9          # discount factor
        self.epsilon = 0.1        # probability of a random action in policy()

        # Q-table with one entry per (state, action) pair.
        self.Q = self.initialize_Q()

    def initialize_Q(self):
        """Return a new dict mapping every (state, action) pair to 0."""
        return {(s, a): 0 for s in states for a in actions}

    def policy(self, state):
        """Return an epsilon-greedy action for ``state``.

        A random action is sampled from ``env.action_space`` with probability
        ``self.epsilon``; otherwise the action with the highest value in
        ``self.Q`` is chosen (the first one wins on ties).
        """
        explore = np.random.rand() < self.epsilon
        if explore:
            return env.action_space.sample()
        return max(actions, key=lambda candidate: self.Q[(state, candidate)])

# Create the agent; it carries its own hyperparameters and Q-table.
agent = Agent()

# Train using the agent's OWN settings (agent.n_episodes / agent.gamma /
# agent.alpha) rather than module-level variables left over from an earlier
# cell, so this cell does not depend on hidden notebook state.
for i in range(agent.n_episodes):
    state, info = env.reset()  # Start a fresh episode
    done = False
    while not done:
        action = agent.policy(state)  # Epsilon-greedy action choice
        next_state, reward, terminated, truncated, info = env.step(action)

        # Stop on either end-of-episode flag; ignoring `truncated` would keep
        # stepping an environment that should have been reset.
        done = terminated or truncated

        # Q-learning target: reward plus the discounted best value of the next
        # state. A terminal state contributes no future value.
        if terminated:
            best_next_value = 0.0
        else:
            best_next_value = max(agent.Q[(next_state, a)] for a in actions)
        td_target = reward + agent.gamma * best_next_value
        td_error = td_target - agent.Q[(state, action)]
        agent.Q[(state, action)] += agent.alpha * td_error

        # Move to the next state
        state = next_state
    

In [None]:
import numpy as np

class Agent:
    """Tabular epsilon-greedy agent for Q-learning.

    Parameters
    ----------
    states : iterable
        All state identifiers; iterated once to build the Q-table keys.
    actions : iterable
        All action identifiers. Must be re-iterable (e.g. ``range`` or a
        list) because it is scanned again on every greedy action selection.
    env : object
        Environment whose ``action_space.sample()`` supplies random actions.
    n_episodes : int, optional
        Number of training episodes (default 500000). Stored for the caller's
        training loop; the class itself does not run episodes.
    alpha : float, optional
        Learning rate (default 0.001). Stored for the caller's Q update.
    gamma : float, optional
        Discount factor (default 0.9). Stored for the caller's Q update.
    epsilon : float, optional
        Probability that ``policy`` returns a random action (default 0.1).

    Attributes
    ----------
    Q : dict
        Maps every ``(state, action)`` pair to its current value estimate,
        initialised to 0.
    """

    def __init__(self, states, actions, env,
                 n_episodes=500000, alpha=0.001, gamma=0.9, epsilon=0.1):
        # Hyperparameters are keyword arguments so experiments can vary them
        # without editing the class; the defaults match the original values.
        self.n_episodes = n_episodes
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.states = states
        self.actions = actions
        self.env = env

        self.Q = self.initialize_Q()

    def initialize_Q(self):
        """Return a new dict mapping every (state, action) pair to 0."""
        return {(s, a): 0 for s in self.states for a in self.actions}

    def policy(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``self.epsilon`` a random action is sampled from
        ``self.env.action_space``; otherwise the action with the highest
        value in ``self.Q`` is returned (the first one wins on ties, because
        ``max`` keeps the first maximum it sees).
        """
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()  # Explore: random action
        # Exploit: action with the highest Q-value for this state
        return max(self.actions, key=lambda a: self.Q[(state, a)])

# `env`, `states`, and `actions` are created in the environment-setup cell at
# the top of the notebook; run that cell before this one.

# Create the agent from the notebook's state/action ranges and environment.
agent = Agent(states, actions, env)

# Tabular Q-learning driven entirely by the agent's own settings.
for i in range(agent.n_episodes):
    state, info = agent.env.reset()  # Start a fresh episode
    done = False
    while not done:
        action = agent.policy(state)  # Epsilon-greedy action choice
        next_state, reward, terminated, truncated, info = agent.env.step(action)

        # Stop on either end-of-episode flag; ignoring `truncated` would keep
        # stepping an environment that should have been reset.
        done = terminated or truncated

        # Q-learning target: reward plus the discounted best value of the next
        # state. A terminal state contributes no future value.
        if terminated:
            best_next_value = 0.0
        else:
            best_next_value = max(agent.Q[(next_state, a)] for a in agent.actions)
        td_target = reward + agent.gamma * best_next_value
        td_error = td_target - agent.Q[(state, action)]
        agent.Q[(state, action)] += agent.alpha * td_error

        # Move to the next state
        state = next_state
