In [1]:
import numpy as np
import random

In [2]:
# ----- ENVIRONMENT -----

# A 1-D chain of 5 states, numbered 0 to 4.
n_states = 5

# The goal is the last state of the chain (state 4). It is derived from
# n_states so the two values cannot drift apart when the chain is resized.
goal_state = n_states - 1

# Actions: 0 = move left, 1 = move right
n_actions = 2

In [3]:
# Rewards
# Reaching goal gives +10, otherwise 0
def get_reward(state, action, goal=None):
    """Return the reward for taking `action` in `state`.

    Moving right (action 1) from the state immediately left of the goal pays
    +10; every other (state, action) pair pays 0.

    Args:
        state: Index of the current state.
        action: 0 = move left, 1 = move right.
        goal: Index of the goal state. When omitted, the module-level
            `goal_state` is used, so two-argument calls keep rewarding the
            move right from state 3 while the goal is state 4.

    Returns:
        10 for the move that enters the goal, otherwise 0.
    """
    if goal is None:
        # Looked up at call time so the function follows the environment cell.
        goal = goal_state
    if state == goal - 1 and action == 1:
        return 10  # moving right from the state next to the goal
    return 0

# Next state logic
def get_next_state(state, action, last_state=None):
    """Return the state reached by taking `action` in `state`.

    The chain is bounded: moving left from state 0 stays at 0, and moving
    right from the last state stays at the last state.

    Args:
        state: Index of the current state.
        action: 0 moves left; any other value is treated as "move right".
        last_state: Index of the right-most state. When omitted it is
            `n_states - 1`, read from the module-level `n_states`, so
            two-argument calls keep clamping at 4 while n_states is 5.

    Returns:
        The index of the next state.
    """
    if last_state is None:
        # Looked up at call time so the bound follows the environment cell.
        last_state = n_states - 1
    if action == 0:  # left
        return max(0, state - 1)
    # right
    return min(last_state, state + 1)

In [4]:
# ----- Q-LEARNING PARAMETERS -----
alpha = 0.1        # learning rate: how far each update moves a Q-value towards its new estimate
gamma = 0.9        # discount factor: how much the agent cares about future rewards vs immediate rewards
epsilon = 0.2      # exploration chance: probability of picking a random action instead of the best-known one
episodes = 200     # how many times the agent repeats the learning process
seed = 42          # fixed seed so a fresh "Restart & Run All" reproduces the same training run

# The training loop draws its exploration decisions from the `random` module,
# so that is the generator that has to be seeded.
random.seed(seed)

In [5]:
# Q-table: one row per state and one column per action. Each entry holds the
# current value estimate for that (state, action) pair and starts at zero.
Q = np.zeros(shape=(n_states, n_actions), dtype=float)

In [6]:
# ----- TRAINING LOOP -----
for ep in range(episodes):
    # Every episode restarts from state 0.
    state = 0

    # An episode ends as soon as the agent stands on the goal state.
    while state != goal_state:

        # Epsilon-greedy choice: with probability epsilon try a random action,
        # otherwise take the action with the highest current Q-value.
        if random.random() < epsilon:
            action = random.randrange(n_actions)  # explore
        else:
            # exploit; np.argmax returns the first maximum, so ties pick action 0
            action = np.argmax(Q[state])

        # Observe the reward for this move and the state it leads to.
        reward = get_reward(state, action)
        next_state = get_next_state(state, action)

        # Q-learning update (the LEARNING step): shift Q[state, action] a
        # fraction alpha of the way towards reward + discounted best next value.
        td_target = reward + gamma * np.max(Q[next_state])
        td_error = td_target - Q[state, action]
        Q[state, action] += alpha * td_error

        # Continue the episode from where the action landed.
        state = next_state

print("Training completed!")
print("Final Q-Table:")
print(Q)

Training completed!
Final Q-Table:
[[5.53074977 7.28999726]
 [5.12627537 8.09999953]
 [6.15857297 8.99999993]
 [7.35433108 9.99999999]
 [0.         0.        ]]


In [7]:
# ----- TEST THE LEARNED POLICY -----
state = 0  # starting position of the agent
path = [state]  # every state the agent visits, in order

# The greedy policy and the transitions are both deterministic, so a walk that
# takes n_states steps without reaching the goal has revisited a state and
# would repeat forever. Bounding the walk keeps a badly trained Q-table from
# hanging the cell.
max_steps = n_states

while state != goal_state and len(path) <= max_steps:
    # The key part: pick the best-known action for the current state.
    action = np.argmax(Q[state])
    state = get_next_state(state, action)
    path.append(state)  # record the state that was reached

if state == goal_state:
    print("Learned path:", path)
else:
    print("Greedy policy did not reach the goal within", max_steps, "steps:", path)

Learned path: [0, 1, 2, 3, 4]
