<a href="https://colab.research.google.com/github/osama-kheshaifaty/SPE-KSA-WORKSHOP-2025/blob/main/reinforecement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Simple Reinforcement Learning Example: Balancing a Pole (CartPole)
# -------------------------------------------------------------------

# Step 1: Install libraries if missing
# Uncomment if running for the first time
# !pip install gym

# Step 2: Import Required Libraries
import gym
import numpy as np
import random

# Step 3: Create the Environment
# ----------------------------------------------------------
# CartPole: A pole is attached to a cart.
# Goal: Move the cart left or right to keep the pole balanced.

env = gym.make("CartPole-v1")

# Step 4: Define the Q-Learning Agent
# ----------------------------------------------------------
# Tabular Q-learning stores one value per (state, action) pair and
# refines those values from experience.

# Note:
# - CartPole observations are continuous (positions, velocities), so
#   they cannot index a table directly.
# - To keep things simple, every observation component is mapped onto
#   a small, fixed number of buckets (**discretization**).

n_actions = env.action_space.n  # 2 actions: left (0), right (1)
n_buckets = (1, 1, 6, 12)  # bucket count per observation component

# One (low, high) pair per observation component, as reported by the
# environment ...
state_bounds = [
    (low, high)
    for low, high in zip(env.observation_space.low, env.observation_space.high)
]
# ... except for the two rate components, which get narrow hand-picked ranges.
state_bounds[1] = [-0.5, 0.5]  # cart velocity
state_bounds[3] = [-np.radians(50), np.radians(50)]  # pole angle rate

# Q-table: one axis per discretized component plus a final action axis,
# with every entry starting at zero.
q_table = np.zeros((*n_buckets, n_actions))

# Step 5: Define Helper Functions
# ----------------------------------------------------------

def discretize_state(state, bounds=None, buckets=None):
    """Convert a continuous state into a tuple of discrete bucket indices.

    Each component is rescaled linearly so that its lower bound maps to
    bucket 0 and its upper bound maps to the last bucket, rounded to the
    nearest bucket and clamped into the valid index range.

    Args:
        state: Sequence of continuous values, one per state component.
        bounds: Optional sequence of ``(low, high)`` pairs, one per
            component. Defaults to the module-level ``state_bounds``.
        buckets: Optional sequence with the number of buckets for each
            component. Defaults to the module-level ``n_buckets``.

    Returns:
        A tuple of ints, one bucket index per component of ``state``.
    """
    if bounds is None:
        bounds = state_bounds
    if buckets is None:
        buckets = n_buckets

    indices = []
    for i, value in enumerate(state):
        low, high = bounds[i]
        last_bucket = buckets[i] - 1
        # Offset by the lower bound itself rather than by its absolute
        # value, so that ranges with a positive lower bound map correctly.
        ratio = (value - low) / (high - low)
        index = int(round(last_bucket * ratio))
        # Values outside [low, high] fall into the first or last bucket.
        indices.append(min(last_bucket, max(0, index)))
    return tuple(indices)

def choose_action(state, epsilon):
    """Pick an action with an epsilon-greedy rule.

    A uniform random draw below ``epsilon`` selects a random action sampled
    from the environment's action space; otherwise the action with the
    highest Q-value for ``state`` is returned.
    """
    explore = random.random() < epsilon
    if not explore:
        # Exploit: highest-valued action currently stored for this state.
        return np.argmax(q_table[state])
    # Explore: let the environment sample an action.
    return env.action_space.sample()

def update_q(state, action, reward, new_state, alpha, gamma):
    """Apply one Q-learning update to ``q_table`` for a single transition.

    The entry for ``(state, action)`` is moved a fraction ``alpha`` of the
    way towards ``reward + gamma * max(Q[new_state])``. The table is
    modified in place and nothing is returned.
    """
    cell = state + (action,)
    old_value = q_table[cell]
    # Target: immediate reward plus the discounted best value of the next state.
    target = reward + gamma * np.max(q_table[new_state])
    q_table[cell] = old_value + alpha * (target - old_value)

# Step 6: Set Training Parameters
# ----------------------------------------------------------
# Hyperparameters for the training loop below. epsilon is the only one that
# changes during training: it is multiplied by epsilon_decay after every
# episode and never drops below epsilon_min.
n_episodes = 500  # number of training episodes (full games) to play
alpha = 0.1       # learning rate: fraction of the update error applied per step
gamma = 0.99      # discount factor: weight given to the best next-state value
epsilon = 1.0     # initial exploration rate (probability of a random action)
epsilon_min = 0.01  # lower limit for the exploration rate
epsilon_decay = 0.995  # per-episode multiplicative decay applied to epsilon

# Step 7: Train the Agent
# ----------------------------------------------------------
# Total reward collected in each training episode, in episode order.
rewards = []

for episode in range(n_episodes):
    current_state = discretize_state(env.reset()[0])
    total_reward = 0
    done = False

    while not done:
        action = choose_action(current_state, epsilon)
        # step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; looking only at
        # `terminated` would keep stepping past the time limit.
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        new_state = discretize_state(obs)

        # After a true termination there is no future return, so the update
        # must not bootstrap from the next state's bucket: use a discount
        # of 0 for that transition. A time-limit truncation is not a
        # failure, so it still bootstraps normally.
        step_gamma = 0.0 if terminated else gamma
        update_q(current_state, action, reward, new_state, alpha, step_gamma)

        current_state = new_state
        total_reward += reward

    epsilon = max(epsilon_min, epsilon * epsilon_decay)  # gradually explore less
    rewards.append(total_reward)

    # Progress report every 50 episodes.
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode + 1}: Average Reward (last 50) = {np.mean(rewards[-50:]):.2f}")

# Step 8: Evaluate the Trained Agent
# ----------------------------------------------------------
# After training, let's see how well it performs without exploration.

done = False
total_reward = 0

print("\nWatching the trained agent...")

# Run one purely greedy episode. The try/finally guarantees the environment
# is closed even if the rollout raises.
try:
    state = discretize_state(env.reset()[0])

    while not done:
        # NOTE(review): `env` is created without a render_mode; depending on
        # the installed Gym version this call may only emit a warning
        # instead of drawing anything -- confirm.
        env.render()
        action = np.argmax(q_table[state])  # always exploit
        # The episode ends on termination OR time-limit truncation;
        # ignoring `truncated` would keep stepping past the time limit.
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = discretize_state(obs)
        total_reward += reward
finally:
    env.close()

print(f"Total reward achieved: {total_reward}")

# -----------------------------------------------------------------------------------
# End of Simple Reinforcement Learning Example
# -----------------------------------------------------------------------------------
