<a href="https://colab.research.google.com/github/paviayyala/AIML-Lab/blob/main/RL_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# 🧠 Simple Q-Learning Example
# ------------------------------------------------------------
# Concept: The agent learns which DISCOUNT to offer based on DEMAND.
# States represent demand levels (Low, Medium, High).
# Actions represent possible discounts (0%, 10%, 20%, 30%).
# The goal: maximize the long-term reward (e.g., sales/profit).
# ============================================================

import numpy as np

# Format how numbers are printed (for cleaner tables)
np.set_printoptions(precision=2, suppress=True)

# ============================================================
# 1️⃣ DEFINE STATES AND ACTIONS
# ------------------------------------------------------------
# The agent can see one of three states (demand levels)
# and can choose one of four possible discounts.
# ============================================================

states = ["Low", "Medium", "High"]              # indices: 0, 1, 2
actions = ["0%", "10%", "20%", "30%"]           # indices: 0, 1, 2, 3

n_states = len(states)   # number of possible demand levels
n_actions = len(actions) # number of possible discount actions

# ============================================================
# 2️⃣ INITIALIZE Q-TABLE
# ------------------------------------------------------------
# Q[s, a] will store how "good" it is to take action a in state s.
# It starts with 0 because the agent knows nothing at first.
# ============================================================

Q = np.zeros((n_states, n_actions), dtype=float)

# ============================================================
# 3️⃣ DEFINE REWARD TABLE (ENVIRONMENT)
# ------------------------------------------------------------
# This simulates the environment feedback.
# Each cell represents the IMMEDIATE REWARD (e.g., profit or sales)
# if the agent takes a particular action in a particular state.
# Higher reward = better outcome.
# ============================================================

reward_table = np.array([
    # Actions: 0%   10%   20%   30%
    [10,   25,   45,   40],   # Low demand
    [30,   55,   65,   50],   # Medium demand
    [80,   75,   60,   45],   # High demand
], dtype=float)

# ============================================================
# 4️⃣ Q-LEARNING PARAMETERS (HYPERPARAMETERS)
# ------------------------------------------------------------
# α (alpha): Learning rate — how fast the agent updates old beliefs
# γ (gamma): Discount factor — how much future rewards matter
# ε (epsilon): Exploration rate — how often to try something new
# episodes: Number of training iterations (the more, the better)
# ============================================================

alpha = 0.1       # learning rate
gamma = 0.9       # discount factor
epsilon = 0.2     # exploration probability (20% explore, 80% exploit)
episodes = 500    # number of learning rounds
print_every = 100 # how often to print progress

# ============================================================
# 5️⃣ TRAINING LOOP (THE HEART OF Q-LEARNING)
# ------------------------------------------------------------
# In each episode:
#   1. Pick a random state (like "today's market condition").
#   2. Choose a discount action using ε-greedy policy.
#   3. Receive a reward (sales result).
#   4. Move to a new random state (simulated next day).
#   5. Update the Q-value using the Q-learning formula.
# ------------------------------------------------------------
# Formula:
# Q(s,a) ← Q(s,a) + α [r + γ * max(Q(s’,·)) − Q(s,a)]
# ============================================================

for ep in range(1, episodes + 1):
    # Pick a random initial state
    s = np.random.randint(n_states)

    # ------------------------------
    # Step 1: Choose an action
    # ------------------------------
    # With probability ε, explore (try a random discount)
    # Otherwise, exploit (choose the best-known discount)
    if np.random.rand() < epsilon:
        a = np.random.randint(n_actions)   # Explore: try something random
    else:
        a = np.argmax(Q[s])                # Exploit: choose best action so far

    # ------------------------------
    # Step 2: Get immediate reward
    # ------------------------------
    r = reward_table[s, a]                 # What we earned today (reward)

    # ------------------------------
    # Step 3: Simulate next state
    # ------------------------------
    # In real life, next state depends on environment changes.
    # Here we simplify by picking a random new demand level.
    s_next = np.random.randint(n_states)

    # ------------------------------
    # Step 4: Q-value update
    # ------------------------------
    best_next_q = np.max(Q[s_next])        # Best possible future value
    td_target = r + gamma * best_next_q    # Target value (reward + discounted future)
    td_error = td_target - Q[s, a]         # Difference between new & old estimate
    Q[s, a] += alpha * td_error            # Update the Q-value slightly

    # ------------------------------
    # Step 5: Print progress occasionally
    # ------------------------------
    if ep % print_every == 0 or ep == 1:
        print(f"Episode {ep:3d}: state={states[s]}, action={actions[a]}, reward={r:.1f}")
        print("Current Q-table:")
        print(Q)
        print("-" * 50)

# ============================================================
# 6️⃣ SHOW FINAL RESULTS
# ------------------------------------------------------------
# After training, the Q-table contains learned values.
# The highest Q-value in each row shows the best discount for that demand level.
# ============================================================

print("\n✅ Final Q-table (States as rows, Actions as columns):")
for i, st in enumerate(states):
    row = Q[i]
    row_display = ", ".join(f"{actions[j]}:{row[j]:.2f}" for j in range(n_actions))
    print(f"{st:6s} -> {row_display}")

# ============================================================
# 7️⃣ LEARNED POLICY (GREEDY STRATEGY)
# ------------------------------------------------------------
# The agent now simply picks the discount with the highest Q-value for each state.
# This represents what it has learned after many trials.
# ============================================================

policy = [actions[np.argmax(Q[i])] for i in range(n_states)]

print("\n🏁 Learned Optimal Policy (what the agent learned):")
for st, act in zip(states, policy):
    print(f"  When demand is {st:6s} → offer {act}")


Episode   1: state=High, action=0%, reward=80.0
Current Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [8. 0. 0. 0.]]
--------------------------------------------------
Episode 100: state=Medium, action=0%, reward=30.0
Current Q-table:
[[ 67.06  20.15   8.    27.27]
 [ 79.57  36.13   0.    19.95]
 [155.13  40.58  18.31   5.04]]
--------------------------------------------------
Episode 200: state=Low, action=10%, reward=25.0
Current Q-table:
[[128.39  42.88   8.    75.14]
 [169.25  36.13  31.38 102.29]
 [208.45  64.16  31.39  47.45]]
--------------------------------------------------
Episode 300: state=Medium, action=0%, reward=30.0
Current Q-table:
[[200.03  94.66  49.01  89.45]
 [220.24  85.49  54.49 122.85]
 [262.01  99.75  75.85  63.1 ]]
--------------------------------------------------
Episode 400: state=Low, action=0%, reward=10.0
Current Q-table:
[[242.39  94.66 121.42 125.44]
 [253.61 122.68  80.33 145.5 ]
 [300.43 135.12 114.08 115.83]]
----------------------------------------------