<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/lab_05_rml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Implementing Q-Learning for discrete action space problems -e.g., Ta Reinforcement

In [1]:
import gymnasium as gym
import numpy as np

# Environment
env = gym.make("Taxi-v3")
nS, nA = env.observation_space.n, env.action_space.n
Q = np.zeros((nS, nA))

# Hyperparameters
episodes = 20000
gamma, alpha = 0.99, 0.1
eps, eps_min, eps_decay = 1.0, 0.01, 0.999

# Training
all_rewards = []
for ep in range(1, episodes + 1):
    s, _ = env.reset()
    done, total_r = False, 0
    while not done:
        a = env.action_space.sample() if np.random.rand() < eps else np.argmax(Q[s])
        s2, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        Q[s, a] += alpha * (r + gamma * np.max(Q[s2]) - Q[s, a])
        s = s2
        total_r += r
    eps = max(eps_min, eps * eps_decay)
    all_rewards.append(total_r)

    # Progress log
    if ep % 2000 == 0:
        avg_r = np.mean(all_rewards[-2000:])
        print(f"Episode {ep}, Avg Reward (last 2000): {avg_r:.2f}, ε={eps:.3f}")

# Evaluation
def evaluate(n_eval=100):
    total, successes = 0, 0
    for _ in range(n_eval):
        s, _ = env.reset()
        done, ep_r = False, 0
        while not done:
            a = np.argmax(Q[s])
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            ep_r += r
        total += ep_r
        if ep_r > 0:  # success in Taxi
            successes += 1
    return total / n_eval, successes / n_eval * 100

avg_return, success_rate = evaluate()
print("\n=== Taxi-v3 Evaluation ===")
print(f"Average Return: {avg_return:.2f}")
print(f"Success Rate: {success_rate:.1f}%")
print("\nSample Q-table (first 5 states):\n", Q[:5])


Episode 2000, Avg Reward (last 2000): -183.83, ε=0.135
Episode 4000, Avg Reward (last 2000): 4.87, ε=0.018
Episode 6000, Avg Reward (last 2000): 7.25, ε=0.010
Episode 8000, Avg Reward (last 2000): 7.44, ε=0.010
Episode 10000, Avg Reward (last 2000): 7.42, ε=0.010
Episode 12000, Avg Reward (last 2000): 7.36, ε=0.010
Episode 14000, Avg Reward (last 2000): 7.36, ε=0.010
Episode 16000, Avg Reward (last 2000): 7.41, ε=0.010
Episode 18000, Avg Reward (last 2000): 7.40, ε=0.010
Episode 20000, Avg Reward (last 2000): 7.36, ε=0.010

=== Taxi-v3 Evaluation ===
Average Return: 7.99
Success Rate: 100.0%

Sample Q-table (first 5 states):
 [[  0.           0.           0.           0.           0.
    0.        ]
 [  2.43515925   2.65239587   2.00028705   0.66096351   9.6220697
   -4.24779593]
 [  7.29318517   6.64161177   3.20039978   8.18530935  14.11880599
    0.10818399]
 [  4.24362962   2.95679373  -2.37878975   1.87810232  10.72936333
   -4.12537617]
 [  1.13164151  -7.80703145  -7.76261824  -

In [2]:
import gymnasium as gym
import numpy as np

# Environment
env = gym.make("FrozenLake-v1", is_slippery=False)  # change to True for stochastic env
nS, nA = env.observation_space.n, env.action_space.n
Q = np.zeros((nS, nA))

# Hyperparameters
episodes = 10000
gamma, alpha = 0.99, 0.1
eps, eps_min, eps_decay = 1.0, 0.01, 0.995

# Training
all_rewards = []
for ep in range(1, episodes + 1):
    s, _ = env.reset()
    done, total_r = False, 0
    while not done:
        a = env.action_space.sample() if np.random.rand() < eps else np.argmax(Q[s])
        s2, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        Q[s, a] += alpha * (r + gamma * np.max(Q[s2]) - Q[s, a])
        s = s2
        total_r += r
    eps = max(eps_min, eps * eps_decay)
    all_rewards.append(total_r)

    # Progress log
    if ep % 1000 == 0:
        avg_r = np.mean(all_rewards[-1000:])
        print(f"Episode {ep}, Avg Reward (last 1000): {avg_r:.2f}, ε={eps:.3f}")

# Evaluation
def evaluate(n_eval=100):
    successes = 0
    for _ in range(n_eval):
        s, _ = env.reset()
        done = False
        while not done:
            a = np.argmax(Q[s])
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            if r == 1:
                successes += 1
    return successes / n_eval * 100

success_rate = evaluate()
print("\n=== FrozenLake-v1 Evaluation ===")
print(f"Success Rate: {success_rate:.1f}%")
print("\nSample Q-table (first 5 states):\n", Q[:5])


Episode 1000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 2000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 3000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 4000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 5000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 6000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 7000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 8000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 9000, Avg Reward (last 1000): 0.00, ε=0.010
Episode 10000, Avg Reward (last 1000): 0.00, ε=0.010

=== FrozenLake-v1 Evaluation ===
Success Rate: 0.0%

Sample Q-table (first 5 states):
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
