<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/lab_03_rml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import gymnasium as gym

def epsilon_greedy(Q, s, nA, eps):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        Q: Action-value table indexed as ``Q[s]`` -> values of each action.
        s: Current state index.
        nA: Number of available actions.
        eps: Probability of taking a uniformly random action.

    Returns:
        A uniformly random action in ``[0, nA)`` with probability ``eps``;
        otherwise an action with the highest value in ``Q[s]``, with ties
        broken uniformly at random.
    """
    explore = np.random.rand() < eps
    if explore:
        action = np.random.randint(nA)
    else:
        action_values = Q[s]
        best_value = np.max(action_values)
        # Every action that attains the maximum is a candidate.
        tied_best = np.flatnonzero(action_values == best_value)
        action = np.random.choice(tied_best)
    return action

def greedy_policy(Q):
    """Return, for every state (row of ``Q``), the index of its best action.

    Ties resolve to the lowest action index, as ``np.argmax`` does.
    """
    best_actions = np.argmax(Q, axis=1)
    return best_actions

def evaluate(env, policy, episodes=1000):
    """Roll out a deterministic policy and summarise its performance.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        episodes: Number of episodes to run.

    Returns:
        ``(mean_return, success_rate)`` where an episode counts as a success
        when its undiscounted return is strictly positive.
    """
    return_sum = 0
    wins = 0
    for _ in range(episodes):
        state, _info = env.reset()
        episode_return = 0
        while True:
            state, reward, terminated, truncated, _info = env.step(policy[state])
            episode_return = episode_return + reward
            if terminated or truncated:
                break
        return_sum += episode_return
        wins += episode_return > 0
    return return_sum / episodes, wins / episodes


In [2]:
def td0(env, policy_probs, alpha=0.1, gamma=0.99, episodes=20000):
    """Estimate the state-value function of a fixed policy with tabular TD(0).

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, whose ``reset()`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy_probs: Indexable by state; ``policy_probs[s]`` is the action
            probability vector handed to ``np.random.choice``.
        alpha: Step size of the TD update.
        gamma: Discount factor.
        episodes: Number of episodes to sample.

    Returns:
        1-D array ``V`` of length ``observation_space.n`` with the value
        estimates.
    """
    nS = env.observation_space.n
    nA = env.action_space.n
    V = np.zeros(nS)
    for _ in range(episodes):
        s, _ = env.reset()
        done = False
        while not done:
            a = np.random.choice(nA, p=policy_probs[s])
            s2, r, terminated, truncated, _ = env.step(a)
            # The episode stops on either signal ...
            done = terminated or truncated
            # ... but only a genuine terminal state has zero future value.
            # A truncation (e.g. a step limit) just cuts the rollout short,
            # so the target must still bootstrap from V[s2].
            bootstrap = 0.0 if terminated else gamma * V[s2]
            V[s] += alpha * (r + bootstrap - V[s])
            s = s2
    return V



In [3]:
def sarsa(env, alpha=0.1, gamma=0.99, eps1=1.0, eps2=0.05, episodes=30000):
    """Learn an action-value table with on-policy tabular SARSA.

    Actions are chosen by ``epsilon_greedy`` with an exploration rate that
    decays linearly with the episode number and equals ``eps2`` on the final
    episode.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, whose ``reset()`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        alpha: Step size of the SARSA update.
        gamma: Discount factor.
        eps1: Upper end of the exploration schedule.
        eps2: Exploration rate reached on the last episode.
        episodes: Number of training episodes.

    Returns:
        Array ``Q`` of shape ``(observation_space.n, action_space.n)``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA))

    def eps_at(ep):
        # Linear interpolation between eps1 and eps2 over the training run.
        return eps2 + (eps1 - eps2) * (episodes - ep) / episodes

    for ep in range(1, episodes + 1):
        eps = eps_at(ep)
        s, _ = env.reset()
        a = epsilon_greedy(Q, s, nA, eps)
        done = False
        while not done:
            s2, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            if terminated:
                # True terminal state: nothing follows, so no bootstrap.
                a2 = None
                target = r
            else:
                # Also taken on truncation: a step limit ends the rollout but
                # s2 is still a live state, so the target must bootstrap from
                # the action the policy would take there.
                a2 = epsilon_greedy(Q, s2, nA, eps)
                target = r + gamma * Q[s2, a2]
            Q[s, a] += alpha * (target - Q[s, a])
            s, a = s2, a2
    return Q


In [4]:
def main(seed=None):
    """Run the lab: TD(0) evaluation of a random policy, then SARSA control.

    Creates the slippery 4x4 FrozenLake environment, prints the TD(0) value
    estimates of the uniform-random policy, trains SARSA, prints its greedy
    policy, and reports that policy's average return and success rate over
    5000 evaluation episodes.

    Args:
        seed: Optional integer. When given, seeds NumPy's global RNG (used for
            action sampling) and the environment so a run can be reproduced.
            ``None`` leaves both unseeded.
    """
    env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)
    try:
        if seed is not None:
            np.random.seed(seed)
            env.reset(seed=seed)

        nS, nA = env.observation_space.n, env.action_space.n

        # Uniform-random behaviour policy: every action equally likely.
        policy_probs = np.ones((nS, nA)) / nA
        V = td0(env, policy_probs)
        print("TD(0) Values:\n", V.reshape(4, 4))

        Q = sarsa(env)
        policy = greedy_policy(Q)
        print("\nSARSA Greedy Policy:\n", policy.reshape(4, 4))

        avg, succ = evaluate(env, policy, 5000)
        print(f"\nAvg return={avg:.3f}, Success={succ*100:.2f}%")
    finally:
        # Release the environment even if training or evaluation raises.
        env.close()

if __name__ == "__main__":
    main()


TD(0) Values:
 [[0.01128485 0.0052601  0.01131197 0.00414983]
 [0.01689198 0.         0.02325782 0.        ]
 [0.02464562 0.06178836 0.08466198 0.        ]
 [0.         0.12051186 0.3863904  0.        ]]

SARSA Greedy Policy:
 [[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]

Avg return=0.721, Success=72.08%
