In [35]:
# !pip install "gymnasium[toy-text]"

import gymnasium as gym
import numpy as np
import random
from collections import defaultdict

def train_q_learning(env_name: str,
                     num_episodes: int = 2000,
                     max_steps_per_episode: int = 100,
                     learning_rate: float = 0.1,
                     discount_factor: float = 0.99,
                     epsilon: float = 1.0,
                     min_epsilon: float = 0.01,
                     epsilon_decay: float = 0.995):
    """
    Train a Q-learning agent on the specified Gymnasium environment.
    Returns the learned Q-table and a list of episode rewards.
    """
    # Create environment
    env = gym.make(env_name, is_slippery=False)  # deterministic version
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Initialize Q-table (state x action) to zeros
    Q = np.zeros((n_states, n_actions), dtype=float)

    # Track rewards for plotting / analysis
    episode_rewards = []

    for ep in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0.0

        for step in range(max_steps_per_episode):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(Q[state]))

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Q-learning update
            best_next = np.max(Q[next_state])
            td_target = reward + discount_factor * best_next
            td_error = td_target - Q[state, action]
            Q[state, action] += learning_rate * td_error

            state = next_state
            total_reward += reward

            if done:
                break

        # Decay epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        episode_rewards.append(total_reward)

        # Optional: print progress every 100 episodes
        if (ep + 1) % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {ep+1:4d} | Avg Reward (last 100): {avg_reward:.2f} | Epsilon: {epsilon:.3f}")

    env.close()
    return Q, episode_rewards

def evaluate_policy(env_name: str, Q: np.ndarray, num_episodes: int = 100, max_steps: int = 100):
    """
    Evaluate a learned Q-table by running the greedy policy over several episodes.
    Returns the average total reward.
    """
    env = gym.make(env_name, is_slippery=False)
    total_rewards = []

    for _ in range(num_episodes):
        state, _ = env.reset()
        ep_reward = 0.0

        for _ in range(max_steps):
            action = int(np.argmax(Q[state]))
            state, reward, terminated, truncated, _ = env.step(action)
            ep_reward += reward
            if terminated or truncated:
                break

        total_rewards.append(ep_reward)

    env.close()
    avg_reward = np.mean(total_rewards)
    print(f"Evaluation over {num_episodes} episodes: Average Reward = {avg_reward:.2f}")
    return avg_reward

if __name__ == "__main__":
    ENV_NAME = "FrozenLake-v1"
    # 1) Train
    Q_table, rewards = train_q_learning(
        env_name=ENV_NAME,
        num_episodes=2000,
        max_steps_per_episode=100,
        learning_rate=0.1,
        discount_factor=0.99,
        epsilon=1.0,
        min_epsilon=0.01,
        epsilon_decay=0.995
    )

    # 2) Evaluate
    evaluate_policy(ENV_NAME, Q_table, num_episodes=500, max_steps=100)

    # 3) (Optional) Inspect learned Q-values
    print("\nLearned Q-table (first 5 states):")
    print(Q_table[:5])


Episode  100 | Avg Reward (last 100): 0.00 | Epsilon: 0.606
Episode  200 | Avg Reward (last 100): 0.00 | Epsilon: 0.367
Episode  300 | Avg Reward (last 100): 0.00 | Epsilon: 0.222
Episode  400 | Avg Reward (last 100): 0.00 | Epsilon: 0.135
Episode  500 | Avg Reward (last 100): 0.00 | Epsilon: 0.082
Episode  600 | Avg Reward (last 100): 0.00 | Epsilon: 0.049
Episode  700 | Avg Reward (last 100): 0.00 | Epsilon: 0.030
Episode  800 | Avg Reward (last 100): 0.00 | Epsilon: 0.018
Episode  900 | Avg Reward (last 100): 0.00 | Epsilon: 0.011
Episode 1000 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1100 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1200 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1300 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1400 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1500 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1600 | Avg Reward (last 100): 0.00 | Epsilon: 0.010
Episode 1700 | Avg Reward (last 100): 0.

In [36]:
import gymnasium as gym
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# ——— Increase notebook embed limit to 50 MB ———
import matplotlib as mpl
mpl.rcParams['animation.embed_limit'] = 50  # in MB

def train_q_learning(env_name: str,
                     num_episodes: int = 2000,
                     max_steps_per_episode: int = 100,
                     learning_rate: float = 0.1,
                     discount_factor: float = 0.99,
                     epsilon: float = 1.0,
                     min_epsilon: float = 0.01,
                     epsilon_decay: float = 0.995):
    """
    Train a Q-learning agent and return the Q-table.
    """
    env = gym.make(env_name, is_slippery=False)
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions), dtype=float)

    for ep in range(num_episodes):
        state, _ = env.reset()
        for _ in range(max_steps_per_episode):
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(Q[state]))

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            td_target = reward + discount_factor * np.max(Q[next_state])
            Q[state, action] += learning_rate * (td_target - Q[state, action])

            state = next_state
            if done:
                break

        epsilon = max(min_epsilon, epsilon * epsilon_decay)

    env.close()
    return Q

def animate_episodes(env_name: str,
                     Q: np.ndarray,
                     num_episodes: int = 3,
                     max_steps: int = 100,
                     interval: int = 300):
    """
    Run `num_episodes` back-to-back, collect all frames, and build a FuncAnimation.
    Returns the FuncAnimation (display via HTML(anim.to_jshtml())).
    """
    env = gym.make(env_name, render_mode="rgb_array", is_slippery=False)
    all_frames = []

    for ep in range(num_episodes):
        state, _ = env.reset()
        for _ in range(max_steps):
            all_frames.append(env.render())
            action = int(np.argmax(Q[state]))
            state, _, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                all_frames.append(env.render())
                break

    env.close()

    # Optionally sample down frames to every 2nd frame:
    # all_frames = all_frames[::2]

    # Build the animation with a smaller figure size
    fig = plt.figure(figsize=(4, 4))
    im = plt.imshow(all_frames[0])
    plt.axis('off')

    def update(i):
        im.set_array(all_frames[i])
        return (im,)

    ani = animation.FuncAnimation(
        fig,
        update,
        frames=len(all_frames),
        interval=interval,
        blit=True
    )
    plt.close(fig)
    return ani

if __name__ == "__main__":
    ENV = "FrozenLake-v1"

    # 1) Train
    Q_table = train_q_learning(
        ENV,
        num_episodes=2000,
        max_steps_per_episode=100,
        learning_rate=0.1,
        discount_factor=0.99,
        epsilon=1.0,
        min_epsilon=0.01,
        epsilon_decay=0.995
    )

    # 2) Animate multiple episodes
    ani = animate_episodes(
        ENV,
        Q_table,
        num_episodes=5,    # how many episodes to stitch together
        max_steps=100,
        interval=300
    )

    # 3) In a Jupyter notebook:
    #    from IPython.display import HTML
    #    HTML(ani.to_jshtml())

    # 4) As a standalone script, save to MP4 instead:
    #    ani.save("multiple_eps.mp4", fps=1000/300)


In [37]:
from IPython.display import HTML
ani = animate_episodes("FrozenLake-v1", Q_table, num_episodes=1)
HTML(ani.to_jshtml())
