# Demo Frozen Lake Q-Learning Agent
 - Demos the Q-Learning off-policy RL agent for the Frozen Lake problem

#### Installs

In [1]:
# ! pip install gymnasium numpy matplotlib pyvirtualdisplay


#### Imports

In [2]:
import os, sys
from dataclasses import asdict

sys.path.insert(1, "../")

from src.main.rl_agents.sarsa_agent import SarsaAgent
from src.main.configs.q_learning_agent_configs import QLearningAgentConfig
import src.main.configs.global_configs as configs
from src.main.utility.utils import Helpers
from src.main.utility.chart_results import ChartResults


#### Define global configs and variables

#### Utility class of helper functions

In [4]:
class Helpers:
  """
  Utility class of helper functions
  """
  @staticmethod
  def animateEnvironment(images: List[Any]):
    """
    Animates a sequence of rendered environment frames inline in the notebook.

    Relies on the notebook-level names `plt`, `FuncAnimation`, `display`,
    `DPI` and `INTERVAL` being defined before it is called.

    :param images: Frames to play back in order; each frame must expose
                   `.shape` and be accepted by `plt.imshow`
                   (presumably RGB arrays from `env.render()` - confirm)
    :return: None; nothing is displayed when `images` is empty
    """
    # Nothing to animate: avoid an IndexError on images[0]
    if len(images) == 0:
      return

    first_frame = images[0]
    # Size the figure so that one frame pixel maps to one figure pixel
    fig = plt.figure(
        figsize=(first_frame.shape[1]/DPI, first_frame.shape[0]/DPI),
        dpi=DPI
        )
    patch = plt.imshow(first_frame)
    # Call plt.axis: assigning to it (as before) left the axes visible and
    # replaced the pyplot function for the rest of the session
    plt.axis('off')

    def animate(i):
      # Swap the displayed pixels for frame i
      patch.set_data(images[i])

    ani = FuncAnimation(
        fig,
        animate,
        frames=len(images),
        interval=INTERVAL)
    display.display(display.HTML(ani.to_jshtml()))
    # Close the figure so the notebook does not also show a static copy
    plt.close(fig)

#### Solution 4 Steps:
 - Step 1: Demonstrate the performance of a random agent
 - Step 2: Implement Q-learning RL (off-policy) agent
 - Step 3: Implement the RL training loop
 - Step 4: Implement the RL evaluation (animation) policy


##### Step 1: Demonstrate the performance of a random agent

##### Step 2: Q-learning RL (off-policy) implementation

In [5]:
class QLearningAgent:
    """
    Tabular Q-learning (off-policy TD control) agent with an epsilon-greedy
    behaviour policy.

    The Q-table has shape (n_states, n_actions) and is initialised to zeros.
    """
    def __init__(
        self,
        env,
        alpha=0.1,
        gamma=0.99,
        epsilon=1.0,
        epsilon_decay=0.995,
        min_epsilon=0.01
        ):
        """
        Constructor
        :param env: Environment exposing `observation_space.n`,
                    `action_space.n` and `action_space.sample()`
        :param alpha: Learning rate (step size of the TD update)
        :param gamma: Discount factor applied to the next-state value
        :param epsilon: Initial probability of taking a random action
        :param epsilon_decay: Multiplicative factor applied by `decay_epsilon`
        :param min_epsilon: Lower bound below which epsilon is never decayed
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.Q = np.zeros((n_states, n_actions))

        print("Frozen Lake environment creation..")
        print(f"Observation space: {n_states}")
        print(f"Action space: {n_actions}")
        # gamma is listed too: it was missing from the summary even though
        # it is one of the hyperparameters used by `update`
        print(f"""Q-learning hyperparameters are:
                  \nalpha: {self.alpha}
                  \ngamma: {self.gamma}
                  \nepsilon: {self.epsilon}
                  \nepsilon_decay: {self.epsilon_decay}
                  \nmin_epsilon: {self.min_epsilon}\n""")

    def choose_action(self, state):
        """
        Epsilon-greedy action selection.
        :param state: Index of the current state (row of the Q-table)
        :return: A sampled action with probability epsilon, otherwise the
                 arg-max action of `Q[state]` (first index wins on ties)
        """
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def update(self, s, a, r, s_next, done=False):
        """
        Apply one Q-learning update for the transition (s, a, r, s_next).
        :param s: State index before the action
        :param a: Action index taken
        :param r: Reward received
        :param s_next: State index after the action
        :param done: True when `s_next` is terminal; the target is then the
                     reward alone. Defaults to False, which keeps the
                     original always-bootstrap behaviour.
        """
        # A terminal transition has no future return to bootstrap from
        future_value = 0.0 if done else np.max(self.Q[s_next, :])
        td_target = r + self.gamma * future_value
        td_error = td_target - self.Q[s, a]
        self.Q[s, a] += self.alpha * td_error

    def decay_epsilon(self):
        """
        Multiply epsilon by `epsilon_decay`, never going below `min_epsilon`.
        """
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)


##### Step 3: Training loop

In [6]:
def train(agent, env, n_episodes=5000, max_steps=100, log_every=500):
    """
    Run the Q-learning training loop.

    :param agent: Object providing `choose_action(s)`,
                  `update(s, a, r, s_next)`, `decay_epsilon()` and a `Q` table
    :param env: Environment whose `reset()` returns `(state, info)` and whose
                `step(a)` returns
                `(state, reward, terminated, truncated, info)`
    :param n_episodes: Number of episodes to run
    :param max_steps: Maximum number of steps per episode
    :param log_every: Print the mean reward of the last `log_every` episodes
                      (and the Q-table) every `log_every` episodes; a falsy
                      value disables logging
    :return: List with the total reward collected in each episode
    """
    rewards = []
    for ep in range(n_episodes):
        s, _ = env.reset()                      # start new episode
        total_reward = 0

        for _ in range(max_steps):
            a = agent.choose_action(s)
            s_next, r, terminated, truncated, _ = env.step(a)
            agent.update(s, a, r, s_next)
            s = s_next

            total_reward += r
            # Stop on truncation as well as termination: after either
            # signal the environment has to be reset before stepping again
            if terminated or truncated:
                break

        # Epsilon is decayed once per episode, not once per step
        agent.decay_epsilon()
        rewards.append(total_reward)
        if log_every and (ep+1) % log_every == 0:
            print(f"Episode {ep+1}/{n_episodes}  Average Reward: {np.mean(rewards[-log_every:]):.3f}")
            print(f"\nQ: {agent.Q}\n")
    return rewards

# Build the slippery Frozen Lake environment (frames rendered as RGB arrays)
# and a fresh Q-learning agent bound to it
env = gym.make(
    "FrozenLake-v1",
    is_slippery=True,
    render_mode="rgb_array",
)
agent = QLearningAgent(env=env)

# Run training with the default settings and keep the per-episode rewards
training_rewards = train(agent=agent, env=env)


Frozen Lake environment creation..
Observation space: 16
Action space: 4
Q-learning hyperparameters are:
                  
alpha: 0.1
                  
epsilon: 1.0
                  
epsilon_decay: 0.995
                  
min_epsilon: 0.01

Episode 500/5000  Average Reward: 0.052

Q: [[5.04735017e-02 7.59574376e-02 5.81532423e-02 4.22135452e-02]
 [3.12405076e-02 5.01389442e-02 4.32351330e-02 9.13053396e-02]
 [1.12245806e-01 4.81018750e-02 3.77343726e-02 2.77865895e-02]
 [2.98164677e-05 1.27169496e-02 1.02458477e-03 3.26553452e-02]
 [7.33865790e-02 3.58944637e-02 4.99878207e-02 3.57057765e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.53949025e-01 3.33126510e-02 3.26730677e-02 6.21413701e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.10226676e-02 2.62173539e-02 6.81245527e-02 5.01887733e-02]
 [1.48494878e-02 7.23158292e-02 1.74374955e-01 3.82786225e-02]
 [2.38519910e-01 7.14120665e-02 9.11908948e-02 1.19831756e-02]
 [0.00000000e+00 0

##### Step 4: Implement the RL evaluation (animation) policy

In [7]:
def _grid_shape(agent, env):
    """
    Return (n_rows, n_cols) of the environment's grid map.

    Uses the `desc` map of the unwrapped environment when it exists;
    otherwise falls back to a square grid inferred from the number of
    states in the Q-table (or a single row if that is not a perfect square).
    """
    desc = getattr(getattr(env, "unwrapped", env), "desc", None)
    if desc is not None and len(getattr(desc, "shape", ())) == 2:
        return int(desc.shape[0]), int(desc.shape[1])

    n_states = agent.Q.shape[0]
    side = int(round(n_states ** 0.5))
    if side * side == n_states:
        return side, side
    return 1, n_states


def animate_policy(agent, env, fps=2):
    """
    Roll out the greedy policy of `agent` for one episode (at most 100 steps)
    and build a matplotlib animation of the agent's position on the grid.

    :param agent: Agent exposing a Q-table `Q` indexed as `Q[state]`
    :param env: Environment whose `reset()` returns `(state, info)` and whose
                `step(a)` returns
                `(state, reward, terminated, truncated, info)`
    :param fps: Animation frames per second
    :return: The `FuncAnimation` object (render it with e.g. `to_jshtml()`)
    """
    # Start virtual display (named so it does not shadow IPython's `display`)
    virtual_display = Display(visible=0, size=(400, 400))
    virtual_display.start()
    try:
        # The Q-table is (n_states, n_actions), so its shape is NOT the grid
        # shape; take the layout from the environment map instead
        n_rows, n_cols = _grid_shape(agent, env)

        s, _ = env.reset()
        states = [s]

        # Rollout under greedy policy
        for _ in range(100):
            a = np.argmax(agent.Q[s])
            s, _, terminated, truncated, _ = env.step(a)
            states.append(s)
            if terminated or truncated:
                break

        # Setup plot
        fig, ax = plt.subplots()
        ax.set_xlim(0, n_cols)
        ax.set_ylim(0, n_rows)
        agent_dot, = ax.plot([], [], 'ro', ms=20)

        def init():
            agent_dot.set_data([], [])
            return agent_dot,

        def update(frame):
            # Convert state index to (row, col); row 0 is drawn at the top
            row, col = divmod(int(states[frame]), n_cols)
            # set_data expects sequences, so wrap the single point in lists
            agent_dot.set_data([col + 0.5], [n_rows - row - 0.5])
            return agent_dot,

        anim = FuncAnimation(fig, update, init_func=init,
                             frames=len(states), interval=1000/fps, repeat=False)

        plt.close(fig)  # prevent static display
        return anim
    finally:
        # Always release the virtual display, even if the rollout fails
        virtual_display.stop()

# Generate and display the animation in, e.g., a Jupyter notebook
# anim = animate_policy(agent, env)
# display.display(HTML(anim.to_jshtml()))


In [8]:
class EvaluateAgent:
  """
  Evaluate the Q learning RL agent by rolling out its greedy policy and
  rendering every step of the simulation runs
  """
  def __init__(self, agent, env, n_episodes=2, max_steps=100):
    """
    Constructor
    :param agent: Agent exposing a Q-table `Q` indexed as `Q[state]`
    :param env: Environment providing `reset()`, `step(a)` and `render()`
    :param n_episodes: Number of evaluation episodes to roll out
    :param max_steps: Maximum number of steps per episode
    """
    self.agent = agent
    self.env = env
    self.n_episodes = n_episodes
    self.max_steps = max_steps
    # Every visited state, across all episodes, in visiting order
    self.states = []
    # Every non-None value returned by env.render(), in step order
    self.images = []

  def _evaluate(self):
    """
    Roll out the greedy policy for `n_episodes` episodes, recording the
    visited states and any frames returned by the environment's renderer
    """
    for _ in range(self.n_episodes):
      s, _ = self.env.reset()
      self.states.append(s)

      # Rollout under greedy policy
      for _ in range(self.max_steps):
        # Use the agent handed to the constructor, not a notebook global
        a = np.argmax(self.agent.Q[s])
        s, _, terminated, truncated, _ = self.env.step(a)
        frame = self.env.render()
        # render() may return nothing (presumably the case for
        # render_mode="human" - confirm); keep only real frames
        if frame is not None:
          self.images.append(frame)
        self.states.append(s)
        # Truncation also ends the episode: a reset is needed afterwards
        if terminated or truncated:
          break

  def run(self):
    """
    Run the RL evaluation
    """
    self._evaluate()







In [9]:
# Evaluate the trained agent in a window-rendered ("human") environment
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="human")
try:
    evaluate = EvaluateAgent(agent, env)
    evaluate.run()
finally:
    # Release the render window even if the evaluation fails
    env.close()