# SETUP

In [1]:
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import os
import torch

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# NOTE(review): none of the cells visible here read MODEL_PATH; train() and test() use the
# base name "ppo_frozenlake_model" instead, which does not match this value. Confirm which
# path is intended before relying on this constant.
MODEL_PATH = "ppo_frozenlake.zip"

In [2]:
def train(episodes=15000, slippery=False, model_path="ppo_frozenlake_model"):
    '''
    Train an agent on the FrozenLake 4x4 environment using PPO (Proximal Policy Optimization)
    and save the trained model to disk.

    Parameters
    ----------
    episodes : int
        Training budget. It is forwarded to `model.learn` as `total_timesteps`, so it is
        counted in environment steps rather than full episodes. The parameter name is kept
        for backward compatibility.
    slippery : bool
        Passed to the environment as `is_slippery`.
    model_path : str
        Destination handed to `model.save`. Defaults to the name the notebook has always used.

    Returns
    -------
    None
    '''
    # setup environment (a single FrozenLake instance wrapped as a vectorized env)
    env = make_vec_env(
        lambda: gym.make("FrozenLake-v1", map_name="4x4", is_slippery=slippery),
        n_envs=1
    )

    try:
        # setup, train, save model
        # apparently using the cpu is better here, so force it
        model = PPO("MlpPolicy", env, verbose=1, device='cpu')
        model.learn(total_timesteps=episodes)
        model.save(model_path)
    finally:
        # release the environment even if training or saving fails
        env.close()

In [3]:
'''
Test a trained PPO agent on the FrozenLake 4x4 environment.
Displays the environment if render=True.
'''
def test(episodes=1, render=False, slippery=False):
    # load model
    model = PPO.load("ppo_frozenlake_model")

    # setup environment
    env = gym.make(
        "FrozenLake-v1",
        map_name="4x4",
        is_slippery=slippery,
        render_mode='human' if render else None
    )

    
    rewards_per_episode = np.zeros(episodes)

    for i in range(episodes):
        obs, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(int(action))
            total_reward += reward
            done = terminated or truncated

        rewards_per_episode[i] = total_reward

    env.close()

# Non-slippery

In [None]:
# Train the model on the non-slippery map; PPO is created with verbose=1, so training
# stats are printed. Note that `episodes` is forwarded to model.learn as total_timesteps.
train(episodes=3000, slippery=False) 

In [None]:
# See how the agent does: run one rendered (render_mode='human') episode on the
# non-slippery map using the model saved by train().
test(episodes=1, render=True, slippery=False)  

# Slippery

In [None]:
# Retrain on the slippery map with a larger budget. train() saves under the same
# default name, so this overwrites the model saved by the non-slippery run.
train(episodes=15000, slippery=True) 

In [None]:
# Run one rendered (render_mode='human') episode on the slippery map using the saved model.
test(episodes=1, render=True, slippery=True)  