# CSC 380 Artificial Intelligence I (Fall 2025)
# HW\#2 FrozenLake Example Code

In [1]:
# First import gymnasium (https://gymnasium.farama.org/)
!pip install gymnasium

Defaulting to user installation because normal site-packages is not writeable


## Create an environment

In [5]:
import gymnasium as gym
import numpy as np 

# Create a FrozenLake 8x8 environment using Gymnasium
# (https://gymnasium.farama.org/environments/toy_text/frozen_lake/).
env = gym.make('FrozenLake-v1', desc=None, map_name="8x8", is_slippery=True, render_mode="ansi")

# This line is critically needed to access the state and the probabilities 
# in the environment due to the recent code update by Gymnasium.
env_unwrapped = env.unwrapped

# Reset the environment and display it (in ansi ascii)
env.reset()
print (env.render())  # wrap render() in print()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG



In [None]:
## Alternatively you can create a random map, as described in 
## the FrozenLake documentation page.  It's commented out for now.

#from gymnasium.envs.toy_text.frozen_lake import generate_random_map

#env = gym.make('FrozenLake-v1', desc=generate_random_map(size=8))

In [6]:
# Make one (random) action
action = env.action_space.sample()
print (action)
env.step(action)
print (env.render())

3
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG



In [7]:
print (env_unwrapped.s)

0


## Inspect the environment

In [9]:
nS = int(env.observation_space.n)    # number of states -- 8x8=64
nA = int(env.action_space.n)         # number of actions -- four directions; 0:left, 1:down, 2:right, 3:up
print (f"number of states: {nS}\nnumber of actions: {nA}")

number of states: 64
number of actions: 4


Note that actions are 0-based integers.  You can check in the Gymnasium source code (https://github.com/Farama-Foundation/Gymnasium/blob/d71a13588266256a4c900b5e0d72d10785816c3a/gymnasium/envs/toy_text/frozen_lake.py)

- 0: Move left
- 1: Move down
- 2: Move right
- 3: Move up

### All of the environment's probabilities are stored in 'env_unwrapped.P'.  
It is a dictionary, keyed by the state index e.g. env_unwrapped.P[0], env_unwrapped.P[1] etc.)

Then for each state, the value is a dictionary, keyed by the actions (0-based).  Then for each action, the value is a list, showing the **probability of transitioning into the next state, the index of the next state, reward, and True/False** (done=True if the next state is a Hole or the Goal).

In [10]:
# Probabilies from State 0
env_unwrapped.P[0]

{0: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 8, 0.0, False)],
 1: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 8, 0.0, False),
  (0.3333333333333333, 1, 0.0, False)],
 2: [(0.3333333333333333, 8, 0.0, False),
  (0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)],
 3: [(0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)]}

### Note on the environment (from the Gymnasium page)
is_slippery=True: If true the player will move in intended direction with probability of 1/3 else will move in either perpendicular direction with equal probability of 1/3 in both directions.

For example, if action is left and is_slippery is True, then:

- P(move left)=1/3
- P(move up)=1/3
- P(move down)=1/3

In [11]:
# Probabilies from State 62 (left of the Goal state).  
# Notice some 'True' results (implying the goal is reached).
env_unwrapped.P[62]

{0: [(0.3333333333333333, 54, 0.0, True),
  (0.3333333333333333, 61, 0.0, False),
  (0.3333333333333333, 62, 0.0, False)],
 1: [(0.3333333333333333, 61, 0.0, False),
  (0.3333333333333333, 62, 0.0, False),
  (0.3333333333333333, 63, 1.0, True)],
 2: [(0.3333333333333333, 62, 0.0, False),
  (0.3333333333333333, 63, 1.0, True),
  (0.3333333333333333, 54, 0.0, True)],
 3: [(0.3333333333333333, 63, 1.0, True),
  (0.3333333333333333, 54, 0.0, True),
  (0.3333333333333333, 61, 0.0, False)]}

# ** Functions to create a fixed deterministic policy, and to run one experiment for a given number of episodes

In [15]:
def generate_random_policy(num_actions, num_states, seed=None):
    """
    A policy is a 1D array of length # of states, where each element is a
    number between 0 (inclusive) and # of actions (exclusive) randomly chosen.
    If a specific seed is passed, the same numbers are genereated, while
    if the seed is None, the numbers are unpredictable every time.
    """
    rng = np.random.default_rng(seed)
    return rng.integers(low=0, high=num_actions, size=num_states)


def run_oneexperiment(env, policy, num_episodes, display=False):
    """
    Run one experiment, when agent follows a policy, for a given number of episodes.
    """    
    # Count the number of goals made and getting stuck in a hole
    goals = 0
    holes = 0
    # Total rewards and steps
    total_rewards = 0
    total_goal_steps = 0
    
    for _ in range(num_episodes):
        # For each time,
        env.reset()
        done = False
        rewards = 0
        steps = 0

        if display:
            episode = [(env.env_unwrapped.s)] # initial state (in a tuple)

        while not done:
            # choose the action based on the policy
            state = env.s
            action = policy[state]

            # take the action
            next_state, reward, done, info, p = env.step(action)
            steps += 1

            # extend the episode
            if display:
                episode.append(tuple([action,next_state]))
            # accumulate rewards
            rewards += reward
        
        # Calculate stats
        total_rewards += rewards
        if reward == 1.0: # Goal, or env.s == 63
            goals += 1
            total_goal_steps += steps
        else:
            holes += 1
            
        # Display
        if display:
            print (env.render())
            
    # One experiment finished,
    return goals, holes, total_rewards, total_goal_steps

### * A utility function to display a 1D array/policy in a 2D array/grid *

In [13]:
import math

def display_policy(policy):
    side = int(math.sqrt(nS))  # assuming a square
    policy = policy.reshape((side, side))
    return policy

## One experiment run

In [14]:
policy = generate_random_policy(nA, nS, 17) # change seed to a specific number, or None (default)
print ("*** Policy ***\n{}".format(display_policy(policy)))

num_episodes = 10000

goals, holes, total_rewards, total_goal_steps \
    = run_oneexperiment(env, policy, num_episodes)

percent_goal = goals / num_episodes
percent_hole = holes / num_episodes
mean_reward = total_rewards / num_episodes
mean_goal_steps = 0.0 if (goals == 0) else (total_goal_steps / goals)

print ("\n*** RESULTS ***:\nGoals: {:>5d}/{} = {:>7.3%}\nHoles: {:>5d}/{} = {:>7.3%}"
       .format(goals, num_episodes, percent_goal, holes, num_episodes, percent_hole))
print("mean reward:          {:.5f}\nmean goal steps:     {:.2f}".format(mean_reward, mean_goal_steps))

*** Policy ***
[[2 3 0 0 1 2 3 1]
 [0 0 1 1 3 1 2 2]
 [0 2 0 0 2 1 1 2]
 [0 0 2 3 2 3 1 0]
 [2 2 2 2 1 0 2 0]
 [1 0 2 1 1 2 0 1]
 [2 3 0 3 1 1 3 0]
 [0 1 2 3 1 1 3 3]]


AttributeError: 'TimeLimit' object has no attribute 's'

In [11]:
env.reset()

for _ in range(5):
    action = env.action_space.sample()
    print (action)
    env.step(action)
    print (env.render())

1
  (Down)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

3
  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

0
  (Left)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

0
  (Left)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

1
  (Down)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

