# Phase1:

In [23]:
import gym
import numpy as np
import random

In [14]:
env = gym.make("FrozenLake-v0")

# 2. Identify the state space (observation_space) size and the action space size
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# 3. Initialize the Q-Table (observation_space_size, action_space_size)
qtable = np.zeros((state_space_size, action_space_size))

# 4. Initialize the hyper-parameters
learning_rate = 0.1
discount_factor = 0.5
exploration_rate = 0.1

epochs_list = [1000, 5000, 10000]

In [15]:
# Function to choose an action based on the Q-Table
def choose_action(state):
    if np.random.uniform(0, 1) < exploration_rate:
        # Exploration: choose a random action
        action = env.action_space.sample()
    else:
        # Exploitation: choose the action with the highest Q-value
        action = np.argmax(qtable[state, :])
    return action

In [16]:
# Function to train the model
def train_model(epochs):
    rewards_per_epoch = []
    steps_per_epoch = []

    for epoch in range(epochs + 1):
        state = env.reset()
        done = False
        total_rewards = 0
        total_steps = 0

        while not done:
            # Choose an action and perform it
            action = choose_action(state)
            new_state, reward, done, _ = env.step(action)

            # Update Q-Table
            qtable[state, action] += learning_rate * (reward + discount_factor * np.max(qtable[new_state, :]) - qtable[state, action])

            # Update state, rewards, and steps
            state = new_state
            total_rewards += reward
            total_steps += 1

        rewards_per_epoch.append(total_rewards)
        steps_per_epoch.append(total_steps)

        # Print rewards and steps based on the epochs_list
        if epochs == 1000 and epoch % 100 == 0:
            print("Episode {}: Total Rewards: {} Steps: {}".format(epoch, total_rewards, total_steps))
        elif epochs == 5000 and epoch % 500 == 0:
             print("Episode {}: Total Rewards: {} Steps: {}".format(epoch, total_rewards, total_steps))
        elif epochs == 10000 and epoch % 1000 == 0:
             print("Episode {}: Total Rewards: {} Steps: {}".format(epoch, total_rewards, total_steps))

    return rewards_per_epoch, steps_per_epoch


In [22]:

# 5. Training the model for different epochs settings
for epochs in epochs_list:
    print("-------------------------------------------------------")
    print(f"Training with {epochs} epochs")
    rewards, steps = train_model(epochs)

    

# Function to evaluate the agent
def evaluate_agent(discount_factor):
    num_episodes = 10
    average_rewards = 0
    average_steps = 0

    for _ in range(num_episodes):
        state = env.reset()
        done = False
        total_rewards = 0
        total_steps = 0

        while not done:
            # Choose the action with the highest Q-value
            action = np.argmax(qtable[state, :])
            new_state, reward, done, _ = env.step(action)

            # Update rewards and steps
            total_rewards += reward
            total_steps += 1

            # Update state
            state = new_state

        average_rewards += total_rewards
        average_steps += total_steps

    average_rewards /= num_episodes
    average_steps /= num_episodes

    return average_rewards, average_steps




-------------------------------------------------------
Training with 1000 epochs
Episode 0: Total Rewards: 0.0 Steps: 33
Episode 100: Total Rewards: 0.0 Steps: 4
Episode 200: Total Rewards: 1.0 Steps: 20
Episode 300: Total Rewards: 0.0 Steps: 21
Episode 400: Total Rewards: 0.0 Steps: 16
Episode 500: Total Rewards: 1.0 Steps: 35
Episode 600: Total Rewards: 1.0 Steps: 13
Episode 700: Total Rewards: 0.0 Steps: 41
Episode 800: Total Rewards: 0.0 Steps: 6
Episode 900: Total Rewards: 0.0 Steps: 24
Episode 1000: Total Rewards: 1.0 Steps: 19
-------------------------------------------------------
Training with 5000 epochs
Episode 0: Total Rewards: 0.0 Steps: 27
Episode 500: Total Rewards: 0.0 Steps: 5
Episode 1000: Total Rewards: 1.0 Steps: 48
Episode 1500: Total Rewards: 0.0 Steps: 22
Episode 2000: Total Rewards: 0.0 Steps: 25
Episode 2500: Total Rewards: 0.0 Steps: 43
Episode 3000: Total Rewards: 0.0 Steps: 10
Episode 3500: Total Rewards: 0.0 Steps: 17
Episode 4000: Total Rewards: 0.0 Steps

# Phase 2

In [29]:

def train_agent(epochs, discount_factor):
    env = gym.make("FrozenLake-v0")
    max_steps = env.spec.max_episode_steps

    gamma = discount_factor
    learning_rate = 0.1
    exploration_rate = 0.1

    qtable = np.zeros((env.observation_space.n, env.action_space.n))

    rewards = []

    # For each episode
    for episode in range(epochs + 1):
        state = env.reset()
        done = False
        total_rewards = 0

        for step in range(max_steps):
            # Choose an action a in the current world state (s)
            # First, we randomize a number
            exp_exp_tradeoff = random.uniform(0, 1)

            # If this number > epsilon, exploitation (taking the biggest Q value for this state)
            if exp_exp_tradeoff > exploration_rate:
                action = np.argmax(qtable[state, :])
            else:
                # Else, doing a random choice, exploration
                action = env.action_space.sample()

            # Take the action (a) and observe the outcome state (s') and reward (r)
            new_state, reward, done, info = env.step(action)

            # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

            total_rewards += reward

            # Our new state is the state
            state = new_state

            # If done (if we're dead or reached the goal): finish the episode
            if done:
                break

        # Reduce epsilon (because we need less and less exploration)
        rewards.append(total_rewards)

    return qtable, rewards

def evaluate_agent(qtable, discount_factor, num_episodes=10):
    env = gym.make("FrozenLake-v0")
    max_steps = env.spec.max_episode_steps

    avg_rewards = 0
    avg_steps = 0

    for episode in range(num_episodes):
        state = env.reset()
        done = False

        for step in range(max_steps):
            action = np.argmax(qtable[state, :])
            new_state, reward, done, info = env.step(action)

            if done:
                break
            state = new_state

        avg_rewards += reward
        avg_steps += step

    avg_rewards /= num_episodes
    avg_steps /= num_episodes
    

    return avg_rewards, avg_steps

# Define discount factors to test
discount_factors = [0.3, 0.5, 0.9]
epochs = 10000
num_eval_episodes = 10

for discount_factor in discount_factors:
    print("Training with Discount Factor =", discount_factor)
    qtable, _ = train_agent(epochs, discount_factor)
    avg_rewards, avg_steps = evaluate_agent(qtable, discount_factor, num_eval_episodes)
    print("Average Rewards:", avg_rewards)
    print("Average Steps", avg_steps)


Training with Discount Factor = 0.3
Average Rewards: 0.0
Average Steps 13.1
Training with Discount Factor = 0.5
Average Rewards: 0.0
Average Steps 13.0
Training with Discount Factor = 0.9
Average Rewards: 0.3
Average Steps 42.9


In [31]:
qtable

array([[0.04057935, 0.04054891, 0.04083077, 0.04038407],
       [0.03003663, 0.0334785 , 0.03749905, 0.04192252],
       [0.05824787, 0.03901053, 0.04124201, 0.04031733],
       [0.02293675, 0.01303342, 0.01276868, 0.01414479],
       [0.05748962, 0.04973843, 0.04891829, 0.03818435],
       [0.        , 0.        , 0.        , 0.        ],
       [0.09522533, 0.06306597, 0.06714113, 0.02887922],
       [0.        , 0.        , 0.        , 0.        ],
       [0.05233883, 0.07164165, 0.05308914, 0.13992977],
       [0.11794504, 0.23068462, 0.14633922, 0.08322399],
       [0.23129986, 0.14590723, 0.17801205, 0.15350813],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.23484866, 0.21399452, 0.36152637, 0.21331401],
       [0.39543523, 0.40393747, 0.40672782, 0.50181291],
       [0.        , 0.        , 0.        , 0.        ]])

In [30]:
from IPython.display import clear_output
from time import sleep

lengths = []

for tripnum in range(1, 11):
    state = env.reset()
    done = False
    trip_length = 0

    while not done and trip_length < 25:
        action = np.argmax(qtable[state, :])
        next_state, reward, done, info = env.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(env.render(mode='ansi'))
        sleep(.2)
        state = next_state
        trip_length += 1

    lengths.append(trip_length)
    sleep(.2)

avg_len = sum(lengths) / 10
print(avg_len)


Trip number 10 Step 24
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

20.6
