# Q-Learning Temporal Difference

In [1]:
import numpy as np
import random
import time


In [2]:
import gymnasium as gym
from IPython.display import clear_output

# Create the FrozenLake environment used for training: 4x4 map,
# is_slippery=False, text ("ansi") rendering.
# To experiment, switch render_mode to "human" and/or map_name to "8x8".
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="ansi",
)

In [3]:
# Hyperparameters and run settings.

# Names for the four action indices, in index order:
# 0 = move left, 1 = move down, 2 = move right, 3 = move up.
ACTIONS = [
    'left',
    'down',
    'right',
    'up',
]

# --- Upper-case settings -------------------------------------------------
# None of the cells shown in this notebook read these values.
EPSILON = 0.9       # epsilon of an epsilon-greedy policy
ALPHA = 0.1         # learning rate
GAMMA = 0.9         # discount factor
THETA = 1e-8        # convergence threshold
FRESH_TIME = 0.3    # pause per move, intended for time.sleep
MAX_EPISODES = 50   # maximum episodes

# --- Lower-case settings -------------------------------------------------
# These are the values the training / evaluation functions actually read.
num_episodes = 10_000           # training episodes
max_steps_per_episode = 100     # step cap per episode
learning_rate = 0.1             # weight of the new estimate in the Q update
discount_rate = 0.99            # discount applied to the next state's best Q
epsilon = 0.2                   # exploration threshold used when picking actions

In [4]:
# Build the starting Q-table: one row per state, one column per action,
# every entry zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# This is our Policy (AKA Model) (AKA Q Table)
def choose_action(env, state, q_table, exploration_rate=None):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]. When it is strictly greater than
    the exploration rate the greedy action (``np.argmax`` over
    ``q_table[state]``, which returns the lowest index on ties) is chosen;
    otherwise an action is sampled from ``env.action_space``.

    Args:
        env: Environment exposing ``action_space.sample()``.
        state: Row index into ``q_table``.
        q_table: 2-D array indexed as ``q_table[state, action]``.
        exploration_rate: Threshold compared against the uniform draw.
            ``None`` (the default) uses the notebook-level ``epsilon``, so
            existing three-argument calls behave exactly as before. Values
            >= 1 always explore; values < 0 always exploit.

    Returns:
        The selected action index.
    """
    # Resolve the global lazily so an explicit rate never needs `epsilon`.
    rate = epsilon if exploration_rate is None else exploration_rate

    # Exploration-exploitation trade-off
    if random.uniform(0, 1) > rate:
        return np.argmax(q_table[state, :])
    return env.action_space.sample()

In [6]:
def q_learning_TD(env, q_table, verbose=True):
    """Train ``q_table`` in place with one-step Q-learning (TD(0)).

    Runs ``num_episodes`` episodes of at most ``max_steps_per_episode`` steps.
    Each step picks an action with ``choose_action`` and applies

        Q(s, a) <- (1 - learning_rate) * Q(s, a)
                   + learning_rate * (reward + discount_rate * max_a' Q(s', a'))

    where the ``max`` term is taken as 0 when the step terminated the
    episode, so a terminal state never contributes a bootstrapped value.

    An episode that terminates with a positive reward is reported as a win.
    This replaces a hard-coded check for goal state 15 and gives the same
    result whenever a positive reward is only handed out on a terminating
    step.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(new_state, reward, terminated, truncated, info)``.
        q_table: 2-D array indexed as ``q_table[state, action]``; it is
            updated in place.
        verbose: When True (default) print the per-episode banner and outcome
            messages. Pass False to keep long training runs quiet.

    Returns:
        The same ``q_table`` object that was passed in.
    """
    for episode in range(num_episodes):
        state, info = env.reset()

        if verbose:
            print("\n*****EPISODE ", episode+1, "*****\n")

        for _ in range(max_steps_per_episode):
            action = choose_action(env, state, q_table)

            new_state, reward, terminated, truncated, info = env.step(action)

            # No future return exists past a terminal state, so do not
            # bootstrap from its Q-values. A truncated (time-limit) step still
            # bootstraps because the underlying task has not ended.
            best_next_value = 0.0 if terminated else np.max(q_table[new_state, :])

            # Update Q-table for Q(s,a)
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
                learning_rate * (reward + discount_rate * best_next_value)

            state = new_state

            # Win: the episode ended and paid out a positive reward.
            if terminated and reward > 0:
                if verbose:
                    print("You Won!!!")
                    print(f"Episode {episode + 1} ended with reward: {reward}")
                break

            # Any other ending (terminated without reward, or truncated).
            if terminated or truncated:
                if verbose:
                    print("GAME OVER --- Terminated!!!")
                    print(f"Episode {episode + 1} ended with reward: {reward}")
                break

    return q_table

In [7]:
# Run training (the table is updated in place and also returned),
# then release the training environment.
q_table = q_learning_TD(env=env, q_table=q_table)

env.close()


*****EPISODE  1 *****

GAME OVER --- Terminated!!!
Episode 1 ended with reward: 0.0

*****EPISODE  2 *****

GAME OVER --- Terminated!!!
Episode 2 ended with reward: 0.0

*****EPISODE  3 *****

GAME OVER --- Terminated!!!
Episode 3 ended with reward: 0.0

*****EPISODE  4 *****

GAME OVER --- Terminated!!!
Episode 4 ended with reward: 0.0

*****EPISODE  5 *****

GAME OVER --- Terminated!!!
Episode 5 ended with reward: 0.0

*****EPISODE  6 *****

GAME OVER --- Terminated!!!
Episode 6 ended with reward: 0.0

*****EPISODE  7 *****

GAME OVER --- Terminated!!!
Episode 7 ended with reward: 0.0

*****EPISODE  8 *****

GAME OVER --- Terminated!!!
Episode 8 ended with reward: 0.0

*****EPISODE  9 *****

GAME OVER --- Terminated!!!
Episode 9 ended with reward: 0.0

*****EPISODE  10 *****

GAME OVER --- Terminated!!!
Episode 10 ended with reward: 0.0

*****EPISODE  11 *****

GAME OVER --- Terminated!!!
Episode 11 ended with reward: 0.0

*****EPISODE  12 *****

GAME OVER --- Terminated!!!
Episode 

In [8]:
# Show the trained Q-table beneath a banner line.
print("\n\n********Q-table********\n", q_table, sep="\n")




********Q-table********

[[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.40300128 0.77537667]
 [0.73082205 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.83299225 0.         0.06190854]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.9801     0.        ]
 [0.86884979 0.99       0.         0.57178851]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]


In [9]:
def testModel(env, q_table):
    #Deploy Code
    actionLog = []
    for episode in range(1):
        state, info = env.reset()  # Fix: Unpack both state and info

        for step in range(max_steps_per_episode):
            clear_output(wait=True)
            env.render()
            

            action = np.argmax(q_table[state,:])
            actionLog.append(action)
            # Fix: Handle all five return values from step()
            new_state, reward, terminated, truncated, info = env.step(action)
            
            # time.sleep(0.3)

            # the goal position in the 4x4 map can be calculated as follows: 3 * 4 + 3 = 15.
            if reward and new_state==15:
                print("You Won!!!")
                env.close()
                break
            
            #Check for Termination
            if terminated or truncated:
                print("GAME OVER --- Terminated!!!")
                env.close()
                break            

            state = new_state

            print(f"Episode {episode} - Step {step} Given Action {action} I got reward {reward} and next state {new_state}")

    env.close()
    
    return actionLog



In [10]:
# Re-create FrozenLake with render_mode="human" (the training environment
# was already closed) and roll out the learned table once.
# Use map_name="8x8" here to try the larger map instead.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="human",
)

actionLog = testModel(env=env, q_table=q_table)


You Won!!!


In [None]:
# Number of actions recorded in the log returned by testModel,
# i.e. how many steps the greedy rollout took.
len(actionLog)

6

: 