In [2]:
!pip install gymnasium numpy



In [4]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random
import os
import time

In [5]:
def clear_screen():
    os.system('cls' if os.name == 'nt' else 'clear')

In [17]:
class MazeEnv(gym.Env):
  def __init__(self):
    super(MazeEnv,self).__init__()
    self.maze = np.array([
        [2,0,0,1],
        [1,1,0,0],
        [0,0,1,0],
        [0,0,0,3]
    ])
    self.action_space = spaces.Discrete(4)
    self.start_pos = (0,0)
    self.goal_pos = (3,3)
    self.current_pos = self.start_pos

  def reset(self,seed=None):
    self.current_pos=self.start_pos
    return self.current_pos ,{}

  def step(self,action):
    moves = [(-1,0),(0,1),(1,0),(0,-1)]
    new_x = self.current_pos[0]+moves[action][0]
    new_y = self.current_pos[1]+moves[action][1]

    if(0<= new_x<4 and 0<=new_y<4 and self.maze[new_x,new_y]!=1):
      self.current_pos = (new_x,new_y)

    done = self.current_pos == self.goal_pos
    reward = 1 if done else -0.01

    return self.current_pos,reward,done,False,{}

  def render(self):
    clear_screen()
    for i in range(4):
      row = ""
      for j in range(4):
        if(i,j)==self.current_pos:
          row+="A"
        elif(self.maze[i,j]==1):
          row+="#"
        elif(self.maze[i,j]==2):
          row+="S"
        elif(self.maze[i,j]==3):
          row+="G"
        else:
          row+=" "
      print(row)
    time.sleep(0.1)


In [18]:
class Agent:
  def __init__(self,env):
    self.env = env
    self.q_table = np.zeros((4,4,4))

  def get_action(self,state,epsilon=0.1):
    if random.random()<epsilon:
      return self.env.action_space.sample()
    return np.argmax(self.q_table[state[0],state[1]])

  def train(self,episodes=100):
    for episode in range(episodes):
      state,_ = self.env.reset()
      done = False

      if episode&10==0:
        print(f"Episode {episode}")
        self.env.render()

      while not done:
        action = self.get_action(state)
        next_state,reward,done,_,_ = self.env.step(action)

        old_q = self.q_table[state[0],state[1],action]
        next_max = np.max(self.q_table[next_state[0],next_state[1]])
        new_q = old_q+0.1*(reward+0.9*next_max-old_q)
        self.q_table[state[0],state[1],action]=new_q

        if episode%10==0:
          self.env.render()

        state = next_state


In [19]:
def main():
  env = MazeEnv()
  agent = Agent(env)

  print("=== Starting training ===")
  agent.train(episodes=100)

  print("=== Starting testing ===")
  state,_ = env.reset()
  env.render()
  done = False
  steps = 0

  while not done and steps<20:
    action = np.argmax(agent.q_table[state[0],state[1]])
    state ,reward ,done, _ , _ = env.step(action)
    env.render()
    steps+=1

  print("===finished===")

In [20]:
if __name__ == "__main__":
  main()

=== Starting training ===
Episode 0
A  #
##  
  # 
   G
A  #
##  
  # 
   G
SA #
##  
  # 
   G
SA #
##  
  # 
   G
S A#
##  
  # 
   G
S A#
##  
  # 
   G
S A#
##  
  # 
   G
S  #
##A 
  # 
   G
S A#
##  
  # 
   G
SA #
##  
  # 
   G
SA #
##  
  # 
   G
A  #
##  
  # 
   G
A  #
##  
  # 
   G
A  #
##  
  # 
   G
A  #
##  
  # 
   G
SA #
##  
  # 
   G
SA #
##  
  # 
   G
S A#
##  
  # 
   G
S A#
##  
  # 
   G
S A#
##  
  # 
   G
S  #
##A 
  # 
   G
S  #
## A
  # 
   G
S  #
## A
  # 
   G
S  #
## A
  # 
   G
S  #
##  
  #A
   G
S  #
##  
  # 
   A
Episode 1
A  #
##  
  # 
   G
Episode 4
A  #
##  
  # 
   G
Episode 5
A  #
##  
  # 
   G
SA #
##  
  # 
   G
S A#
##  
  # 
   G
S  #
##A 
  # 
   G
S  #
## A
  # 
   G
S  #
##  
  #A
   G
S  #
##  
  # 
   A
Episode 16
A  #
##  
  # 
   G
Episode 17
A  #
##  
  # 
   G
Episode 20
A  #
##  
  # 
   G
SA #
##  
  # 
   G
S A#
##  
  # 
   G
S  #
##A 
  # 
   G
S  #
## A
  # 
   G
S  #
##  
  #A
   G
S  #
##  
  # 
   A
Episode 21
A  #
##  
