<a href="https://colab.research.google.com/github/nithin-sudarsan/q-learning/blob/main/QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

In [2]:
class Gridworld:
  """Deterministic grid world whose cell values double as rewards.

  Cell values: 0 is an ordinary cell, 1 is the goal cell and -1 is a penalty
  cell. Entering a cell whose value is 1 or -1 ends the episode.
  Actions are encoded as 0=up, 1=down, 2=left, 3=right; a move that would
  leave the grid is clamped, so the agent stays on the border cell.
  """

  def __init__(self, grid=None, start_state=(4, 0)):
    """Create the environment.

    Args:
      grid: optional 2-D array-like of cell values (0, 1 or -1). When omitted,
        the default 5x5 layout is used.
      start_state: (row, col) cell in which every episode starts.
    """
    if grid is None:
      # Default 5x5 layout: goal at (1, 2), penalty cells marked with -1.
      grid = [[0, -1, 0, 0, 0],
              [0, 0, 1, 0, 0],
              [0, -1, 0, -1, 0],
              [-1, 0, 0, 0, 0],
              [0, 0, 0, 0, 0]]
    self.grid = np.array(grid)
    self.start_state = tuple(start_state)
    self.state = self.start_state

  def reset(self):
    """Move the agent back to the start cell and return that cell."""
    self.state = self.start_state
    return self.state

  def is_terminal(self, state):
    """Return True when the cell at `state` holds 1 or -1."""
    # Index with a tuple so a single cell (not a row selection) is read.
    cell = self.grid[tuple(state)]
    return bool(cell == 1 or cell == -1)

  def get_next_state(self, state, action):
    """Return the (row, col) reached from `state` by taking `action`.

    Bounds are derived from the grid shape. An action outside 0-3 leaves the
    position unchanged.
    """
    row, col = state
    last_row = self.grid.shape[0] - 1
    last_col = self.grid.shape[1] - 1
    if action == 0: # move up
      row = max(0, row - 1)
    elif action == 1: # move down
      row = min(last_row, row + 1)
    elif action == 2: # move left
      col = max(0, col - 1)
    elif action == 3: # move right
      col = min(last_col, col + 1)
    return (row, col)

  def step(self, action):
    """Apply `action` to the current state.

    Returns:
      (next_state, reward, done): the new cell, the value stored in that cell,
      and whether that cell is terminal.
    """
    next_state = self.get_next_state(self.state, action)
    reward = self.grid[next_state]
    self.state = next_state
    done = self.is_terminal(next_state)
    return next_state, reward, done


In [3]:
class QLearningAgent:
  """Tabular Q-learning agent with epsilon-greedy action selection.

  The Q-table is a NumPy array indexed as ``q_table[row, col, action]`` and
  starts at zero for every state-action pair.
  """

  def __init__(self, learning_rate=0.1, exploration_rate=0.1, discount_factor=0.9,
               state_shape=(5, 5), n_actions=4):
    """Create the agent.

    Args:
      learning_rate: step size applied to each temporal-difference update.
      exploration_rate: probability of picking a uniformly random action.
      discount_factor: weight given to the best Q-value of the next state.
      state_shape: shape of the state space; defaults to a 5x5 grid.
      n_actions: number of discrete actions; defaults to 4.
    """
    self.n_actions = n_actions
    # One Q-value per (state, action) pair.
    self.q_table = np.zeros(tuple(state_shape) + (n_actions,))
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.exploration_rate = exploration_rate

  def choose_action(self, state):
    """Return an action index for `state` as a plain int (epsilon-greedy).

    With probability `exploration_rate` a uniformly random action is drawn;
    otherwise the action with the largest Q-value is returned (np.argmax
    resolves ties in favour of the lowest index).
    """
    if random.uniform(0, 1) < self.exploration_rate:
      return random.randint(0, self.n_actions - 1)
    # tuple() guarantees a single-cell lookup even if a list is passed in.
    return int(np.argmax(self.q_table[tuple(state)]))

  def update_q_value(self, state, action, reward, next_state, done=False):
    """Apply one Q-learning update for the observed transition.

    Args:
      state: (row, col) the action was taken from.
      action: index of the action taken.
      reward: reward received for the transition.
      next_state: (row, col) reached after the action.
      done: when True, `next_state` is treated as terminal and no future value
        is bootstrapped from it. Defaults to False, which always bootstraps.
    """
    state = tuple(state)
    next_state = tuple(next_state)
    max_future_q = 0.0 if done else np.max(self.q_table[next_state])
    current_q = self.q_table[state][action]
    # Bellman equation
    td_error = reward + self.discount_factor * max_future_q - current_q
    # Write through a single full index so the table itself is updated.
    self.q_table[state + (action,)] = current_q + self.learning_rate * td_error

`random.uniform(0, 1)` is a function from Python's built-in random module. It returns a random floating-point number between 0 (inclusive) and 1 (exclusive).

In the context of the `QLearningAgent` class, it's used to implement the epsilon-greedy action selection strategy. A random number is generated between 0 and 1, and if this number is less than the `exploration_rate`, the agent chooses a random action (exploration). Otherwise, it chooses the action with the highest Q-value (exploitation).

In [4]:
# Seed Python's RNG so the epsilon-greedy exploration, and therefore the
# learned Q-table, is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

env = Gridworld()
agent = QLearningAgent()

episodes = 1000

for episode in range(episodes):
  # Every episode starts from the environment's start cell.
  state = env.reset()
  done = False

  # Act, observe and learn until a terminal cell is entered.
  while not done:
    action = agent.choose_action(state)
    next_state, reward, done = env.step(action)
    agent.update_q_value(state, action, reward, next_state)
    state = next_state

In [5]:
# Learned Q-values, indexed as [row, column, action] with actions ordered
# 0=up, 1=down, 2=left, 3=right (a 5x5x4 array as allocated in QLearningAgent).
agent.q_table

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.1       ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 1.        ,  0.71989395, -0.96566316, -0.9282102 ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
  

In [6]:
import pandas as pd

# Reshape the q_table for visualization.
# Its shape is (rows, columns, actions) -- (5, 5, 4) for the default agent.
# We want a table with one row per state and one column per action, so the
# first two dimensions are flattened into rows * columns states (25 here)
# and the action axis is kept as the columns.
n_rows, n_cols, n_actions = agent.q_table.shape
q_table_reshaped = agent.q_table.reshape(n_rows * n_cols, n_actions)

# State labels in the same row-major order as the reshape:
# (0, 0), (0, 1), ..., (n_rows - 1, n_cols - 1)
states_labels = [(i, j) for i in range(n_rows) for j in range(n_cols)]

# Create a pandas DataFrame; the column order matches the action encoding
# 0=Up, 1=Down, 2=Left, 3=Right.
q_table_df = pd.DataFrame(q_table_reshaped, index=states_labels, columns=['Up', 'Down', 'Left', 'Right'])

# Display the DataFrame
display(q_table_df)

Unnamed: 0,Up,Down,Left,Right
"(0, 0)",0.0,0.0,0.0,0.0
"(0, 1)",0.0,0.0,0.0,0.0
"(0, 2)",0.0,0.1,0.0,0.0
"(0, 3)",0.0,0.0,0.0,0.0
"(0, 4)",0.0,0.0,0.0,0.0
"(1, 0)",0.0,0.0,0.0,0.0
"(1, 1)",0.0,0.0,0.0,0.0
"(1, 2)",0.0,0.0,0.0,0.0
"(1, 3)",0.0,0.0,0.0,0.0
"(1, 4)",0.0,0.0,0.0,0.0



| 0 | X | 0 | 0 | 0 |

| 0 | 0 | G | 0 | 0 |

| 0 | X | 0 | X | 0 |

| X | 0 | 0 | 0 | 0 |

| S | 0 | 0 | 0 | 0 |

In [7]:
def visualize_optimal_policy(grid_env, q_agent, max_steps=100):
    """Follow the greedy policy from the start state and return the visited path.

    The environment is reset first, so its current state is overwritten.

    Args:
        grid_env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done)``.
        q_agent: agent whose ``q_table`` can be indexed by a state to obtain
            that state's per-action Q-values.
        max_steps: maximum number of moves before giving up, which guards
            against a policy that never reaches a terminal state.

    Returns:
        List of visited states, beginning with the start state.
    """
    state = grid_env.reset()
    path = [state]
    done = False

    while not done:
        # Choose the action with the highest Q-value for the current state
        action = np.argmax(q_agent.q_table[state])

        # Advance the environment; the reward is not needed to trace the path
        next_state, _, done = grid_env.step(action)
        path.append(next_state)
        state = next_state

        # Only report a runaway path if the episode has not just finished
        if not done and len(path) > max_steps:
            print("Path is too long, stopping.")
            break

    return path

# Visualize the optimal path
optimal_path = visualize_optimal_policy(env, agent)
print("Optimal path:", optimal_path)

# Optional: Print the grid with the path
# Create a copy of the grid with object dtype to store strings
grid_display = np.copy(env.grid).astype(object)
for r, c in optimal_path:
    if grid_display[r, c] == 0:
        grid_display[r, c] = 'P' # Mark path with 'P'
    elif grid_display[r,c] == 1:
      grid_display[r,c] = 'G' # Mark goal with 'G'

print("\nGrid with Optimal Path:")
print(grid_display)

Optimal path: [(4, 0), (4, 1), (3, 1), (3, 2), (2, 2), (1, 2)]

Grid with Optimal Path:
[[0 -1 0 0 0]
 [0 0 'G' 0 0]
 [0 -1 'P' -1 0]
 [-1 'P' 'P' 0 0]
 ['P' 'P' 0 0 0]]
