In [5]:
import numpy as np
import random
import pickle

class GridWorld:
    """Square grid environment with a start cell, a goal cell and penalty cells.

    Cells are addressed as ``(x, y)`` with both coordinates in
    ``range(size)``. The agent starts at ``(0, 0)`` and the episode ends when
    it reaches the opposite corner ``(size - 1, size - 1)``. Obstacles do not
    block movement; entering one only costs a larger penalty than a normal
    move.

    Args:
        size: Side length of the grid (must be at least 1).
        obstacles: Number of obstacle cells to place at random.

    Raises:
        ValueError: If ``size`` is smaller than 1, or if more obstacles are
            requested than there are free cells.
    """

    # Coordinate delta for every supported action; unknown actions are no-ops.
    _MOVES = {
        "LEFT": (-1, 0),
        "RIGHT": (1, 0),
        "UP": (0, -1),
        "DOWN": (0, 1),
    }

    def __init__(self, size=5, obstacles=5):
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.start = (0, 0)
        # Derive the goal from the grid size so that non-default sizes still
        # have a reachable goal (for the default size this is (4, 4)).
        self.goal = (size - 1, size - 1)
        self.obstacles = set()
        self.generate_obstacles(obstacles)
        self.reset()

    def generate_obstacles(self, count):
        """Grow ``self.obstacles`` with random cells until it holds ``count`` cells.

        The start and goal cells are never chosen. Does nothing when the set
        already contains at least ``count`` obstacles.

        Raises:
            ValueError: If the grid has too few free cells to reach ``count``.
        """
        missing = count - len(self.obstacles)
        if missing <= 0:
            return

        reserved = {self.start, self.goal}
        free_cells = [
            (x, y)
            for x in range(self.size)
            for y in range(self.size)
            if (x, y) not in reserved and (x, y) not in self.obstacles
        ]
        # Sampling without replacement always terminates, unlike retrying
        # random cells until enough distinct ones have been found.
        if missing > len(free_cells):
            raise ValueError(
                f"cannot place {count} obstacles on a {self.size}x{self.size} grid"
            )
        self.obstacles.update(random.sample(free_cells, missing))

    def reset(self):
        """Move the agent back to the start cell and return that position."""
        self.agent_pos = self.start
        return self.agent_pos

    def get_actions(self, state):
        """Return the actions that keep the agent on the grid from ``state``."""
        x, y = state
        actions = []
        if x > 0: actions.append("LEFT")
        if x < self.size - 1: actions.append("RIGHT")
        if y > 0: actions.append("UP")
        if y < self.size - 1: actions.append("DOWN")
        return actions

    def step(self, action):
        """Apply ``action`` and return ``(new_state, reward, done)``.

        Rewards: ``10`` for reaching the goal (``done`` is True), ``-5`` for
        ending on an obstacle, ``-1`` otherwise. An unknown action, or a move
        that would leave the grid, keeps the agent where it is and is rewarded
        according to that cell.
        """
        x, y = self.agent_pos
        dx, dy = self._MOVES.get(action, (0, 0))
        nx, ny = x + dx, y + dy

        # Never let the agent leave the grid, even if the caller did not
        # restrict itself to get_actions().
        if not (0 <= nx < self.size and 0 <= ny < self.size):
            nx, ny = x, y

        new_state = (nx, ny)
        if new_state == self.goal:
            reward, done = 10, True  # Goal reward
        elif new_state in self.obstacles:
            reward, done = -5, False  # Obstacle penalty
        else:
            reward, done = -1, False  # Default move penalty

        self.agent_pos = new_state
        return new_state, reward, done

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in ``q_table``, a dict keyed by ``(state, action)``; pairs
    that were never updated are treated as having value 0.

    Args:
        alpha: Learning rate used in the Q-value update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of picking a random action instead of the
            greedy one.
    """

    # Action set used for bootstrapping when the caller does not say which
    # actions are legal in the next state.
    ACTIONS = ("LEFT", "RIGHT", "UP", "DOWN")

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.q_table = {}  # (state, action) -> Q-value
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((state, action), 0)

    def choose_action(self, state, available_actions):
        """Pick an action epsilon-greedily from ``available_actions``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value, ties going to the one
        listed first in ``available_actions``.
        """
        if random.uniform(0, 1) < self.epsilon:  # Explore
            return random.choice(available_actions)
        # Exploit: Pick best action based on Q-values
        q_values = {a: self.get_q_value(state, a) for a in available_actions}
        return max(q_values, key=q_values.get)

    def update_q_table(self, state, action, reward, next_state, done,
                       next_actions=None):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State reached after the transition.
            done: True if ``next_state`` is terminal; no future value is
                bootstrapped in that case.
            next_actions: Optional iterable of the actions that are legal in
                ``next_state``. When omitted, all of ``ACTIONS`` are
                considered. Passing the legal actions matters once Q-values
                turn negative: an illegal action is never updated, keeps its
                default value of 0 and would otherwise win the max.
        """
        old_value = self.get_q_value(state, action)
        if done:
            future_max = 0
        else:
            candidates = self.ACTIONS if next_actions is None else next_actions
            future_max = max(
                (self.get_q_value(next_state, a) for a in candidates),
                default=0,  # no legal next action -> nothing to bootstrap from
            )
        new_value = old_value + self.alpha * (
            reward + self.gamma * future_max - old_value
        )
        self.q_table[(state, action)] = new_value

def train_q_agent(episodes=5000, save_path="grid_q_table.pkl"):
    """Train a Q-learning agent on a fresh GridWorld and pickle its Q-table.

    A new ``GridWorld`` (with randomly placed obstacles) and a new
    ``QLearningAgent`` are created, the agent is trained for ``episodes``
    episodes, the resulting Q-table is written to ``save_path`` and a short
    summary is printed.

    Args:
        episodes: Number of training episodes to run.
        save_path: File the Q-table dict is pickled to.
    """
    env = GridWorld()
    agent = QLearningAgent()
    scores = []

    for _ in range(episodes):
        state = env.reset()
        done = False
        total_score = 0

        # An episode only ends when the environment reports the goal as reached.
        while not done:
            available_actions = env.get_actions(state)
            action = agent.choose_action(state, available_actions)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state, done)
            state = next_state
            total_score += reward  # Accumulate score

        scores.append(total_score)  # Store episode score

    # Save trained Q-table
    with open(save_path, "wb") as f:
        pickle.dump(agent.q_table, f)

    print("Training completed and Q-table saved!")
    if scores:
        print(f"Average Score: {sum(scores) / len(scores)}")
    else:
        # With zero episodes there is nothing to average; avoid dividing by zero.
        print("Average Score: n/a (no episodes were run)")

# Run training when this code is executed directly. In a notebook cell the
# module name is "__main__" as well, so running the cell starts training.
if __name__ == "__main__":
    train_q_agent()


Training completed and Q-table saved!
Average Score: -0.9316


In [6]:
def play_with_q_agent(q_table_path="grid_q_table.pkl", max_steps=100):
    """Replay one greedy episode with a previously saved Q-table.

    Loads the pickled Q-table from ``q_table_path``, creates a new
    ``GridWorld`` and lets a purely greedy agent act until the goal is
    reached or ``max_steps`` moves have been made, printing every move.

    NOTE(review): ``GridWorld()`` draws a new random obstacle layout, so the
    obstacles seen here are not necessarily the ones the Q-table was trained
    on.

    Args:
        q_table_path: Pickle file containing the Q-table dict.
        max_steps: Upper bound on the number of moves, so that a greedy
            policy that runs in circles cannot loop forever.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load Q-tables that were produced locally / come from a trusted source.
    with open(q_table_path, "rb") as f:
        q_table = pickle.load(f)

    env = GridWorld()
    # epsilon=0 disables exploration: the replay should show the learned
    # policy, not random moves.
    agent = QLearningAgent(epsilon=0)
    agent.q_table = q_table  # Load trained Q-values

    state = env.reset()
    total_score = 0

    print(f"Start: {state}")

    for _ in range(max_steps):
        print(f"Agent at: {state}")
        available_actions = env.get_actions(state)
        action = agent.choose_action(state, available_actions)
        print(f"Agent chooses: {action}")

        state, reward, done = env.step(action)
        total_score += reward

        if done:
            # The environment only reports done when the goal cell is reached.
            print(f"Final Position: {state}")
            print("Goal reached!")
            print(f"Total Score: {total_score}")
            break
    else:
        print(f"Final Position: {state}")
        print(f"Stopped after {max_steps} steps without reaching the goal.")
        print(f"Total Score: {total_score}")

# Replay one episode; this reads the grid_q_table.pkl file that training writes.
play_with_q_agent()


Start: (0, 0)
Agent at: (0, 0)
Agent chooses: RIGHT
Agent at: (1, 0)
Agent chooses: RIGHT
Agent at: (2, 0)
Agent chooses: RIGHT
Agent at: (3, 0)
Agent chooses: DOWN
Agent at: (3, 1)
Agent chooses: DOWN
Agent at: (3, 2)
Agent chooses: RIGHT
Agent at: (4, 2)
Agent chooses: DOWN
Agent at: (4, 3)
Agent chooses: LEFT
Agent at: (3, 3)
Agent chooses: DOWN
Agent at: (3, 4)
Agent chooses: RIGHT
Final Position: (4, 4)
Goal reached!
Total Score: -11
