In [29]:
env = LawnmowerEnv(max_steps=100, alpha=0.5, gamma=0.9, epsilon=0.1)


In [30]:
observation = env.reset()



In [4]:
import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt


class LawnmowerEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, max_steps = 100):
        # Define the observation and action spaces
        self.observation_space = spaces.Discrete(16)
        self.action_space = spaces.Discrete(4)

        # Initialize the grid
        self.grid = np.array([[6, 5, 6, 5],
                              [5, 6,  5, 6],
                              [5,  6, 6,  6],
                              [6, 5,  6, 6]])

        # Initialize the agent and goal positions
        self.agent_pos = np.array([0, 0])
        self.goal_pos = np.array([3, 3])

        # Initialize the reward
        self.reward = 0
        # self.history = []
        self.max_steps = max_steps
        self.current_step = 0

    def reset(self):
        
        self.current_step = 0
        if np.array_equal(self.agent_pos, self.goal_pos):
            self.agent_pos = np.array([0, 0])

        # Reset the agent position and reward
        self.agent_pos = np.array([0, 0])
        self.reward = 0

        # Return the initial observation
        return self._get_observation()

    def step(self, action):
        
        self.current_step += 1
        if self.current_step >= self.max_steps:
            done = True
        # Move the agent based on the chosen action
        if action == 0:
            self.agent_pos[0] -= 1  # Up
        elif action == 1:
            self.agent_pos[0] += 1  # Down
        elif action == 2:
            self.agent_pos[1] += 1  # Right
        elif action == 3:
            self.agent_pos[1] -= 1  # Left

        # Keep the agent within the grid boundaries
        self.agent_pos = np.clip(self.agent_pos, 0, 3)

        # Calculate the reward based on the new agent position
        self.reward = self.grid[tuple(self.agent_pos)]
        # self.history.append({"observation": self._get_observation(), "reward": self.reward, "done": done})
        # Check if the episode is done
        # self.reward > 0
        done = bool(  np.array_equal(
            self.agent_pos, self.goal_pos))
        
        # self.history.append({"observation": self._get_observation(), "reward": self.reward, "done": done})
        # Return the new observation, reward, done flag, and info dictionary
        return self._get_observation(), self.reward, done, {}

    def render(self, mode='human'):
        # Create a new figure
        fig, ax = plt.subplots()
     
        # Plot the grid with markers for different tile types
        for i in range(4):
            for j in range(4):
                if self.grid[i][j] < 0:
                    ax.scatter(j, i, marker='X', color='r', s=1000)
                elif self.grid[i][j] == 0:
                    ax.scatter(j, i, marker='o', color='b', s=1000)
                elif self.grid[i][j] > 0:
                    ax.scatter(j, i, marker='o', color='g', s=1000)

        # Plot the agent and goal positions
        ax.scatter(self.agent_pos[1], 3 - self.agent_pos[0],
                   marker='s', color='y', s=1000)
        ax.scatter(self.goal_pos[1], 3 - self.goal_pos[0],
                   marker='s', color='m', s=1000)

        # Set the x and y axis limits
        ax.set_xlim([-0.5, 3.5])
        ax.set_ylim([-0.5, 3.5])

        # Add title and axis labels
        ax.set_title('Lawnmower Environment')
        ax.set_xlabel('X Position')
        ax.set_ylabel('Y Position')

        # Add gridlines
        ax.grid(which='major', color='gray', linestyle='-', linewidth=2)
        ax.set_xticks(np.arange(-0.5, 4, 1))
        ax.set_yticks(np.arange(-0.5, 4, 1))
        ax.xaxis.tick_top()

        # Show the plot
        plt.show()


    def _get_observation(self):
        # Convert the agent position to an observation index
        return np.ravel_multi_index(tuple(self.agent_pos), (4, 4))



env = LawnmowerEnv()
# Parameters
epsilon = 0.9
total_episodes = 100
max_steps = 100
alpha = 0.05
gamma = 0.95
  
#Initializing the Q-vaue
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Function to choose the next action with episolon greedy
def choose_action(state):
    action=0
    if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state, :])
    return action
    
#Initializing the reward
reward=0
  
# Starting the SARSA learning
for episode in range(total_episodes):
    t = 0
    state1 = env.reset()
    action1 = choose_action(state1)
  
    while t < max_steps:
        # Visualizing the training
        # env.render()
          
        # Getting the next state
        state2, reward, done, info = env.step(action1)
  
        #Choosing the next action
        action2 = choose_action(state2)
          
        #Learning the Q-value
        Q[state1, action1] = Q[state1, action1] + alpha * (reward + gamma * Q[state2, action2] - Q[state1, action1])
  
        state1 = state2
        action1 = action2
          
        #Updating the respective vaLues
        t += 1
        reward += 1
          
        #If at the end of learning process
        if done:
            break
            
#Evaluating the performance
print ("Performace : ", reward/total_episodes)
  
#Visualizing the Q-matrix
print(Q)

Performace :  0.07
[[36.09263464 30.35561939 32.45963105 36.86205384]
 [33.02654009 27.92641182 27.00887321 35.82426229]
 [26.21121197 19.8820103  20.53375167 30.97327178]
 [18.31272957 17.97156786 18.29592159 25.81989225]
 [34.8730991  21.80380099 27.68482021 27.07566991]
 [31.3243344  23.63778102 21.08820936 25.96711932]
 [25.08418672 18.55558595 16.89828621 21.8007756 ]
 [18.89740066 13.36075065 15.10139646 16.60189167]
 [26.16183417 16.26384942 22.09887539 19.97170276]
 [26.10928823 16.08337423 19.90755169 20.45506704]
 [19.56947002 16.00493069 12.16378913 19.91447923]
 [12.40963564  5.10065848  9.81119747 15.42833809]
 [17.67715771 13.52134015 15.5474692  11.57340761]
 [20.38308955 12.63490841 16.39001862 13.37540791]
 [17.91264774 13.98042037  5.40335846 14.00560344]
 [ 0.          0.          0.          0.        ]]
