# Piero Pettenà - RL project
Based on the lecture Jupyter notebooks, adapted to my project. Credit goes to the authors of those notebooks.

NOTE: the 'transition' and 'rewards' methods could perhaps be merged into one.

## Plotting utilities

In [19]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

# Notebook-wide matplotlib defaults: figure size (inches), resolution and font size.
plt.rcParams.update({
    'figure.figsize': [10, 7],
    'figure.dpi': 100,
    'font.size': 6,
})

def plot_world(World):
    """
    Draw a gridworld as a colour map and annotate its special cells.

    World is a 2D numpy array indexed as World[row, col]:
      - cells with a value > 0 or < -1 are treated as goals and labelled
        'G<value>' (the value is truncated to an integer for the label);
      - cells equal to -1 are treated as blocks and labelled 'X';
      - every other cell is left without a label.

    The figure is displayed with plt.show(); nothing is returned.
    """
    Ly, Lx = World.shape

    fig, ax = plt.subplots()
    ax.imshow(World, cmap=plt.get_cmap("Spectral"))

    # We want to show all ticks...
    ax.set_xticks(np.arange(Lx))
    ax.set_yticks(np.arange(Ly))

    # Masks are built from the argument itself. The goal test previously read
    # a global `world.grid`, which either failed or described another grid.
    goal_mask = np.logical_or(World > 0.0, World < -1.0)
    block_mask = World == -1.0

    # Text coordinates are (x, y) = (column, row), hence the swapped order.
    for i in range(Lx):
        for j in range(Ly):
            if goal_mask[j, i]:
                ax.text(i, j, 'G{}'.format(int(World[j, i])),
                        ha="center", va="center", color="black")
            elif block_mask[j, i]:
                ax.text(i, j, 'X', ha="center", va="center",
                        color="black", backgroundcolor="black")
    plt.show()

    

def plot_world_values(World, Values):
    """
    Show the gridworld and its state values side by side.

    Left panel: World as a colour map, with goal cells (value > 0 or < -1)
    labelled 'G<value>' and block cells (value == -1) labelled 'X'.
    Right panel: Values as a colour map with the same goal/block labels;
    every other cell shows its value with two decimals.

    Both arrays are indexed [row, col]. The figure is displayed with
    plt.show(); nothing is returned.
    """
    n_rows, n_cols = World.shape
    spectral = plt.get_cmap("Spectral")
    centred = dict(ha="center", va="center", color="black")

    fig, (world_ax, value_ax) = plt.subplots(1, 2)

    # Per-cell classification, looked up by [row, col] below.
    is_goal = (World > 0.0) | (World < -1.0)
    is_block = World == -1.0

    # --- left panel: the world itself ---
    world_ax.imshow(World, cmap=spectral)
    world_ax.set_xticks(np.arange(n_cols))
    world_ax.set_yticks(np.arange(n_rows))
    for col, row in np.ndindex(n_cols, n_rows):
        if is_goal[row, col]:
            world_ax.text(col, row, 'G{}'.format(World[row, col]), **centred)
        elif is_block[row, col]:
            world_ax.text(col, row, 'X', backgroundcolor="black", **centred)

    # --- right panel: the value function ---
    value_ax.imshow(Values, cmap=spectral)
    value_ax.set_xticks(np.arange(n_cols))
    value_ax.set_yticks(np.arange(n_rows))
    for col, row in np.ndindex(n_cols, n_rows):
        if is_goal[row, col]:
            value_ax.text(col, row, 'G{}'.format(World[row, col]), **centred)
        elif is_block[row, col]:
            value_ax.text(col, row, 'X', backgroundcolor="black", **centred)
        else:
            value_ax.text(col, row, '{:.2f}'.format(Values[row, col]), **centred)

    plt.show()

    

def plot_world_values_policy(World, Values, Policy):
    """
    Show three panels: the gridworld, its state values, and the policy.

    Panel 1: World as a colour map; goal cells (value > 0 or < -1) are
             labelled 'G-<value>', block cells (value == -1) 'X'.
    Panel 2: Values as a colour map; goals 'G<value>', blocks 'X', every
             other cell its value with two decimals.
    Panel 3: Values as a colour map with the goal/block labels and an arrow
             per cell taken from Policy, read as Policy[row, col] =
             (row component, column component).

    All arrays are indexed [row, col]. The figure is displayed with
    plt.show(); nothing is returned.
    """
    n_rows, n_cols = World.shape
    spectral = plt.get_cmap("Spectral")
    centred = dict(ha="center", va="center", color="black")

    fig, (world_ax, value_ax, policy_ax) = plt.subplots(1, 3)

    # Per-cell classification, looked up by [row, col] below.
    is_goal = (World > 0.0) | (World < -1.0)
    is_block = World == -1.0

    # --- panel 1: the world itself ---
    world_ax.imshow(World, cmap=spectral)
    world_ax.set_xticks(np.arange(n_cols))
    world_ax.set_yticks(np.arange(n_rows))
    for col, row in np.ndindex(n_cols, n_rows):
        if is_goal[row, col]:
            # NOTE(review): this panel labels goals 'G-{}' while the other
            # panels use 'G{}' -- confirm whether the dash is intended.
            world_ax.text(col, row, 'G-{}'.format(World[row, col]), **centred)
        elif is_block[row, col]:
            world_ax.text(col, row, 'X', backgroundcolor="black", **centred)

    # --- panel 2: values (goal/block labels are mirrored onto panel 3) ---
    value_ax.imshow(Values, cmap=spectral)
    value_ax.set_xticks(np.arange(n_cols))
    value_ax.set_yticks(np.arange(n_rows))
    for col, row in np.ndindex(n_cols, n_rows):
        if is_goal[row, col]:
            goal_label = 'G{}'.format(World[row, col])
            value_ax.text(col, row, goal_label, **centred)
            policy_ax.text(col, row, goal_label, **centred)
        elif is_block[row, col]:
            value_ax.text(col, row, 'X', backgroundcolor="black", **centred)
            policy_ax.text(col, row, 'X', backgroundcolor="black", **centred)
        else:
            value_ax.text(col, row, '{:.2f}'.format(Values[row, col]), **centred)

    # --- panel 3: policy arrows over the value map ---
    policy_ax.imshow(Values, cmap=spectral)
    # Horizontal component is the column part of the policy; the vertical
    # component is the negated row part.
    horizontal = Policy[:, :, 1]
    vertical = -Policy[:, :, 0]
    policy_ax.quiver(np.arange(n_cols), np.arange(n_rows), horizontal, vertical, color="black")

    plt.show()
    

# Creating the world
There will be no "block" cells as they are not needed for our problem.

In [20]:
import numpy as np
import math
# TYPICAL (GRID)WORLD

def new_world(Lx, Ly, goal, rewards):
    """
    Build a gridworld of width Lx and height Ly as a (Ly, Lx) array of zeros,
    then write each reward into the cell named by the matching goal entry.

    goal:    list of index tuples (row, col), one per goal cell
    rewards: list of values, same length as goal

    Returns the filled array. No block cells are placed.
    """
    # One reward per goal, otherwise the pairing below would silently truncate.
    assert len(goal) == len(rewards)

    grid = np.zeros(shape=(Ly, Lx))

    for position, value in zip(goal, rewards):
        grid[position] = value

    return grid

class World():
    """World environment class. It contains the grid, the datum and
    goal positions, and an 'explored' grid which stores the last moment
    each cell has been visited.

    NOTE: should add a variable that improves visibility of adjacent cells

    Attributes:
        grid:       integer array of shape (Lx, Ly); -1 everywhere (cost of a
                    step) except 0 at the goal cell
        goal:       position of target in grid, as an index tuple into grid
        datum:      position of datum (reference) in grid, (Lx//2, Ly//2)
        current:    current offset (future implementation)
        explored:   contains time of last exploration of a cell
        actions:    array of moves the agent can take, one (d_row, d_col) per row.
                    I want these to be part of the environment
        randomExplorerFlag:  if true, action is picked at random each time
        rem_time:   remaining time to find the target
        budget:     initial budget of time to find the target (will not be updated)

    """
    def __init__(self, Ly = 20, Lx = 20, goal = (0,0), rem_time = 1000):
        """Create the grid and place the goal.

        The default goal (0, 0) doubles as "no goal given": in that case
        (or when goal is None) a random cell is drawn instead.
        """
        if goal is None or tuple(goal) == (0, 0):
            goal = (np.random.randint(Lx), np.random.randint(Ly))
        # Stored as a tuple of ints so that grid[self.goal] addresses one cell
        # (a list or array here would select whole rows).
        self.goal = tuple(int(c) for c in goal)

        self.grid = np.full((Lx, Ly), -1)
        # Use the goal actually stored, not the raw argument, so a randomised
        # goal and the zero-cost cell always coincide.
        self.grid[self.goal] = 0
        self.actions = np.array([[1,0],[-1,0],[0,1],[0,-1]])        #Actions = [Up, Down, Right, Left]
        self.randomExplorerFlag = True
        self.datum = (Lx//2, Ly//2)
        self.budget = rem_time
        self.rem_time = rem_time
        self.current = (0,0)    #maybe for addition of current

        self.explored = np.zeros_like(self.grid)

    @staticmethod
    def _cell(pos):
        """Return a position (tuple, list or array) as a (row, col) tuple of ints."""
        return (int(pos[0]), int(pos[1]))

    def _draw_grid(self):
        """Open a figure showing self.grid as a colour map with one tick per cell."""
        n_rows, n_cols = self.grid.shape

        fig, ax = plt.subplots()
        ax.imshow(self.grid, cmap=plt.get_cmap("Spectral"))

        # We want to show all ticks...
        ax.set_xticks(np.arange(n_cols))
        ax.set_yticks(np.arange(n_rows))
        return fig, ax

    def _mark_goal_and_datum(self, ax):
        """Label the goal cell 'G' and, unless it is the same cell, the datum 'D'."""
        goal = self._cell(self.goal)
        datum = self._cell(self.datum)
        # Text coordinates are (x, y) = (column, row).
        ax.text(goal[1], goal[0], 'G', ha="center", va="center", color="black", backgroundcolor="blue")
        if datum != goal:
            ax.text(datum[1], datum[0], 'D', ha="center", va="center", color="black", backgroundcolor="green")

    def plot_world(self, agent = None):
        """Plots the world (copied from tutor).

        Marks the goal 'G', the datum 'D' and, if an agent is given and it is
        on neither of those cells, the agent 'A'.
        """
        fig, ax = self._draw_grid()
        self._mark_goal_and_datum(ax)

        if agent is not None:
            here = self._cell(agent.pos)
            if here != self._cell(self.goal) and here != self._cell(self.datum):
                ax.text(here[1], here[0], 'A', ha="center", va="center", color="black", backgroundcolor="white")
        plt.show()

    def plot_world_with_path(self, agent = None):
        """Plots the world (copied from tutor) together with the agent's path.

        Every cell in agent.explored is covered by a white marker and the
        agent's current cell is labelled 'A'. With agent=None only the goal
        and the datum are drawn.
        """
        fig, ax = self._draw_grid()
        self._mark_goal_and_datum(ax)

        if agent is not None:
            for cell in agent.explored:
                ax.text(cell[1], cell[0], '.', ha="center", va="center", color="white", backgroundcolor="white")

            # Place the marker at the agent's own position, as (x, y) = (col, row).
            here = self._cell(agent.pos)
            ax.text(here[1], here[0], 'A', ha="center", va="center", color="black", backgroundcolor="white")

        plt.show()

    def plot_world_policy(self, actions):
        """Plot one arrow per cell for a matrix of action indices.

        actions must have the same shape as the grid. An entry equal to 0, 1
        or 2 selects [1, 0], [-1, 0] or [0, 1] respectively; any other value
        selects [0, -1].
        """
        moves = np.array([[1, 0], [-1, 0], [0, 1], [0, -1]])
        actions = np.asarray(actions)

        # Start from the fallback index and overwrite the exact matches.
        move_index = np.full(actions.shape, 3)
        for k in (0, 1, 2):
            move_index[actions == k] = k
        policy = moves[move_index]          # shape: actions.shape + (2,)

        Ly, Lx = self.grid.shape
        fig, ax = self._draw_grid()
        X = np.arange(Lx)
        Y = np.arange(Ly)
        U, V = policy[:,:,1], -policy[:,:,0]
        ax.quiver(X, Y, U, V, color="black")

        plt.show()


In [21]:
class Agent():
    """Agent = explorer moving on the grid of a world.

    Attributes:
        pos:        current position (row, col); starts at the world's datum
        visibility: how far the agent sees (currently only stored)
        env:        the world the agent acts in (same object, not a copy)
        ddist:      distance from the datum, updated by search()
        choices:    per-cell index (into env.actions) of the last move that
                    successfully left that cell
        explored:   positions reached so far, one per step taken
    """
    def __init__(self, world):
        self.pos = (world.datum)    #initial position is the same as the datum
        self.visibility = 1     #try to change visibility of agent (sees not only strictly adjacent cells)

        self.env = world        #environment of agent; assignment binds the same object, no copy is made
        self.ddist = 0          #distance from datum
        self.choices = np.zeros_like(world.grid)    #contains last choices for each position
        self.explored = [self.pos]      #array with already explored cells

    def chooseAction(self):
        """Return one row of env.actions.

        Only the random explorer exists so far; any other setting raises
        NotImplementedError instead of returning an undefined action.
        """
        if not self.env.randomExplorerFlag:
            raise NotImplementedError("Needs implementation for randomExplorerFlag = False")

        # Uniformly random row of the action table.
        random_row = np.random.randint(self.env.actions.shape[0])
        return self.env.actions[random_row]

    def transition(self, A):
        """Takes the selected action A and returns the resulting new S.
        Updates remaining time in environment attribute of the agent.

        A move that would leave the grid keeps the agent where it is; the
        reward of the resulting cell is still added to env.rem_time.
        """
        # The position becomes an array after the first step; a tuple is
        # needed to address one cell (an array index would select whole rows).
        here = tuple(int(c) for c in self.pos)

        # I try to move
        S_new = np.asarray(self.pos) + A
        n_rows, n_cols = self.env.grid.shape
        inside = (0 <= S_new[0] < n_rows) and (0 <= S_new[1] < n_cols)
        if not inside:
            # if I go out of the world, I stay still
            S_new = self.pos
        else:
            # stores index of action (one number per cell of choices matrix)
            self.choices[here] = np.where((self.env.actions == A).all(axis=1))[0][0]

        # get the reward for the new state
        reward = self.env.grid[S_new[0], S_new[1]]

        # add cell to list of visited ones
        self.explored = np.append(self.explored, [S_new], axis=0)

        # update last visit of the cell being left in environment attribute
        self.env.explored[here] = self.env.budget - self.env.rem_time

        # update remaining time in environment attribute
        self.env.rem_time = self.env.rem_time + reward

        # returns the new state
        return S_new

    def search(self):
        """Move until the goal is reached or the time budget is used up.

        Returns the fraction of the budget still left (env.rem_time / env.budget).
        """
        while (np.any(np.asarray(self.pos) != np.asarray(self.env.goal))
               and self.env.rem_time > 0):
            #choose action
            action = self.chooseAction()
            #perform action
            self.pos = self.transition(action)
            #update info relative to datum
            self.ddist = math.dist(self.pos, self.env.datum)

        # if np.all(self.pos == self.env.goal):
        #     print(f"Target found with {self.env.budget - self.env.rem_time} iterations")
        #     print("Weight would be ", self.env.rem_time / self.env.budget)
        # else:
        #     print("Target not found")

        #end run and return remaining time
        return self.env.rem_time/self.env.budget


# Playground

In [22]:
# Experiment size: grid dimensions and number of independent random searches.
Lx = 20
Ly = 20
n_runs = 1000

# weights[i] is the value returned by the i-th search; action_mtx accumulates
# each run's per-cell action indices scaled by that weight.
weights = np.zeros(n_runs)
action_mtx = np.zeros((Lx, Ly))

for i in range(n_runs):
    # Fresh world with a random goal, explored by a fresh agent.
    goal = (np.random.randint(Lx), np.random.randint(Ly))
    world = World(Ly, Lx, goal)
    ag = Agent(world)
    weights[i] = ag.search()

    # Whole-array form of the per-cell weighted accumulation.
    action_mtx = action_mtx + ag.choices * weights[i]
    #action_mtx = action_mtx + ag.choices


# Normalise by the total weight.
action_mtx = action_mtx / weights.sum()
#action_mtx = action_mtx/(n_runs)

print("ag.choices = ", ag.choices)

print("weights = ", weights)
print("action matrix: ")
print(action_mtx)
#world.plot_world_policy(ag.choices)

print("Visual representation of the gridworld:")
world.plot_world_with_path(ag)

# Transition and Rewards

 - Transitions: Given a state S and the Action A, we return the new state, taking care that we do not do forbidden actions. We will use deterministic transitions.

 - Rewards: Given a state S and the Action A and the new state S', we also have the probability to receive a reward R.

__PS: Achtung!__
The convention for python arrays is $A[i_y, i_x]$, where $i_y$ is the row-index and $i_x$ is the column-index... So the convention with _up_, _down_, _right_ and _left_ directions is consistent, but may be a bit confusing: use special care!

In [None]:
# The list of actions I can take: Actions = [Up, Down, Right, Left]
# Each row is an offset that gets added to a state [iy, ix] (row, column).
# NOTE(review): [1, 0] increases the row index -- confirm that the "Up" label
# matches the orientation used by the plots.
Actions = np.array([[1,0],[-1,0],[0,1],[0,-1]])

def transition(S, A, world):
    """
    Takes the current position S and selected action A,
    and returns the resulting new S given a world.

    world may be either an object exposing the grid as `world.grid` or the
    2D grid array itself. S and A are arrays [row, col] / [d_row, d_col].
    The move is refused (S is returned unchanged) when the target cell lies
    outside the grid or holds the block value -1.
    """
    # Accept both an environment object and a bare grid array.
    grid = getattr(world, "grid", world)

    # I try to move
    S_new = S + A
    Ly, Lx = grid.shape

    # if I go out of the world, I stay still
    if not (0 <= S_new[0] < Ly and 0 <= S_new[1] < Lx):
        return S
    # if I found a block I stay still
    if grid[S_new[0], S_new[1]] == -1:
        return S
    # returns the new state
    return S_new

def rewards(S, A, S_new, world):
    """
    Takes the current position S and selected action A,
    and returns the resulting reward given that it has ended up in S_new.

    world may be either an object exposing the grid as `world.grid` or the
    2D grid array itself. The reward is the value stored in the arrival cell
    when that value is > 0 or < -1 (a goal cell), and 0 otherwise. S and A
    are accepted for a uniform signature but are not used.
    """
    # Accept both an environment object and a bare grid array.
    grid = getattr(world, "grid", world)

    cell_value = grid[S_new[0], S_new[1]]
    # reward is always zero, except when I reach a goal cell
    if cell_value > 0 or cell_value < -1:
        return cell_value
    return 0


In [None]:
def update_values(Values, world, gamma, p=0.9):
    """
    Takes the current matrix of values (V_k(s) )
    The associated gridworld,
    And computes the bellman operator
    V_(k+1) (s) = max_a { sum_s'r   p(r, s'| s, a)(r + gamma V_k(s') }
    And the relative best policy
    pi_(k+1)(s) = argmax_a { sum_s'r   p(r, s'| s, a)(r + gamma V_k(s') }

    world may be either an object exposing the grid as `world.grid` or the
    2D grid array itself; it is forwarded unchanged to transition() and
    rewards(). Cells equal to -1 are skipped (value 0, policy [0, 0]) and
    goal cells (value > 0 or < -1) are reset to value 0 and policy [0, 0].

    NOTE: transitions are evaluated deterministically; the parameter p is
    accepted but not used by this implementation.

    Returns V_(k+1)(s) as a (Ly, Lx) array and pi_(k+1)(s) as a (Ly, Lx, 2)
    array holding the best action of each cell.
    """
    # Accept both an environment object and a bare grid array.
    grid = getattr(world, "grid", world)

    # The dimension of the world
    Ly, Lx = grid.shape
    # initialize the arrays to store the new values and policy
    NewValues = np.zeros((Ly, Lx))
    NewPolicy = np.zeros((Ly, Lx, 2))
    goal_mask = np.logical_or(grid > 0.0, grid < -1.0)

    # --------------- UPDATE -------------------------------------
    # cycle over all the states
    for ix in range(Lx):
        for iy in range(Ly):
            # skip blocked squares
            if grid[iy, ix] == -1:
                continue

            # state is defined by its indices
            S = np.array([iy, ix])

            # Start below every finite value and reset the best action for
            # each state, so a choice from a previous state cannot leak in.
            maxvalue = -np.inf
            bestact = np.zeros(2)

            # it "tries out" all actions and stores the best
            for A in Actions:
                new_S = transition(S, A, world)
                R = rewards(S, A, new_S, world)

                value_action = R + gamma * Values[new_S[0], new_S[1]]

                # Strict comparison: in a tie the first action in Actions wins.
                if value_action > maxvalue:
                    maxvalue = value_action
                    bestact = A

            # It stores the new value and policy of state S
            NewValues[iy, ix] = maxvalue
            NewPolicy[iy, ix, :] = bestact
    # --------------------------------------------------------------

    # Goal cells are terminal: zero value and no action.
    NewValues[goal_mask] = 0
    NewPolicy[goal_mask] = 0
    return NewValues, NewPolicy

In [None]:
# Candidate rows to search through.
arr = np.array([[1, 0], [-1, 0], [0, 1], [0, -1]])

# The row whose position in `arr` we want.
target_element = [0, -1]

# A row matches only when every one of its components equals the target;
# np.where then returns a tuple whose first entry holds the matching row numbers.
indices = np.where(np.all(arr == target_element, axis=1))

# Report the match (or its absence).
if indices[0].size == 0:
    print("Element not found in the array.")
else:
    print("Index of", target_element, ":", indices[0])
    print("Type of target element:", type(target_element))

Index of [0, -1] : [3]
Type of target element: <class 'list'>
