# COMS4061A - Reinforcement Learning

## Markov Decision Processes

- Mamello Seboholi [1851317]

### Imports

In [20]:
import numpy as np

### Utils

In [31]:
# Reward table: a list of rows, each row a list of integer rewards.
Rewards = list[list[int]]
# A pair of integer grid coordinates, as returned by Action.get_new_position.
Position = tuple[int, int]

def get_value(array, x, y, default=None):
    """Return ``array[x][y]``, or ``default`` when either index is out of range.

    Negative indices are treated as out of range, so they never wrap around
    to the end of the sequence.

    Args:
        array: Sequence of rows, each row itself a sequence.
        x: Index of the row within ``array``.
        y: Index of the element within that row.
        default: Value returned when ``x`` or ``y`` is out of range.

    Returns:
        The element at ``array[x][y]``, or ``default``.
    """
    if x < 0 or y < 0:
        return default

    if x >= len(array):
        return default

    row = array[x]

    # Bound the column index by the row's own length rather than by the
    # number of rows, so non-square and ragged grids are handled correctly.
    if y >= len(row):
        return default

    return row[y]

### Action

In [30]:
class Action:
    """A named move that maps a grid position to a new position.

    ``x_modifier`` and ``y_modifier`` are one-argument callables applied
    independently to the x and y coordinates; ``desc`` is the label returned
    by ``str()``.
    """

    def __init__(self, x_modifier, y_modifier, desc = "Action"):
        self.desc = desc
        self.x_modifier = x_modifier
        self.y_modifier = y_modifier

    def __str__(self):
        return format(self.desc)

    def get_new_position(self, x: int, y: int) -> Position:
        """Return the ``(x, y)`` pair obtained by applying both modifiers."""
        moved_x = self.x_modifier(x)
        moved_y = self.y_modifier(y)
        return moved_x, moved_y


# The four moves: UP and DOWN shift y by -1 and +1, LEFT and RIGHT shift x
# by -1 and +1; the other coordinate is passed through unchanged.
UP = Action(x_modifier=lambda x: x, y_modifier=lambda y: y - 1, desc="UP")
RIGHT = Action(x_modifier=lambda x: x + 1, y_modifier=lambda y: y, desc="RIGHT")
DOWN = Action(x_modifier=lambda x: x, y_modifier=lambda y: y + 1, desc="DOWN")
LEFT = Action(x_modifier=lambda x: x - 1, y_modifier=lambda y: y, desc="LEFT")

### State

In [23]:
class State:
    """A grid cell identified by ``x`` and ``y`` plus the actions available in it."""

    def __init__(self, x, y, actions):
        self.x, self.y, self.actions = x, y, actions

    def __repr__(self):
        # Show each action by its string form rather than its default repr.
        action_names = list(map(str, self.actions))
        return f"[{self.x}][{self.y}] actions: {action_names}"


# Grid of states: a list of rows, each row a list of State objects.
States = list[list[State]]
    
def get_states_from_world(world) -> States:
    """Build a grid of ``State`` objects mirroring the layout of ``world``.

    For every cell the four neighbours are probed with ``get_value`` using -1
    as the default. A neighbour whose value is -1 (a wall cell, or the default
    returned for an out-of-range index) does not contribute an action; every
    other neighbour adds the matching action, in UP, RIGHT, DOWN, LEFT order.

    Each state is created as ``State(column, row, actions)``, so ``x`` is the
    column index and ``y`` is the row index.
    """
    blocked = -1
    # (action, row offset, column offset) for each neighbour that is probed.
    probes = (
        (UP, -1, 0),
        (RIGHT, 0, 1),
        (DOWN, 1, 0),
        (LEFT, 0, -1),
    )

    grid = []
    for row_idx, row in enumerate(world):
        grid_row = []
        for col_idx, _ in enumerate(row):
            available = [
                action
                for action, row_step, col_step in probes
                if get_value(world, row_idx + row_step, col_idx + col_step, blocked) != blocked
            ]
            grid_row.append(State(col_idx, row_idx, available))
        grid.append(grid_row)
    return grid

### MDP

In [33]:
class MDP:
    """Grid-world Markov decision process: a grid of states plus a reward table.

    Both ``states`` and ``rewards`` are indexed ``[y][x]`` (row first).
    """

    def __init__(self, states: States, rewards: Rewards):
        self.states = states
        self.rewards = rewards

    def __str__(self):
        """Return the state grid as a tab-indented, bracketed listing.

        Each state is printed on its own line; states within a row and rows
        within the grid are separated by commas, with none after the last.
        """
        lines = ["["]
        last_row = len(self.states) - 1
        for i, states_row in enumerate(self.states):
            lines.append("\t[")
            last_col = len(states_row) - 1
            for j, state in enumerate(states_row):
                # A comma follows every entry except the last one in the row.
                lines.append(f"\t\t{state}{',' if j != last_col else ''}")
            lines.append(f"\t]{',' if i != last_row else ''}")
        lines.append("]")
        return "\n".join(lines)

    def get_state(self, x: int, y: int):
        """Return the state stored at row ``y``, column ``x``."""
        return self.states[y][x]

    def get_reward(self, state: State, action: Action):
        """Return the reward of the cell reached by taking ``action`` in ``state``.

        The target position comes from ``action.get_new_position`` and is
        looked up in ``rewards`` without bounds checking.
        """
        # Start from the state's own (x, y); passing state.y as x would look
        # up the reward in the wrong column.
        next_x, next_y = action.get_new_position(x=state.x, y=state.y)
        return self.rewards[next_y][next_x]

### Agent

In [35]:
class Agent:
    """Agent bound to an MDP; training is currently a stub."""

    def __init__(self, mdp: MDP):
        self.mdp = mdp

    def train(self, start_x: int, start_y: int, max_steps: int, num_runs: int, discount: int = 0):
        """Print the names of the actions available in the starting state.

        ``max_steps``, ``num_runs`` and ``discount`` are accepted but not used
        by the current implementation.
        """
        start_state = self.mdp.get_state(start_x, start_y)
        action_names = list(map(str, start_state.actions))
        print(action_names)

### World Map

- 7x7 grid

In [34]:
# Grid map, one inner list per row. Cells holding -1 are walls: no action is
# generated that moves into them. The 1 in the top-left corner is presumably
# the goal marker (it coincides with the only positive reward) - TODO confirm.
world = [
    [1, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0],
    [-1, -1, -1, -1, -1, -1, 0],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0],
]

# Every cell has reward -1 except the top-left cell, which has reward 20.
rewards = np.full(shape=(7, 7), fill_value=-1)
rewards[0, 0] = 20

states = get_states_from_world(world)
mdp = MDP(states, rewards)

# Start in the bottom-left corner (x=0, y=6).
agent = Agent(mdp)
agent.train(0, 6, max_steps=50, num_runs=20)

['UP', 'RIGHT']
