In [1]:
import numpy as np
import gym 
import gym_gridworlds

In [47]:
import numpy as np

# global variables
BOARD_ROWS = 5
BOARD_COLS = 5
WIN_STATE = (4, 4)
LOSE_STATE = (3, 3)
START = (0, 0)
DETERMINISTIC = True
WIN_REWARD = 1
LOSE_REWARD = -1
TRANSITION_REWARD = -0.05
CONVERGENCE_FACTOR = 0.01
DISCOUNT_FACTOR = 0.9
class GridWorld:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC
        
    def giveReward(self):
        if self.state == WIN_STATE:
            return WIN_REWARD
        elif self.state == LOSE_STATE:
            return LOSE_REWARD
        else:
            return TRANSITION_REWARD

    def get_reward_table(self):
        # Inicializa uma tabela de recompensas com zeros para todos os estados
        reward_table = np.zeros([BOARD_ROWS, BOARD_COLS])
        # Percorre todas as linhas e colunas do grid dando as recompensas
        for r in range(BOARD_ROWS):
            for c in range(BOARD_COLS):
                state_tuple = (r,c)
                if state_tuple == WIN_STATE:
                    reward_table[r][c] = WIN_REWARD
                elif state_tuple == LOSE_STATE:
                    reward_table[r][c] = LOSE_REWARD
                else:
                    reward_table[r][c] = TRANSITION_REWARD
        return reward_table

    def isEndFunc(self):
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self,state, action):
        """
        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (state[0] - 1, state[1])
            elif action == "down":
                nxtState = (state[0] + 1, state[1])
            elif action == "left":
                nxtState = (state[0], state[1] - 1)
            else:
                nxtState = (state[0], state[1] + 1)
            # if next state legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS -1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS -1)):
                    if nxtState != (1, 1):
                        return nxtState
            return state

    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = '*'
                if self.board[i, j] == -1:
                    token = 'z'
                if self.board[i, j] == 0:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = GridWorld()
        self.lr = 0.2
        self.exp_rate = 0.3
        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0

    def takeAction(self, action):
        position = self.State.nxtPosition(GridWorld.state,action)
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.State = GridWorld()

    def optimizeMDP(self, reward_table):
        values = np.zeros([BOARD_ROWS, BOARD_COLS])
        policy = [["" for _ in range(BOARD_COLS)] for _ in range(BOARD_ROWS)]
        converge = False
        while not converge:
            delta = 0
            for r in range(BOARD_ROWS):
                for c in range(BOARD_COLS):
                    temp = values[r][c]
                    max_value = float('-inf')

                    for a in self.actions:
                        next_row , next_col = self.State.nxtPosition((r,c), a)
                        max_value = max(max_value,values[next_row][next_col])
                        if(values[next_row][next_col] == max_value):
                            policy[r][c] = a 

                    values[r][c] = reward_table[r][c] + DISCOUNT_FACTOR*max_value
                    delta = max(delta, abs(temp - values[r][c]))
                if delta < CONVERGENCE_FACTOR:
                    converge = True
            self.state_values = values
        return values , policy                 
        

if __name__ == "__main__":
    grid = GridWorld()
    agente = Agent()
    table = grid.get_reward_table()
    print('rewards')
    print( table)
    print('-------------')
    print('valores')
    print(agente.optimizeMDP(table)[0])
    print('-------------')
    print('acao da politica')
    for i in range(BOARD_ROWS):
        print(agente.optimizeMDP(table)[1][i] )

rewards
[[ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 -1.   0.1]
 [ 0.1  0.1  0.1  0.1  1. ]]
-------------
valores
[[4.79189612 5.22236333 5.70066023 6.23210123 6.82259123]
 [5.22236333 5.70066023 6.23210123 6.82259123 7.47869123]
 [5.70066023 6.23210123 6.82259123 7.47869123 8.20769123]
 [6.23210123 6.82259123 7.47869123 7.10769123 9.01769123]
 [6.82259123 7.47869123 8.20769123 9.01769123 9.91769123]]
-------------
acao da politica
['right', 'right', 'right', 'right', 'down']
['down', 'right', 'right', 'right', 'down']
['right', 'right', 'right', 'right', 'down']
['right', 'right', 'down', 'right', 'down']
['right', 'right', 'right', 'right', 'right']


In [54]:
BOARD_ROWS = 5
BOARD_COLS = 5
WIN_STATE = (4, 4)
LOSE_STATE = (3, 3)
START = (0, 0)
DETERMINISTIC = True
WIN_REWARD = 1
LOSE_REWARD = -1
TRANSITION_REWARD = 0.1
CONVERGENCE_FACTOR = 0.01
DISCOUNT_FACTOR = 0.5
if __name__ == "__main__":
    grid = GridWorld()
    agente = Agent()
    table = grid.get_reward_table()
    print('rewards')
    print( table)
    print('-------------')
    print('valores')
    print(agente.optimizeMDP(table)[0])
    print('-------------')
    print('acao da politica')
    for i in range(BOARD_ROWS):
        print(agente.optimizeMDP(table)[1][i] )

rewards
[[ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 -1.   0.1]
 [ 0.1  0.1  0.1  0.1  1. ]]
-------------
valores
[[ 0.1984375   0.19921875  0.19960938  0.19980469  0.19990234]
 [ 0.19921875  0.19960938  0.19980469  0.19990234  0.31245117]
 [ 0.19960938  0.19980469  0.19990234  0.31245117  0.53745117]
 [ 0.19980469  0.19990234  0.31245117 -0.56254883  0.98745117]
 [ 0.19990234  0.31245117  0.53745117  0.98745117  1.88745117]]
-------------
acao da politica
['right', 'right', 'right', 'right', 'left']
['down', 'right', 'right', 'right', 'down']
['right', 'right', 'right', 'right', 'down']
['right', 'right', 'down', 'right', 'down']
['right', 'right', 'right', 'right', 'right']


In [55]:
BOARD_ROWS = 5
BOARD_COLS = 5
WIN_STATE = (4, 4)
LOSE_STATE = (3, 3)
START = (0, 0)
DETERMINISTIC = True
WIN_REWARD = 1
LOSE_REWARD = -1
TRANSITION_REWARD = 0.1
CONVERGENCE_FACTOR = 0.01
DISCOUNT_FACTOR = 0.6
if __name__ == "__main__":
    grid = GridWorld()
    agente = Agent()
    table = grid.get_reward_table()
    print('rewards')
    print( table)
    print('-------------')
    print('valores')
    print(agente.optimizeMDP(table)[0])
    print('-------------')
    print('acao da politica')
    for i in range(BOARD_ROWS):
        print(agente.optimizeMDP(table)[1][i] )

rewards
[[ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 -1.   0.1]
 [ 0.1  0.1  0.1  0.1  1. ]]
-------------
valores
[[ 0.27416108  0.29935532  0.34134572  0.41132972  0.52796972]
 [ 0.29935532  0.34134572  0.41132972  0.52796972  0.72236972]
 [ 0.34134572  0.41132972  0.52796972  0.72236972  1.04636972]
 [ 0.41132972  0.52796972  0.72236972 -0.05363028  1.58636972]
 [ 0.52796972  0.72236972  1.04636972  1.58636972  2.48636972]]
-------------
acao da politica
['right', 'right', 'right', 'right', 'down']
['down', 'right', 'right', 'right', 'down']
['right', 'right', 'right', 'right', 'down']
['right', 'right', 'down', 'right', 'down']
['right', 'right', 'right', 'right', 'right']


achei interessante esse pq se vc coloca pra ele ter fator de desconto 1 e valor de transicao maior q 0 ele ve muito no futuro e acaba rodando pra sempre


In [56]:

BOARD_ROWS = 5
BOARD_COLS = 5
WIN_STATE = (4, 4)
LOSE_STATE = (3, 3)
START = (0, 0)
DETERMINISTIC = True
WIN_REWARD = 1
LOSE_REWARD = -1
TRANSITION_REWARD = 0.1
CONVERGENCE_FACTOR = 0.01
DISCOUNT_FACTOR = 1
if __name__ == "__main__":
    grid = GridWorld()
    agente = Agent()
    table = grid.get_reward_table()
    print('rewards')
    print( table)
    print('-------------')
    print('valores')
    print(agente.optimizeMDP(table)[0])
    print('-------------')
    print('acao da politica')
    for i in range(BOARD_ROWS):
        print(agente.optimizeMDP(table)[1][i] )

rewards
[[ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 -1.   0.1]
 [ 0.1  0.1  0.1  0.1  1. ]]
-------------
valores
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\guilh\anaconda3\envs\columbia_ai\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\guilh\AppData\Local\Temp\ipykernel_24220\969363149.py", line 20, in <module>
    print(agente.optimizeMDP(table)[0])
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\guilh\AppData\Local\Temp\ipykernel_24220\1804627987.py", line None, in optimizeMDP
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\guilh\anaconda3\envs\columbia_ai\Lib\site-packages\IPython\core\interactiveshell.py", line 2120, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\guilh\anaconda3\envs\columbia_ai\Lib\site-packages\IPython\core\ultratb.py", line 1435, in structured_traceback
    return FormattedTB.structu