In [1]:
from IPython.display import display, Image, SVG, Math, YouTubeVideo

In [2]:
Image(url='https://cdn-media-1.freecodecamp.org/images/3JXI06jyHegMS1Yx8rhIq64gkYwSTM7ZhD25')

* The robot loses 1 point at each step. This is done so that the robot takes the shortest path and reaches the goal as fast as possible.
* If the robot steps on a mine, the point loss is 100 and the game ends.
* If the robot gets power ⚡️, it gains 1 point.
* If the robot reaches the end goal, the robot gets 100 points.

In [3]:
import numpy as np
import pandas as pd
import random as rand
from collections import OrderedDict

# Rewards matrix

In [4]:
R = np.matrix(np.ones([5,6]))
R *= -1
R

matrix([[-1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1.],
        [-1., -1., -1., -1., -1., -1.]])

In [5]:
R[0,2] = 1
R[1,1] = -100
R[1,4] = -100
R[2,2] = 1
R[2,5] = 1
R[3,0] = -100
R[3,3] = -100
R[4,1] = 1
R[4,4] = 100
R

matrix([[  -1.,   -1.,    1.,   -1.,   -1.,   -1.],
        [  -1., -100.,   -1.,   -1., -100.,   -1.],
        [  -1.,   -1.,    1.,   -1.,   -1.,    1.],
        [-100.,   -1.,   -1., -100.,   -1.,   -1.],
        [  -1.,    1.,   -1.,   -1.,  100.,   -1.]])

In [6]:
R.shape

(5, 6)

# Actions

In [7]:
A = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1)
}
A = OrderedDict(A)
A

OrderedDict([('up', (-1, 0)),
             ('down', (1, 0)),
             ('left', (0, -1)),
             ('right', (0, 1))])

# Q Matrix

In [8]:
Q = np.matrix(np.zeros([R.shape[0]*R.shape[1], len(A)]))
Q

matrix([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [9]:
Q.shape

(30, 4)

## Go to (row, column) expected reward

In [10]:
def Q_idx(coords=(0,0)):
    return Q[coords[0]*R.shape[1] + coords[1]]

In [11]:
Q_idx((0,0))

matrix([[0., 0., 0., 0.]])

In [12]:
Q_idx((4,5))

matrix([[0., 0., 0., 0.]])

# Learning parameter (gamma)
* between 0 and 1
* closer to 0: future rewards are not that important
* closer to 1: future rewards are very important

In [13]:
gamma = 0.8

# Available actions

In [14]:
def available_actions(state):
    actions = []
    for (key, value) in A.items():
        coords = (state[0] + value[0], state[1] + value[1])
        if coords in map(lambda r: r[0], np.ndenumerate(R)):
            actions.append(key)
    return actions

In [15]:
available_actions((0,0))

['down', 'right']

In [16]:
available_actions((4,5))

['up', 'left']

## Sample random action

In [17]:
def sample_next_action(available_actions_range):
    return rand.choice(available_actions_range)

In [18]:
actions_range = available_actions((4,5))
sample_next_action(actions_range)

'up'

In [19]:
def make_step(state, diff):
    return (state[0] + diff[0], state[1] + diff[1])

In [20]:
def index_of_action(act):
    return list(A.keys()).index(act)

In [21]:
def actions_index(idx):
    return list(A.keys())[idx]

# Initial state

In [22]:
initial_state = (0,0)

In [23]:
actions = available_actions(initial_state)
actions

['down', 'right']

In [24]:
new_action = sample_next_action(actions)
new_action

'right'

## Update

In [25]:
def update(current_state, action, gamma):
    print('current state: ', current_state)
    print('action: ', action)
    diff = A[action]
    new_state = make_step(current_state, diff)
    future_rewards = Q_idx(new_state)
    print('future rewards: ',future_rewards)
    
    max_index = np.where(future_rewards == np.max(future_rewards))[1]
    print('max_index', max_index)
  
    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size = 1))
    else:
        max_index = int(max_index)
    
    max_value = future_rewards[0,max_index]
    print('max_value', max_value)
  
    Q_idx(current_state)[0,index_of_action(action)] = R[current_state] + gamma * max_value
  
    if (np.max(Q) > 0):
        return (np.sum(Q/np.max(Q)*100)), new_state
    else:
        return (0), new_state

In [26]:
update(initial_state, new_action, gamma)

current state:  (0, 0)
action:  right
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0


(0, (0, 1))

# Training

In [27]:
epsilon = 0
iterations = 1000
max_steps = 50

for i in range(iterations):
    state = initial_state

    epochs, penalties, reward = 0, 0, 0
    done = False
    
    steps = 0
    while not done:
        possible_actions = available_actions(state)
        if rand.uniform(0, 1) < epsilon:
            action = sample_next_action(possible_actions) # Explore action space
        else:
            action_idx = np.argmax(Q_idx(state)) # Exploit learned values
            action = actions_index(action_idx)
            if action not in possible_actions:
                action = sample_next_action(possible_actions)

        new_Q, new_state = update(state, action, gamma)
        
        state = new_state
        steps += 1
        
        # check if it's done
        if state == (4,4):
            done = True
        if steps == max_steps:
            done = True
    
    epsilon = i/iterations

print("Training finished.\n")


current state:  (0, 0)
action:  right
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (1, 1)
action:  up
future rewards:  [[ 0. -1.  0.  0.]]
max_index [0 2 3]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.    0.    0.    0.]]
max_index [1 2 3]
max_value 0.0
current state:  (1, 1)
action:  down
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (2, 1)
action:  up
future rewards:  [[-100. -100.    0.    0.]]
max_index [2 3]
max_value 0.0
current state:  (1, 1)
action:  left
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0.  0.  0. -1.]]
max_index [0 1 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[ 0. -1.  0.  0.]]
max_index [0 2 3]
max_value 0.0
current state:  (0, 1)
action:  left
future 

current state:  (0, 5)
action:  down
future rewards:  [[-1.  -0.2 -1.   0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[ 0. -1. -1. -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[ 0. -1. -1. -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0

future rewards:  [[-1.   -1.    0.   -1.16]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.   -1.    0.   -1.16]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100. -100.    0. -100.]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1. -1. -1. -1.]]
max_index [0 1 2 3]
max_value -1.0
current state:  (3, 1)
action:  up
future rewards:  [[-1.    -1.  

current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
fu

current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future reward

future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 0. 1. 1.]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100. -100. -

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100. -100. -100. -100.]]
max_index [0 1 2 3]
max_value -100.0
c

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.    ]]
max_index [0 2 3]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
curren

current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.         -100.            0.         -100.43932035]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-1.    -1.    -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rew

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-1.    -1.    -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84 0.84 0.   0.  ]]
max_index [0 1]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2

future rewards:  [[0.84 0.84 0.   0.  ]]
max_index [0 1]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current s

max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0

current state:  (2, 0)
action:  right
future rewards:  [[-1.    -1.    -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84 0.84 0.   0.  ]]
max_index [0 1]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
m

current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.    ]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.5555556 0.        0.        1.       ]]
max_index [3]
max_value 1.0
current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.   0.]]
max_index [1 3]
max_value 0.0
current state:  (4, 3)
action:  up
future rewards:  [[-100.     -100.     -100.2624 -100.    ]]
max_index [0 1 3]
max_value -100.0
current state:  (3, 3)
action:  up
future rewards:  [[-1.    -1.    -0.328  0.   ]]
max_index [3]
max_value 0.0
current state:  (2, 3)
action:  right
future rewards:  [[-81.   0.   0.   0.]]
max_index [1 2 3]
max_value 0.0
current state:  (2, 4)
action:  down
f

max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-1.    -1.    -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.     0.    ]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
a

action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.    ]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.5555556 0.        1.        1.       ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.5555556 0.        1.        1.       ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-1.    -1.    -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
fut

future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current sta

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  u

future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-1.    -1.16  -1.    -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.     0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[-0.328      -1.         -1.44442025 -1.        ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.     0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
futur

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:

max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]

current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_in

current state:  (4, 0)
action:  right
future rewards:  [[0.5555556 0.        1.        1.       ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.5555556 0.        1.        1.       ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.5555556 0.        1.        1.       ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
fut

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index

max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0. 

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
curr

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2  -1.  -81.   -1.8]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
curre

current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  

max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [

max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0


current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  left
future rewards:  [[-100.   -100

current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.

max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[ 

current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_v

max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rew

future rewards:  [[-100.  -100.  -100.8 -100. ]]
max_index [0 1 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.  -100.  -100.8 -100. ]]
max_index [0 1 3]
max_value -100.0
current state:  (1, 4)
action:  down
future rewards:  [[-81.      -1.      -1.2624   0.    ]]
max_index [3]
max_value 0.0
current state:  (2, 4)
action:  right
future rewards:  [[1. 1. 1. 0.]]
max_index [0 1 2]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.  -100.  -100.8 -100. ]]
max_index [0 1 3]
max_value -100.0
current state:  (1, 4)
action

current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 

max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards: 

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[-1.     -1.2624 -1.16   -1.    ]]
max_index [0 3]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[-1.     -1.2624 -1.16   -1.    ]]
max_index [0 3]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[-1.     -1.2624 -1.16   -1.    ]]
max_index [0 3]
max_value -1.0
current sta

max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
acti

future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.

max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  left
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[-0.328 -1.    -1.16  -1.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0. 

current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up


current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future 

future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
acti

max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
act

max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
futur

max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1. 

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state

current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_va

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)


current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_v

max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future 

max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_i

future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current sta

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 

max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.

action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0,

future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up


current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.

current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current stat

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
cur

current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
f

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.  

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current stat

max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  left
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_i

current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)

action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.

future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
futur

current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  left
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]


current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.  

current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_

action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
curren

max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
ac

current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -

max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards: 

max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100. 

max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)

max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
f

current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999

current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
cur

current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  left
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (

current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
curren

max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future r

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0

max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
futur

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     

max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0

current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_

max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future reward

current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
m

max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[

current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
fu

future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  left
future rewards:  [[-81.      -1.8     -1.2624  -0.2   ]]
max_index [3]
max_value -0.19999999999999996
current state:  (2, 4)
action:  up
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  right
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -

max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  righ

current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewar

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.

max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  d

current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  right
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
fu

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 

action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.

current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_v

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current st

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
c

current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  left
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  right
future rewards:  [[-81.      -1.8     -1.2624  -0.2   ]]
max_index [3]
max_value -0.19999999999999996
current state:  (2, 4)
action:  down
future rewards:  [[ -1.16 

action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  down
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  down
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  down
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  right
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_v

action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  do

max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  up
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
curr

future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future reward

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_v

current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state: 

max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_

current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
f

current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  left
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
cu

max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future

future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1

max_index [1]
max_value -100.0
current state:  (3, 3)
action:  down
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  up
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  up
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  down
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  right
future rewards:  [[ -1.16  -1.   -81.    -1.  ]]
max_index [1 3]
max_value -1.0
current state:  (3, 4)
action:  down
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current st

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0

current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
curren

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewar

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  left
future rewards:  [[-81.      

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current st

max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future 

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[

future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rew

action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1

current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  right
future rew

current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
f

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_va

max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  down
fut

current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624

current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  down
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  left
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 0)

current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 

max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  right
futur

future rewards:  [[-81.      -1.8     -1.2624  -0.2   ]]
max_index [3]
max_value -0.19999999999999996
current state:  (2, 4)
action:  down
future rewards:  [[ -1.16  -1.   -81.    -1.  ]]
max_index [1 3]
max_value -1.0
current state:  (3, 4)
action:  left
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  right
future rewards:  [[ -1.16  -1.   -81.    -1.  ]]
max_index [1 3]
max_value -1.0
current state:  (3, 4)
action:  down
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
m

max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards: 

future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 

future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.   

max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]

max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
futur

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  right
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
ac

current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.  

current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0.

current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
actio

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.  

current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.     

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
c

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state

future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
curre

future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84

future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  up
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action: 

future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  r

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state: 

max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_ind

current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current sta

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  left
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  left
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0

current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
ma

max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  right
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
futu

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [

max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]

current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  up
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)


current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  right
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  right
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1. 

max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
cu

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
cu

future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  left
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 

current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future 

current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
ac

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_va

future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999

max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
act

max_index [0]
max_value -1.0
current state:  (1, 3)
action:  right
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current

current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  down
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_inde

current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
curre

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  

current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  down
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  up
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  down
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
curr

future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [

current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
curre

current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  right
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  down
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_valu

action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  right
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (

max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000

action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16 

max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
c

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      

current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  left
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  up
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
ac

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  r

max_index [0 2]
max_value -100.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  up
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
curren

max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  up
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. 

current state:  (4, 1)
action:  right
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  up
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
cur

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  left
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right


current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  up
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_val

current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_v

current state:  (0, 3)
action:  down
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 

max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  right
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  right
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
f

max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  down
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  right
future rewards:  [[-81.      -1.8     -1.2624  -0.2   ]]
max_index [3]
max_va

future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
current state:  (4, 3)
action:  right
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  left
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  

current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
ac

future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  down
future rewards:  [[-0.2 -1.  -1.8  0. ]]
max_index [3]
max_value 0.0
current state:  (3, 5)
action:  down
future rewards:  [[-1.  0. -1.  0.]]
max_index [1 3]
max_value 0.0
current state:  (4, 5)
action:  left
future rewards:  [[0. 0. 0. 0.]]
max_index [0 1 2 3]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards

current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  down
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  up
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000

future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  up
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  right
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  left
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  left
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  right
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  up
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  left
future rewards:  

future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  down
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  up
future rewa

current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_

max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  

max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  down
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  left
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  down
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  up
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)
action:  right
future rewards:  [[ -

future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  right
future rewards:  [[ 0.  

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  down
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  up
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_in

max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  left
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  left
future rewards:  [[ -1.      -1.2624  -1.16   -81.    ]]
max_index [0]
max_value -1.0
current state:  (1, 3)
action:  left
future rewards:  [[ -0.2    -0.328 -81.     -1.8 

current state:  (0, 2)
action:  right
future rewards:  [[ 0.  -1.8 -0.2 -1. ]]
max_index [0]
max_value 0.0
current state:  (0, 3)
action:  left
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1,

current state:  (1, 5)
action:  up
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  right
future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  left
future rewards:  [[  0. -81.  -1.  -1.]]
max_index [0]
max_value 0.0
current state:  (0, 4)
action:  down
future rewards:  [[-100.   -100.16 -100.8  -100.  ]]
max_index [0 3]
max_value -100.0
current state:  (1, 4)
action:  right
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  up
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)


future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  right
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (2, 2)
action:  right
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  down
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  up
future rewards:  [[ -1.8   -81.     -0.328  -1.16 ]]
max_index [2]
max_value -0.32799999999999985
current state:  (2, 3)
action:  left
future rewards:  [[0.84   0.7376 0.7376 0.7376]]
max_index [0]
max_value 0.8400000000000001
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
cur

current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  left
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)

future rewards:  [[ 0. -1. -1.  0.]]
max_index [0 3]
max_value 0.0
current state:  (0, 5)
action:  down
future rewards:  [[ -1.   -0.2 -81.    0. ]]
max_index [3]
max_value 0.0
current state:  (1, 5)
action:  down
future rewards:  [[1.   1.   0.84 0.  ]]
max_index [0 1]
max_value 1.0
current state:  (2, 5)
action:  down
future rewards:  [[-0.2 -1.  -1.8  0. ]]
max_index [3]
max_value 0.0
current state:  (3, 5)
action:  left
future rewards:  [[ -1.16  -1.   -81.    -1.  ]]
max_index [1 3]
max_value -1.0
current state:  (3, 4)
action:  left
future rewards:  [[-100.2624 -100.     -100.2624 -100.8   ]]
max_index [1]
max_value -100.0
current state:  (3, 3)
action:  left
future rewards:  [[ -0.328  -1.     -1.16  -81.   ]]
max_index [0]
max_value -0.32799999999999985
current state:  (3, 2)
action:  down
future rewards:  [[-1.2624  0.     -0.2    -1.    ]]
max_index [1]
max_value 0.0
current state:  (4, 2)
action:  right
future rewards:  [[-81.   0.  -1.  -1.]]
max_index [1]
max_value 0.0
cur

future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.

future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  right
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  down
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1

max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  down
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  right
future rewards:  [[-1.2624 -0.2    -1.     -1.2624]]
max_index [1]
max_value -0.19999999999999996
current state:  (3, 1)


future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  left
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 2)
action:  left
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 

future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  up
future rewards:  [[ 0. -1.  0. -1.]]
max_index [0 2]
max_value 0.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  right
future rewards:  [[-81.     -1.16   -1.     -0.328]]
max_index [3]
max_value -0.32799999999999985
current state:  (2, 1)
action:  up
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  right
future rewards:  [[ -0.2    -0.328 -81.     -1.8  ]]
max_index [0]
max_value -0.19999999999999996
current state:  (1, 2)
action:  left
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]

future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  right
future rewards:  [[0.84 0.   1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (4, 1)
action:  left
future rewards:  [[-1.   0.   0.  -0.2]]
max_index [1 2]
max_value 0.0
current state:  (4, 0)
action:  up
future rewards:  [[-100.   -100.      0.   -100.16]]
max_index [2]
max_value 0.0
current state:  (3, 0)
action:  down
future rewards:  

current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  down
future rewards:  [[-1.     -1.      0.     -1.2624]]
max_index [2]
max_value 0.0
current state:  (2, 0)
action:  up
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  up
future rewards:  [[  0.  -81.   -1.   -0.2]]
max_index [0]
max_value 0.0
current state:  (0, 1)
action:  right
future rewards:  [[0.   0.84 1.   1.  ]]
max_index [2 3]
max_value 1.0
current state:  (0, 0)
action:  down
future rewards:  [[ -1.  -1.   0. -81.]]
max_index [2]
max_value 0.0
current state:  (1, 0)
action:  right
future rewards:  [[-100.     -100.2624 -100.     -100.16  ]]
max_index [0 2]
max_value -100.0
current state:  (1, 1)
action:  down
future rewards:  [[-81.     -1.16   -1.     -0.3

KeyboardInterrupt: 

In [None]:
Q