In [2]:
import numpy as np
from gridworld2 import GridWorld

In [3]:
def softmax(x):
    '''Turn an array of scores into a probability distribution.

    The largest entry is subtracted from every score before exponentiating
    so that the exponentials cannot overflow; the shift cancels out in the
    ratio and leaves the result unchanged. Normalisation is over all
    elements of x.

    @param x the input array
    @return the softmax array (same shape as x, entries sum to 1)
    '''
    shifted_exp = np.exp(x - np.max(x))
    return shifted_exp / np.sum(shifted_exp)

def update_critic(utility_matrix, observation, new_observation,
                   reward, alpha, gamma, done):
    '''Return the updated utility matrix

    Applies one temporal-difference update to the utility of the state
    observed at t:

        delta = reward + gamma * U(s_t+1) - U(s_t)
        U(s_t) += alpha * delta

    When the transition ends the episode (done is true) there is no
    successor to bootstrap from, so U(s_t+1) is taken to be 0 regardless
    of what the matrix currently stores for that cell.

    The matrix is modified in place; the same object is returned.

    @param utility_matrix the matrix before the update
    @param observation the state observed at t, as (row, col)
    @param new_observation the state observed at t+1, as (row, col)
    @param reward the reward observed after the action
    @param alpha the step size (learning rate)
    @param gamma the discount factor
    @param done True if new_observation ended the episode
    @return the updated utility matrix
    @return the estimation error delta
    '''
    u = utility_matrix[observation[0], observation[1]]
    if done:
        # Terminal transition: nothing to bootstrap from.
        u_t1 = 0.0
    else:
        u_t1 = utility_matrix[new_observation[0], new_observation[1]]
    delta = reward + ((gamma * u_t1) - u)
    utility_matrix[observation[0], observation[1]] += alpha * delta
    return utility_matrix, delta


def update_actor(state_action_matrix, observation, action, delta,
                 beta_matrix=None, n_cols=4):
    '''Return the updated state-action matrix

    Adds beta * delta to the preference of the action taken in the observed
    state. The matrix is modified in place; the same object is returned.

    The (row, col) observation is flattened row-major into a column index
    of the state-action matrix: col = observation[1] + observation[0] * n_cols.

    @param state_action_matrix the matrix before the update (actions x states)
    @param observation the state observed at t, as (row, col)
    @param action taken at time t
    @param delta the estimation error returned by the critic
    @param beta_matrix a visit counter for each state-action pair; when
           given, the step is scaled by 1 / count. A count below 1 (a pair
           whose counter was never incremented) is treated as 1 so the
           update never divides by zero.
    @param n_cols number of columns of the grid (default 4)
    @return the updated matrix
    '''
    col = observation[1] + (observation[0] * n_cols)
    if beta_matrix is None:
        beta = 1
    else:
        # Clamp the counter so an unvisited pair cannot produce inf/nan.
        visits = np.maximum(beta_matrix[action, col], 1)
        beta = 1.0 / visits
    state_action_matrix[action, col] += beta * delta
    return state_action_matrix


In [4]:
def main():
    '''Train a tabular actor-critic agent on a 3x4 GridWorld, printing progress.

    Sets up the environment matrices, then runs tot_epoch episodes. At each
    step an action is sampled from the softmax of the current state's
    action preferences, the critic updates the utility estimate, and the
    actor updates the preference of the action taken using the critic's
    error. The utility and state-action matrices are printed every
    print_epoch episodes and once more at the end.
    '''

    env = GridWorld(3, 4)

    #State matrix: 1 at (0,3) and (1,3), -1 at (1,1), 0 elsewhere
    #(presumably terminal / obstacle markers -- confirm against GridWorld)
    state_matrix = np.array([[0.0,  0.0, 0.0, 1.0],
                             [0.0, -1.0, 0.0, 1.0],
                             [0.0,  0.0, 0.0, 0.0]])
    print("State Matrix:")
    print(state_matrix)

    #Reward matrix: -0.04 everywhere except +1 at (0,3) and -1 at (1,3)
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3], reward_matrix[1, 3] = 1, -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Transition matrix handed to the environment (each row sums to 1)
    transition_matrix = np.array([
        [0.8, 0.1, 0.0, 0.1],
        [0.1, 0.8, 0.1, 0.0],
        [0.0, 0.1, 0.8, 0.1],
        [0.1, 0.0, 0.1, 0.8],
    ])

    #Random initial action preferences: one row per action, one column per cell
    state_action_matrix = np.random.random((4,12))
    print("State-Action Matrix:")
    print(state_action_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    print("Utility Matrix:")
    print(utility_matrix)

    gamma = 0.999 #discount factor
    alpha = 0.001 #constant step size
    #Visit counter per state-action pair; currently unused because
    #update_actor is called with beta_matrix=None (see below)
    beta_matrix = np.zeros((4,12))
    tot_epoch = 300000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Every episode begins from the observation returned by reset
        observation = env.reset(exploring_starts=True)
        for _ in range(1000):
            #Sample an action from the softmax of this state's preferences
            state_col = observation[1] + (observation[0]*4)
            action_probs = softmax(state_action_matrix[:, state_col])
            action = np.random.choice(4, 1, p=action_probs)
            #To enable the beta parameter, uncomment the line below
            #and pass beta_matrix=beta_matrix to update_actor
            #beta_matrix[action,state_col] += 1 #increment the counter
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            utility_matrix, delta = update_critic(
                utility_matrix, observation, new_observation,
                reward, alpha, gamma, done)
            state_action_matrix = update_actor(
                state_action_matrix, observation, action, delta,
                beta_matrix=None)
            observation = new_observation
            if done:
                break

        #Only report every print_epoch episodes
        if epoch % print_epoch != 0:
            continue
        epoch_label = str(epoch+1)
        print("")
        print("Utility matrix after " + epoch_label + " iterations:")
        print(utility_matrix)
        print("")
        print("State-Action matrix after " + epoch_label + " iterations:")
        print(state_action_matrix)

    #Final report once all episodes have run
    total_label = str(tot_epoch)
    print("Utility matrix after " + total_label + " iterations:")
    print(utility_matrix)
    print("State-Action matrix after  " + total_label + " iterations:")
    print(state_action_matrix)
    print(reward_matrix)


# Entry point: start training only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


State Matrix:
[[ 0.  0.  0.  1.]
 [ 0. -1.  0.  1.]
 [ 0.  0.  0.  0.]]
Reward Matrix:
[[-0.04 -0.04 -0.04  1.  ]
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -0.04]]
State-Action Matrix:
[[1.01925527e-02 2.75677419e-01 5.50133505e-01 7.64921528e-01
  3.51923360e-01 7.88989541e-01 4.59938001e-02 3.52811924e-01
  2.61924822e-01 1.48086380e-01 7.66545268e-01 6.13981639e-04]
 [1.27632306e-01 3.11294574e-01 6.95330546e-01 5.56913177e-01
  2.26717624e-01 2.44273485e-01 5.97629053e-01 1.24999864e-01
  3.91020962e-01 8.81927471e-01 1.83758989e-01 5.68276208e-01]
 [9.92747000e-01 4.34833444e-01 8.53157282e-01 6.65099088e-01
  9.23555899e-01 9.38548488e-01 1.78410079e-02 9.57704579e-01
  3.77255312e-01 6.83830476e-02 3.87696180e-02 4.70326987e-01]
 [9.85898937e-01 1.35260209e-02 1.64085452e-01 2.93578902e-01
  4.42110128e-01 1.78042182e-01 3.39697302e-01 1.14465443e-01
  8.80036597e-01 3.22861870e-01 8.14224430e-01 9.90196773e-01]]
Utility Matrix:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Utility matrix after 7001 iterations:
[[ 0.47498302  0.80168144  0.94450752  0.        ]
 [ 0.14915402  0.          0.64319407  0.        ]
 [-0.05986315  0.0476505   0.34372238 -0.05334309]]

State-Action matrix after 7001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   1.97365458e+02  7.88989541e-01  6.73921249e+02  3.52811924e-01
   8.61308854e+00 -2.00779484e+01  3.85059040e+02 -3.73458675e+01]
 [ 4.96838668e+02  8.05488013e+02  9.44593415e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  1.09344050e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 


Utility matrix after 15001 iterations:
[[0.77256469 0.90180334 0.95805549 0.        ]
 [0.58281693 0.         0.70525282 0.        ]
 [0.1741631  0.33669011 0.55084972 0.16526764]]

State-Action matrix after 15001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   6.31028374e+02  7.88989541e-01  7.35980005e+02  3.52811924e-01
   2.42639336e+02 -2.00779484e+01  5.92186383e+02 -3.73458675e+01]
 [ 7.94420339e+02  9.05609912e+02  9.58141389e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  3.98383656e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 24001 iterations:
[[0.83478855 0.90297544 0.95757971 0.        ]
 [0.75253116 0.         0.71683494 0.        ]
 [0.46438258 0.47495685 0.5863307  0.29298326]]

State-Action matrix after 24001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.00742598e+02  7.88989541e-01  7.47562123e+02  3.52811924e-01
   5.32858820e+02 -2.00779484e+01  6.27667365e+02 -3.73458675e+01]
 [ 8.56644201e+02  9.06782013e+02  9.57665614e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  5.36650405e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 33001 iterations:
[[0.84338747 0.90538828 0.95470673 0.        ]
 [0.78615039 0.         0.68398075 0.        ]
 [0.61676023 0.52912124 0.60023433 0.35212757]]

State-Action matrix after 33001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.34361832e+02  7.88989541e-01  7.14707932e+02  3.52811924e-01
   6.85236465e+02 -2.00779484e+01  6.41570989e+02 -3.73458675e+01]
 [ 8.65243114e+02  9.09194851e+02  9.54792631e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  5.90814794e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 42001 iterations:
[[0.84659926 0.90587918 0.95610797 0.        ]
 [0.79353115 0.         0.70917809 0.        ]
 [0.68306169 0.5510763  0.60710706 0.3894517 ]]

State-Action matrix after 42001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.41742589e+02  7.88989541e-01  7.39905275e+02  3.52811924e-01
   7.51537926e+02 -2.00779484e+01  6.48443723e+02 -3.73458675e+01]
 [ 8.68454905e+02  9.09685751e+02  9.56193870e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.12769855e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 51001 iterations:
[[0.84943051 0.9069169  0.95941156 0.        ]
 [0.79698488 0.         0.70043014 0.        ]
 [0.70826567 0.56078081 0.61841085 0.38377968]]

State-Action matrix after 51001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45196323e+02  7.88989541e-01  7.31157321e+02  3.52811924e-01
   7.76741911e+02 -2.00779484e+01  6.59747517e+02 -3.73458675e+01]
 [ 8.71286158e+02  9.10723478e+02  9.59497461e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.22474358e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 60001 iterations:
[[0.84572384 0.90364546 0.95592927 0.        ]
 [0.79662407 0.         0.68551409 0.        ]
 [0.72136079 0.55807993 0.6027508  0.3944693 ]]

State-Action matrix after 60001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.44835509e+02  7.88989541e-01  7.16241275e+02  3.52811924e-01
   7.89837032e+02 -2.00779484e+01  6.44087460e+02 -3.73458675e+01]
 [ 8.67579493e+02  9.07452035e+02  9.56015172e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.19773485e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 69001 iterations:
[[0.8465449  0.90295394 0.95344196 0.        ]
 [0.79442352 0.         0.65769052 0.        ]
 [0.72301589 0.54711537 0.58880942 0.39147863]]

State-Action matrix after 69001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.42634961e+02  7.88989541e-01  6.88417701e+02  3.52811924e-01
   7.91492124e+02 -2.00779484e+01  6.30146079e+02 -3.73458675e+01]
 [ 8.68400552e+02  9.06760518e+02  9.53527856e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.08808924e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 78001 iterations:
[[0.8485264  0.90415124 0.95233527 0.        ]
 [0.7957282  0.         0.70355407 0.        ]
 [0.72039986 0.55326989 0.61356926 0.38402543]]

State-Action matrix after 78001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.43939641e+02  7.88989541e-01  7.34281255e+02  3.52811924e-01
   7.88876096e+02 -2.00779484e+01  6.54905924e+02 -3.73458675e+01]
 [ 8.70382052e+02  9.07957812e+02  9.52421169e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.14963444e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 87001 iterations:
[[0.84871017 0.9072309  0.95893151 0.        ]
 [0.79696384 0.         0.70891978 0.        ]
 [0.72499795 0.56572968 0.6214308  0.41312992]]

State-Action matrix after 87001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45175281e+02  7.88989541e-01  7.39646960e+02  3.52811924e-01
   7.93474192e+02 -2.00779484e+01  6.62767463e+02 -3.73458675e+01]
 [ 8.70565818e+02  9.11037473e+02  9.59017409e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.27423231e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 96001 iterations:
[[0.850445   0.90351653 0.95054731 0.        ]
 [0.7996091  0.         0.67249883 0.        ]
 [0.72583863 0.56081584 0.60236374 0.39403554]]

State-Action matrix after 96001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47820542e+02  7.88989541e-01  7.03226006e+02  3.52811924e-01
   7.94314864e+02 -2.00779484e+01  6.43700399e+02 -3.73458675e+01]
 [ 8.72300649e+02  9.07323103e+02  9.50633213e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.22509395e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.2728664


Utility matrix after 104001 iterations:
[[0.84703883 0.90490934 0.95659111 0.        ]
 [0.79694628 0.         0.70797204 0.        ]
 [0.72836179 0.56108786 0.62130625 0.38265189]]

State-Action matrix after 104001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45157726e+02  7.88989541e-01  7.38699220e+02  3.52811924e-01
   7.96838030e+02 -2.00779484e+01  6.62642914e+02 -3.73458675e+01]
 [ 8.68894478e+02  9.08715909e+02  9.56677005e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.22781413e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 112001 iterations:
[[0.84853764 0.90782708 0.96131746 0.        ]
 [0.79845595 0.         0.71419497 0.        ]
 [0.72875951 0.57080607 0.62725909 0.41034386]]

State-Action matrix after 112001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46667392e+02  7.88989541e-01  7.44922150e+02  3.52811924e-01
   7.97235743e+02 -2.00779484e+01  6.68595756e+02 -3.73458675e+01]
 [ 8.70393284e+02  9.11633650e+02  9.61403358e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.32499617e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 120001 iterations:
[[0.84955865 0.90516154 0.95317508 0.        ]
 [0.7986772  0.         0.70062143 0.        ]
 [0.72972975 0.57114269 0.6196546  0.42158337]]

State-Action matrix after 120001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46888640e+02  7.88989541e-01  7.31348608e+02  3.52811924e-01
   7.98205992e+02 -2.00779484e+01  6.60991260e+02 -3.73458675e+01]
 [ 8.71414294e+02  9.08968117e+02  9.53260980e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.32836241e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 128001 iterations:
[[0.84649354 0.90268533 0.95512213 0.        ]
 [0.796499   0.         0.68615793 0.        ]
 [0.73103962 0.56409389 0.61090383 0.42113522]]

State-Action matrix after 128001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.44710445e+02  7.88989541e-01  7.16885111e+02  3.52811924e-01
   7.99515857e+02 -2.00779484e+01  6.52240491e+02 -3.73458675e+01]
 [ 8.68349193e+02  9.06491900e+02  9.55208033e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.25787443e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 136001 iterations:
[[0.84823254 0.90490908 0.95823062 0.        ]
 [0.79570733 0.         0.73379088 0.        ]
 [0.72549615 0.56256931 0.61926166 0.4026422 ]]

State-Action matrix after 136001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.43918776e+02  7.88989541e-01  7.64518056e+02  3.52811924e-01
   7.93972390e+02 -2.00779484e+01  6.60598323e+02 -3.73458675e+01]
 [ 8.70088193e+02  9.08715650e+02  9.58316516e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.24262863e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 144001 iterations:
[[0.85097703 0.90615313 0.95854425 0.        ]
 [0.79935764 0.         0.70185124 0.        ]
 [0.72593607 0.56920127 0.61836037 0.42689787]]

State-Action matrix after 144001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47569078e+02  7.88989541e-01  7.32578420e+02  3.52811924e-01
   7.94412308e+02 -2.00779484e+01  6.59697030e+02 -3.73458675e+01]
 [ 8.72832679e+02  9.09959703e+02  9.58630149e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.30894826e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 152001 iterations:
[[0.84876341 0.90723646 0.95847439 0.        ]
 [0.79871774 0.         0.70796854 0.        ]
 [0.72743254 0.57742585 0.62989909 0.42093767]]

State-Action matrix after 152001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46929184e+02  7.88989541e-01  7.38695721e+02  3.52811924e-01
   7.95908777e+02 -2.00779484e+01  6.71235748e+02 -3.73458675e+01]
 [ 8.70619061e+02  9.11043034e+02  9.58560290e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.39119397e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 160001 iterations:
[[0.84794715 0.90741519 0.95929987 0.        ]
 [0.79797454 0.         0.71995931 0.        ]
 [0.73026789 0.57479765 0.62704447 0.41350131]]

State-Action matrix after 160001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46185980e+02  7.88989541e-01  7.50686492e+02  3.52811924e-01
   7.98744126e+02 -2.00779484e+01  6.68381130e+02 -3.73458675e+01]
 [ 8.69802797e+02  9.11221760e+02  9.59385770e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.36491203e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 168001 iterations:
[[0.8471509  0.90524741 0.96133081 0.        ]
 [0.79768968 0.         0.7037715  0.        ]
 [0.73001088 0.56758889 0.61693121 0.39599924]]

State-Action matrix after 168001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45901119e+02  7.88989541e-01  7.34498680e+02  3.52811924e-01
   7.98487120e+02 -2.00779484e+01  6.58267873e+02 -3.73458675e+01]
 [ 8.69006545e+02  9.09053982e+02  9.61416708e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.29282436e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 176001 iterations:
[[0.84619421 0.90449276 0.95429889 0.        ]
 [0.79570074 0.         0.68716097 0.        ]
 [0.73006151 0.56560038 0.60913027 0.37550637]]

State-Action matrix after 176001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.43912184e+02  7.88989541e-01  7.17888149e+02  3.52811924e-01
   7.98537745e+02 -2.00779484e+01  6.50466935e+02 -3.73458675e+01]
 [ 8.68049855e+02  9.08299331e+02  9.54384789e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.27293928e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 184001 iterations:
[[0.84650111 0.90614051 0.96305675 0.        ]
 [0.79438337 0.         0.70652285 0.        ]
 [0.72406446 0.56288827 0.61585823 0.39131421]]

State-Action matrix after 184001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.42594811e+02  7.88989541e-01  7.37250026e+02  3.52811924e-01
   7.92540703e+02 -2.00779484e+01  6.57194894e+02 -3.73458675e+01]
 [ 8.68356754e+02  9.09947082e+02  9.63142654e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.24581821e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 192001 iterations:
[[0.84981288 0.90888715 0.95756961 0.        ]
 [0.79821606 0.         0.70960133 0.        ]
 [0.72550655 0.56883294 0.6238091  0.38867024]]

State-Action matrix after 192001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46427499e+02  7.88989541e-01  7.40328506e+02  3.52811924e-01
   7.93982792e+02 -2.00779484e+01  6.65145768e+02 -3.73458675e+01]
 [ 8.71668533e+02  9.12693725e+02  9.57655514e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.30526494e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 200001 iterations:
[[0.84952006 0.90755045 0.96029455 0.        ]
 [0.79822203 0.         0.68633045 0.        ]
 [0.72624851 0.56172314 0.60782016 0.38189299]]

State-Action matrix after 200001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46433471e+02  7.88989541e-01  7.17057634e+02  3.52811924e-01
   7.94724743e+02 -2.00779484e+01  6.49156821e+02 -3.73458675e+01]
 [ 8.71375707e+02  9.11357022e+02  9.60380454e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.23416688e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 208001 iterations:
[[0.85150196 0.90928043 0.96319134 0.        ]
 [0.79977903 0.         0.70361443 0.        ]
 [0.72717144 0.56444436 0.61682437 0.39860466]]

State-Action matrix after 208001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47990472e+02  7.88989541e-01  7.34341606e+02  3.52811924e-01
   7.95647676e+02 -2.00779484e+01  6.58161028e+02 -3.73458675e+01]
 [ 8.73357607e+02  9.13087009e+02  9.63277239e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.26137908e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 216001 iterations:
[[0.84925965 0.90684297 0.95626093 0.        ]
 [0.79927813 0.         0.69769334 0.        ]
 [0.72814928 0.57373818 0.62407685 0.40487243]]

State-Action matrix after 216001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47489574e+02  7.88989541e-01  7.28420522e+02  3.52811924e-01
   7.96625522e+02 -2.00779484e+01  6.65413516e+02 -3.73458675e+01]
 [ 8.71115299e+02  9.10649541e+02  9.56346825e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.35431735e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 224001 iterations:
[[0.85032657 0.90725991 0.95683376 0.        ]
 [0.79856918 0.         0.68986391 0.        ]
 [0.72824084 0.57018006 0.61351736 0.40776109]]

State-Action matrix after 224001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46780625e+02  7.88989541e-01  7.20591087e+02  3.52811924e-01
   7.96717078e+02 -2.00779484e+01  6.54854020e+02 -3.73458675e+01]
 [ 8.72182216e+02  9.11066485e+02  9.56919660e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.31873606e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 232001 iterations:
[[0.84915306 0.90641707 0.9585096  0.        ]
 [0.79803555 0.         0.70980313 0.        ]
 [0.72651565 0.56769105 0.62160422 0.41416581]]

State-Action matrix after 232001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46246992e+02  7.88989541e-01  7.40530308e+02  3.52811924e-01
   7.94991886e+02 -2.00779484e+01  6.62940884e+02 -3.73458675e+01]
 [ 8.71008706e+02  9.10223647e+02  9.58595501e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.29384604e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 240001 iterations:
[[0.8489731  0.90628193 0.95917745 0.        ]
 [0.79788891 0.         0.70604908 0.        ]
 [0.72397485 0.57152524 0.62703641 0.41971231]]

State-Action matrix after 240001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46100353e+02  7.88989541e-01  7.36776258e+02  3.52811924e-01
   7.92451087e+02 -2.00779484e+01  6.68373071e+02 -3.73458675e+01]
 [ 8.70828746e+02  9.10088506e+02  9.59263353e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.33218786e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 248001 iterations:
[[0.84881227 0.90656836 0.96284871 0.        ]
 [0.79934972 0.         0.68495948 0.        ]
 [0.72682396 0.57118983 0.61361949 0.41337159]]

State-Action matrix after 248001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47561157e+02  7.88989541e-01  7.15686659e+02  3.52811924e-01
   7.95300196e+02 -2.00779484e+01  6.54956151e+02 -3.73458675e+01]
 [ 8.70667915e+02  9.10374934e+02  9.62934610e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.32883377e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 256001 iterations:
[[0.85021442 0.90777103 0.95938306 0.        ]
 [0.79921885 0.         0.70037503 0.        ]
 [0.72955291 0.57209777 0.62436448 0.39769295]]

State-Action matrix after 256001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.47430292e+02  7.88989541e-01  7.31102206e+02  3.52811924e-01
   7.98029147e+02 -2.00779484e+01  6.65701146e+02 -3.73458675e+01]
 [ 8.72070063e+02  9.11577608e+02  9.59468958e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.33791319e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 264001 iterations:
[[0.84846422 0.90597506 0.95607382 0.        ]
 [0.79746003 0.         0.68762217 0.        ]
 [0.72563599 0.56464429 0.6072222  0.39112274]]

State-Action matrix after 264001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45671471e+02  7.88989541e-01  7.18349349e+02  3.52811924e-01
   7.94112233e+02 -2.00779484e+01  6.48558865e+02 -3.73458675e+01]
 [ 8.70319871e+02  9.09781635e+02  9.56159724e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.26337840e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 272001 iterations:
[[0.84796609 0.90574593 0.96060228 0.        ]
 [0.7981412  0.         0.69061977 0.        ]
 [0.72610796 0.55869304 0.60849195 0.39464706]]

State-Action matrix after 272001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46352645e+02  7.88989541e-01  7.21346953e+02  3.52811924e-01
   7.94584194e+02 -2.00779484e+01  6.49828614e+02 -3.73458675e+01]
 [ 8.69821735e+02  9.09552505e+02  9.60688178e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.20386590e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 280001 iterations:
[[0.84875045 0.90847531 0.96353403 0.        ]
 [0.79637092 0.         0.7249627  0.        ]
 [0.7292342  0.56281128 0.62375768 0.38980503]]

State-Action matrix after 280001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.44582365e+02  7.88989541e-01  7.55689882e+02  3.52811924e-01
   7.97710440e+02 -2.00779484e+01  6.65094341e+02 -3.73458675e+01]
 [ 8.70606099e+02  9.12281882e+02  9.63619929e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.24504828e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 288001 iterations:
[[0.84906249 0.9063265  0.95633051 0.        ]
 [0.7981786  0.         0.7040782  0.        ]
 [0.72916077 0.56677058 0.61727617 0.39181938]]

State-Action matrix after 288001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.46390039e+02  7.88989541e-01  7.34805377e+02  3.52811924e-01
   7.97637007e+02 -2.00779484e+01  6.58612832e+02 -3.73458675e+01]
 [ 8.70918142e+02  9.10133078e+02  9.56416413e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.28464136e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286


Utility matrix after 296001 iterations:
[[0.84963391 0.90589937 0.95474418 0.        ]
 [0.79735957 0.         0.67512809 0.        ]
 [0.72455281 0.56411955 0.60577793 0.39785783]]

State-Action matrix after 296001 iterations:
[[-6.52968446e+00 -8.00993539e-01  5.10132770e-01  7.64921528e-01
   8.45571013e+02  7.88989541e-01  7.05855267e+02  3.52811924e-01
   7.93029046e+02 -2.00779484e+01  6.47114593e+02 -3.73458675e+01]
 [ 8.71489562e+02  9.09705941e+02  9.54830076e+02  5.56913177e-01
  -1.53382557e+01  2.44273485e-01 -1.22289744e+01  1.24999864e-01
  -2.15284574e+01  6.25813102e+02 -1.33785222e+01 -3.45594948e+01]
 [-6.67300066e+00 -9.99471485e-01  1.54340387e+00  6.65099088e-01
  -1.56806046e+01  9.38548488e-01 -9.72662997e+00  9.57704579e-01
  -2.24045612e+01 -2.00345267e+01 -1.34261782e+01 -3.35473528e+01]
 [-6.53649211e+00 -9.70777354e-01  1.23270773e-01  2.93578902e-01
  -1.52482746e+01  1.78042182e-01 -7.77041453e+00  1.14465443e-01
  -2.26329816e+01 -2.01598167e+01 -1.27286