In [4]:
import numpy as np
import mdptoolbox
from itertools import product

# Warehouse configuration shared by every cell below
features = dict(
    number_fields=4,        # number of storage fields
    number_fillings=4,      # distinct filling values a single field can hold
    number_next_color=3,    # distinct values of the "next color" state component
    number_actions=2,       # distinct values of the "action" state component
    structure=(2, 2),       # (rows, columns) layout of the fields
)

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Pull the four size parameters out of `features` into module-level names
# so the functions below can reuse them.
(number_fields,
 number_fillings,
 number_next_color,
 number_actions) = (
    features[key]
    for key in ('number_fields', 'number_fillings', 'number_next_color', 'number_actions')
)

In [None]:
# Represent every state as one row of a 2-D array
def create_states(features):
    """Enumerate all states as rows of a 2-D float array.

    Each row has ``number_fields + 2`` columns laid out as
    ``[filling of field 0, ..., filling of field n-1, action, next_color]``.
    Rows are ordered with the field fillings varying slowest (in
    ``itertools.product`` order), then the action, then the next color.

    The sizes are read from ``features`` rather than from module-level
    globals, so the result depends only on the argument and the cell can be
    re-run safely regardless of what other cells did to those globals.

    Parameters
    ----------
    features : dict
        Must provide the integer entries ``'number_fields'``,
        ``'number_fillings'``, ``'number_next_color'`` and
        ``'number_actions'``.

    Returns
    -------
    numpy.ndarray
        Float array of shape
        ``(number_fillings ** number_fields * number_actions * number_next_color,
        number_fields + 2)``.

    Raises
    ------
    KeyError
        If one of the required entries is missing from ``features``.
    """
    number_fields = features['number_fields']
    number_fillings = features['number_fillings']
    number_next_color = features['number_next_color']
    number_actions = features['number_actions']

    # The nesting order below fixes the row index as
    # (field combination, action, next color), with next color varying fastest.
    rows = [
        (*field_state, action, next_color)
        for field_state in product(range(number_fillings), repeat=number_fields)
        for action in range(number_actions)
        for next_color in range(number_next_color)
    ]

    # reshape keeps the column count correct even when there are no rows
    return np.array(rows, dtype=float).reshape(-1, number_fields + 2)

In [None]:
# Build the state table from the configured features and display it
states = create_states(features)
states

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 2.],
       ...,
       [3., 3., 3., 3., 1., 0.],
       [3., 3., 3., 3., 1., 1.],
       [3., 3., 3., 3., 1., 2.]])

In [None]:
def _normalize_row(row, self_index):
    """Scale ``row`` in place to sum to 1; an all-zero row becomes a self-loop."""
    row_sum = row.sum()
    if row_sum > 0:
        row /= row_sum
    else:
        row[self_index] = 1


def transition_matrix_generator(states, features):
    """Build one row-stochastic transition matrix per action.

    Actions ``0 .. number_fields - 1`` operate on the field with that index;
    the last action (index ``number_fields``) is the fallback action that
    collects every transition the field actions do not allow.

    For a source state whose action component (second-to-last column) is 0,
    field action ``f`` is applicable when field ``f`` holds the filling value
    3, and it leads to every state whose field ``f`` equals the source
    state's next color (last column). For any other action component the
    roles are swapped: field ``f`` must hold the next color and the targets
    are the states whose field ``f`` equals 3. Applicable rows are spread
    uniformly over their targets; inapplicable rows become a self-loop.

    NOTE(review): filling value 3 is presumably the "empty" marker
    (``number_fillings - 1`` in the notebook configuration) - confirm before
    changing ``number_fillings``.
    NOTE(review): only the acted-on field of the target state is checked;
    the other fields, the action and the next color of the target are
    unconstrained.

    Unlike the previous version this function does not assign to the
    module-level ``number_actions`` / ``number_next_color`` names, so the
    notebook's configuration globals keep their values after the call.

    Parameters
    ----------
    states : numpy.ndarray
        State table with one row per state, laid out as
        ``[field fillings..., action, next_color]``.
    features : dict
        Must provide ``'number_fields'``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_fields + 1, number_states, number_states)``
        in which every row sums to 1.
    """
    empty_filling = 3  # see NOTE(review) in the docstring
    number_fields = features['number_fields']
    noop_action = number_fields  # index of the fallback action
    number_states = states.shape[0]

    # Define the state space transition matrix
    transition_matrix = np.zeros((number_fields + 1, number_states, number_states))

    for first_state in range(number_states):
        state_1 = states[first_state]
        action = state_1[-2]
        next_color = state_1[-1]

        # Filling the field must hold for the action to apply, and the
        # filling the target state must show in that field afterwards.
        if action == 0:
            required_filling, resulting_filling = empty_filling, next_color
        else:
            required_filling, resulting_filling = next_color, empty_filling

        # Target states reachable through the fallback action: the union,
        # over all fields, of the targets the field action does not cover.
        noop_targets = np.zeros(number_states, dtype=bool)

        for field in range(number_fields):
            if state_1[field] == required_filling:
                targets = states[:, field] == resulting_filling
                transition_matrix[field, first_state, targets] = 1
                noop_targets |= ~targets
            else:
                noop_targets[:] = True

            # `> 0` (instead of `> 1`) keeps a row with exactly one target
            # stochastic rather than adding a second unit entry to it.
            _normalize_row(transition_matrix[field, first_state], first_state)

        transition_matrix[noop_action, first_state, noop_targets] = 1
        _normalize_row(transition_matrix[noop_action, first_state], first_state)

    return transition_matrix

In [None]:
# Build the per-action transition matrices over all states and display them
transition = transition_matrix_generator(states, features)
transition

array([[[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 1.00000000e+00]],

       [[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 

In [None]:
def reward_generator(states, ztsm, features):
    """Build one reward matrix per action from the transition matrices.

    A distance is assigned to every action: for the field at grid position
    ``(row, column)`` of ``features['structure']`` it is ``1 + row + column``
    (fields numbered row by row), and one extra entry of 7 is appended for
    the action that follows the field actions. Every transition of action
    ``a`` whose probability lies strictly between 0 and 1 is rewarded with
    ``10 - distance[a]``; transitions with probability exactly 0 or 1 get
    reward 0.

    The matrices are derived from the ``ztsm`` argument. The previous
    version iterated over the module-level ``transition`` variable and
    silently ignored the argument.

    Parameters
    ----------
    states : numpy.ndarray
        Unused; kept so the call signature stays unchanged.
    ztsm : numpy.ndarray or sequence of numpy.ndarray
        Transition matrices, one 2-D array per action.
    features : dict
        Must provide ``'structure'`` as ``(rows, columns)``.

    Returns
    -------
    list of numpy.ndarray
        One reward matrix per action, each with the shape of the
        corresponding transition matrix.

    Raises
    ------
    IndexError
        If ``ztsm`` holds more than ``rows * columns + 1`` actions.
    """
    structure = features['structure']

    # Offsets of every grid row and column
    row_offsets = np.arange(structure[0])
    column_offsets = np.arange(structure[1])

    # distance[row, column] = 1 + row + column, flattened row by row,
    # followed by the fixed distance 7 for the last action
    distances = 1.0 + row_offsets[:, None] + column_offsets[None, :]
    distances = np.append(distances, 7)

    reward = []

    for action_nr, action_probabilities in enumerate(ztsm):
        # Only genuinely stochastic transitions (0 < p < 1) are rewarded
        rewarded = np.logical_and(action_probabilities > 0, action_probabilities < 1)
        output = np.zeros(action_probabilities.shape)
        output[rewarded] = 10 - distances[action_nr]

        reward.append(output)

    return reward

In [None]:
# Build the per-action reward matrices from the transition matrices and display them
rewards = reward_generator(states, transition, features)
rewards

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 arr

In [None]:
# Solve the MDP with value iteration (discount factor 0.95) and print the resulting policy
pi = mdptoolbox.mdp.ValueIteration(transitions=transition, reward=rewards, discount=0.95)
# Turn on verbose mode before running the solver
pi.setVerbose()
# NOTE(review): the policy is read from `pi.policy` below, not from `results`;
# confirm whether the return value of run() is meaningful before relying on it.
results = pi.run()
print(pi.policy)

  Iteration		V-variation
    1		  6.0
    2		  1.0291666666666686
    3		  0.15254659016926908
    4		  0.02640567211310696
    5		  0.005204433290689536
    6		  0.0011181208688526567
    7		  0.00025226362635066835
Iterating stopped, epsilon-optimal policy found.
(4, 4, 4, 0, 4, 4, 4, 4, 4, 0, 3, 4, 4, 4, 4, 0, 4, 3, 3, 3, 3, 0, 4, 4, 4, 4, 4, 0, 2, 4, 4, 4, 4, 0, 2, 4, 4, 4, 4, 0, 2, 3, 3, 3, 3, 0, 2, 4, 4, 4, 4, 0, 4, 2, 4, 4, 4, 0, 3, 2, 4, 4, 4, 0, 4, 2, 3, 3, 3, 0, 4, 2, 2, 2, 2, 0, 4, 4, 2, 2, 2, 0, 3, 4, 2, 2, 2, 0, 4, 3, 2, 2, 2, 0, 4, 4, 4, 4, 4, 0, 1, 4, 4, 4, 4, 0, 1, 4, 4, 4, 4, 0, 1, 3, 3, 3, 3, 0, 1, 4, 4, 4, 4, 0, 1, 4, 4, 4, 4, 0, 1, 4, 4, 4, 4, 0, 1, 3, 3, 3, 3, 0, 1, 4, 4, 4, 4, 0, 1, 2, 4, 4, 4, 0, 1, 2, 4, 4, 4, 0, 1, 2, 3, 3, 3, 0, 1, 2, 2, 2, 2, 0, 1, 4, 2, 2, 2, 0, 1, 4, 2, 2, 2, 0, 1, 3, 2, 2, 2, 0, 1, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 0, 3, 1, 4, 4, 4, 0, 4, 1, 3, 3, 3, 0, 4, 1, 4, 4, 4, 0, 2, 1, 4, 4, 4, 0, 2, 1, 4, 4, 4, 0, 2, 1, 3, 3, 3, 0, 2, 1, 4, 4, 4, 0, 4

In [None]:
# Show the state rows with indices 11 through 18
states[11:19,:]

array([[0., 0., 0., 1., 1., 2.],
       [0., 0., 0., 2., 0., 0.],
       [0., 0., 0., 2., 0., 1.],
       [0., 0., 0., 2., 0., 2.],
       [0., 0., 0., 2., 1., 0.],
       [0., 0., 0., 2., 1., 1.],
       [0., 0., 0., 2., 1., 2.],
       [0., 0., 0., 3., 0., 0.]])

In [None]:
# Show the policy entries for the states with indices 11 through 18
pi.policy[11:19]

(4, 4, 4, 4, 0, 4, 3, 3)

In [None]:
# Ordered name tables: the position in each list is the integer code of the name
index_to_color = ["white", "blue", "red"]
index_to_action = ["store", "restore"]

# Inverse lookups (name -> integer code), derived from the ordered tables
color_to_index = {color: code for code, color in enumerate(index_to_color)}
actions_to_index = {action: code for code, action in enumerate(index_to_action)}