# Reinforcement Learning in a Smart Factory

Optimization of the robot's route for pick-up and storage of items in a warehouse:
1. Implement a reinforcement-learning based algorithm
2. The robot is the agent and decides where to place the next part
3. Use the Markov decision process (MDP) toolbox for your solution
4. Choose the best performing MDP

In [165]:
#!pip install pymdptoolbox

In [166]:
## Imports
import mdptoolbox
import pandas as pd
import itertools as it
import numpy as np
import pickle
import time
from scipy import sparse

## Import  data

In [167]:
# Paths to the training log and the order log (the latter is loaded as test data below).
file_path = 'Reinforcement Learning Data - warehousetraining.txt'
file_path_test= 'Reinforcement Learning Data - warehouseorder.txt'

In [168]:
# Load both tab-separated files and label their two columns explicitly.
# Passing `names` without `header` makes pandas treat the first line as data.
data = pd.read_csv(file_path, sep='\t', names=["action", "color_state"])
test_data = pd.read_csv(file_path_test, sep='\t', names=["action", "color_state"])

In [169]:
# Quick look at the data; in a notebook only the last expression is displayed.
data.head()
# List the distinct (action, color_state) combinations present in the training data.
data.groupby(["action", "color_state"]).count()

action,color_state
restore,blue
restore,red
restore,white
store,blue
store,red
store,white


In [170]:
# Sorted unique labels; a label's position in these lists is used as its integer code.
actions =  list(np.unique(data.action))  #['restore', 'store']
item_colors  =  list(np.unique(data.color_state))   #['blue' 'red' 'white']

In [171]:
# Encode every training row as [action code, color code] using the list positions.
train_data = np.array(
    [
        [actions.index(action_name), item_colors.index(color_name)]
        for action_name, color_name in data.values
    ],
    dtype=int,
)

## Reinforcement-learning based algorithm: Markov Decision Process (MDP)

An MDP is a discrete-time stochastic control process that satisfies the Markov property.
1. Create the transition matrix representing the probability of moving from state *s0* to another state *s1* under action *a*
2. Generate the reward matrix defining the reward received after taking action *a* in state *s0* to reach state *s1*

Optimize the route with following constraints:
- Size of warehouse is {1..3} x {1..3}
- Separate start/stop position outside the 3x3 storage space, to which the robot has to return at the end of storage and pick-up
- The first position the robot can move into is always (1,1)
- Robot can move to adjacent fields
- Robot cannot move diagonally
- There are three types of items: (white, blue, red)

In [270]:
# Set Markov Decision Process (MDP) constraints
# NOTE(review): the task description asks for a 3x3 warehouse, but a 2x2 grid
# is configured here — presumably to keep the state space small; confirm.
warehouse_size = [2,2] #2x2 grid
grid_size = np.prod(warehouse_size)
# 1-based (i, j) cell labels with i varying fastest: (1,1), (2,1), (1,2), (2,2)
grids_cells = [(i+1,j+1) for j in range(warehouse_size[1]) for i in range(warehouse_size[0]) ]

# The number of actions equals the grid size (presumably one action per target cell)
actions_moves = grid_size.copy()

items = len(item_colors) + 1 # Values a cell can hold: one per color plus "no item"
moves = len(actions)*len(item_colors) # Number of (action, color) combinations

# Total number of possible item layouts on the grid
items_grid = items ** grid_size
# Full state space: every layout combined with every (action, color) combination
total_states = items_grid * moves

print("The total number of states is: ", total_states)

The total number of states is:  1536


In [None]:
# Integer code for every value a cell can hold: "noitem" is 0, followed by one code per color.
item_states_ID = {name: code for code, name in enumerate(["noitem"] + item_colors)}
item_states_ID

In [284]:
# Enumerate every possible assignment of item codes to the grid cells
def compute_grid_index(grid_size, item_states_ID):
    """Return all grid layouts as a 2-D array.

    Args:
        grid_size: Number of cells in the grid.
        item_states_ID: Mapping whose values are the codes a cell can hold.

    Returns:
        Array with one row per layout and ``grid_size`` columns. Rows follow
        ``itertools.product`` order, i.e. the last cell varies fastest.
    """
    cell_codes = item_states_ID.values()
    layouts = list(it.product(cell_codes, repeat=grid_size))
    return np.array(layouts)

# Layouts cover cell contents only, so there are items ** grid_size of them.
grids = compute_grid_index(grid_size, item_states_ID)
print("Number of posible states: ", len(grids))

Number of posible states:  256


In [414]:
# One row per layout, one column per grid cell (labelled by its (i, j) tuple).
# NOTE(review): pandas may turn a list of tuples passed as `columns` into a
# MultiIndex — confirm the resulting column index before indexing by cell.
grid_states= pd.DataFrame(data=grids, columns=grids_cells)
grid_states[20:30]


Unnamed: 0,"(1, 1)","(2, 1)","(1, 2)","(2, 2)"
20,0,1,1,0
21,0,1,1,1
22,0,1,1,2
23,0,1,1,3
24,0,1,2,0
25,0,1,2,1
26,0,1,2,2
27,0,1,2,3
28,0,1,3,0
29,0,1,3,1


In [287]:
def generate_warehosue_states(grid_states, item_states_ID,):
    """Count, for every grid layout, how many cells hold each item code.

    Args:
        grid_states: Table with one row per layout and one column per cell.
        item_states_ID: Mapping from item name to the code stored in a cell.

    Returns:
        DataFrame with one column per item name (in mapping order) holding
        the number of cells in each row that equal that item's code.
    """
    per_item_counts = {
        item_name: np.sum(grid_states == item_code, axis=1)
        for item_name, item_code in item_states_ID.items()
    }
    return pd.DataFrame(per_item_counts)

In [416]:
# Per-layout item counts, one row for every row of grid_states.
warehouse_states = generate_warehosue_states(grid_states, item_states_ID)
warehouse_states[20:30]

Unnamed: 0,noitem,blue,red,white
20,2,2,0,0
21,1,3,0,0
22,1,2,1,0
23,1,2,0,1
24,2,1,1,0
25,1,2,1,0
26,1,1,2,0
27,1,1,1,1
28,2,1,0,1
29,1,2,0,1


### Transition Probability Matrix (action,  state, next state)


In [381]:
def create_TPM(data, grids):
    """Build the transition probability tensor ``P[action, state, next_state]``.

    Reads the module-level ``actions_moves``, ``total_states``, ``moves``,
    ``items_grid`` and ``items``.

    Args:
        data: Frame with ``action`` and ``color_state`` columns; used only to
            estimate how often each (action, color) combination occurs.
        grids: Collection of grid layouts; only its length is used (one state
            row is filled per layout for each value of ``mov``).

    Returns:
        ``float16`` array of shape ``(actions_moves, total_states, total_states)``.
    """
    # NOTE(review): float16 keeps only about three significant digits for the
    # stored probabilities — confirm this precision is sufficient for the solver.
    transitions = np.zeros((actions_moves, total_states, total_states), dtype=np.float16)

    # Relative frequency of each action and each color in the data.
    # NOTE(review): value_counts() orders by frequency, not by label — confirm
    # this matches the order in which `m` enumerates the combinations.
    sample_count = len(data)
    action_freq = data["action"].value_counts() / sample_count
    color_freq = data["color_state"].value_counts() / sample_count
    # Product of the two marginal frequencies for every (action, color) pair.
    move_action_probability = np.array([a * c for a in action_freq for c in color_freq])

    for action in range(actions_moves):
        # Power of `items` tied to this action; presumably the positional weight
        # of the targeted cell in the base-`items` layout index — confirm.
        cell_weight = items ** (actions_moves - action - 1)
        idx = 0
        for mov in range(moves):
            layout_shift = cell_weight * (mov + 1)
            for _ in grids:
                shifted_layout = (idx % items_grid) - layout_shift
                # The same target formula is applied to every `m` (store and
                # restore alike). A negative target wraps around through
                # numpy's negative indexing.
                for m in range(moves):
                    target = shifted_layout + (items_grid * m)
                    transitions[action][idx][target] = move_action_probability[m]
                idx += 1
    return transitions

# Transition tensor passed to the MDP solvers (stored as TMP; built by create_TPM).
TMP = create_TPM(data, grids)


In [382]:
def create_rewards(moves, total_states, grid_states):
    """Build the reward matrix handed to the MDP solvers.

    Also reads the module-level ``grids_cells``, ``actions_moves`` and
    ``actions``.

    Args:
        moves: Number of (action, color) combinations.
        total_states: Number of state entries allocated per action in ``R``.
        grid_states: DataFrame whose rows are iterated with ``iterrows()``.

    Returns:
        Array of shape ``(total_states, actions_moves)`` (``R`` transposed).

    NOTE(review): several things here look unintended — confirm before relying
    on the rewards:
      * ``dict(keys=..., values=...)`` builds a dict with the two literal keys
        ``"keys"`` and ``"values"``, not a cell -> distance mapping, so
        ``rewards[str(s)]`` looks like it always raises ``KeyError``.
      * the bare ``except`` silently skips every failing assignment, leaving
        the corresponding entry of ``R`` at 0.
      * only the rows of ``grid_states`` are visited, while ``R`` allocates
        ``total_states`` entries per action.
      * ``next_state == moves//len(actions)`` satisfies neither branch
        condition and therefore gets the invalid-move reward.
    """
    # Sum of coordinate offsets of every cell from the first cell in grids_cells.
    distances = [sum(np.array(c) - np.array(grids_cells[0])) for c in grids_cells]
    # NOTE(review): keyword arguments make this {"keys": ..., "values": ...}.
    rewards = dict(keys=grids_cells, values =distances )
    
    R = np.zeros((actions_moves, total_states, ))

    for action in range(actions_moves):
        for idx, s in grid_states.iterrows():
            # Bucket the row index into ranges of len(grid_states)//moves rows.
            next_state = idx//(len(grid_states)//moves)
            try:
                # Lower buckets: rewarded when the value looked up via s[action] is 0.
                if(next_state < (moves//len(actions)) and s[action] == 0):
                    reward = rewards[str(s)]
                # Upper buckets: rewarded when s[action] equals next_state - len(actions).
                elif (next_state > (moves//len(actions) ) and (s[action] == (next_state - len(actions)))):
                    reward = 10000*rewards[str(s)]  #+=100
                # Invalid movements
                else:
                    reward = -10000 
                R[action][idx] = reward
            except:
                # NOTE(review): swallows every exception, including lookup errors above.
                pass

    return  np.asarray(R).T

In [383]:
# Reward matrix of shape (total_states, actions_moves), as returned by create_rewards.
R = create_rewards(moves, total_states, grid_states)

1536


In [None]:
# TMP is (actions, states, states) and R is (states, actions): the leading
# (actions, states) part of TMP must equal R's shape reversed.
assert TMP.shape[:-1] == R.shape[::-1], "The actions and states should match"

In [404]:
# Solver settings shared by both algorithms so that their results are comparable.
discount = 0.9  # discount factor applied to future rewards
max_iter = 750  # upper bound on solver iterations
# Pass `discount` instead of repeating the literal, so changing the setting
# above actually affects both solvers.
policy = mdptoolbox.mdp.PolicyIteration(TMP, R, discount, max_iter=max_iter)
value = mdptoolbox.mdp.ValueIteration(TMP, R, discount, max_iter=max_iter)

In [None]:
# Run both solvers on the same MDP.
value.run()
policy.run()

In [409]:
# Keep the policy found by policy iteration and report how many iterations
# each solver needed.
p = policy.policy
iterations = policy.iter

print("Policy iterations:", iterations)

print("Value  iterations:", value.iter)

Policy iterations: 6
Value  iterations: 35
