<a href="https://colab.research.google.com/github/monanjo123/RL_Course/blob/master/Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Optimizing Warehouse flows with Q-Learning
import numpy as np

In [0]:
# Q-learning hyperparameters, read as module-level globals by train().
# gamma: discount factor multiplying the best Q-value of the next state.
gamma = 0.75
# alpha: learning rate scaling each temporal-difference update of Q.
alpha = 0.9

# Define the environment

## States

In [0]:
# Locations 'A'..'L' are numbered 0..11 in alphabetical order.
location_to_state = {name: index for index, name in enumerate('ABCDEFGHIJKL')}
# Reverse lookup: state index -> location name.
state_to_location = dict(zip(location_to_state.values(), location_to_state))

## Actions

In [0]:
# One action per state index: action j means "move to state j".
action = list(range(12))

## Rewards

In [0]:
# Reward / adjacency matrix: R[i, j] == 1 when a direct move from state i to
# state j is allowed, 0 otherwise. Rows and columns use the A..L -> 0..11
# numbering. The trailing comment on each row lists its reachable neighbours.
#
# The diagonal is all zeros: staying in place is not a move. route() puts the
# large goal reward on the destination's diagonal entry of its own copy, so no
# location needs a built-in self-reward here.
# NOTE(review): the earlier version had 1s at [A, A] and [G, G]; they looked
# like stray entries and were zeroed -- confirm no self-moves were intended.
R = np.array([
    [0,1,0,0,0,0,0,0,0,0,0,0],  # A: B
    [1,0,1,0,0,1,0,0,0,0,0,0],  # B: A, C, F
    [0,1,0,0,0,0,1,0,0,0,0,0],  # C: B, G
    [0,0,0,0,0,0,0,1,0,0,0,0],  # D: H
    [0,0,0,0,0,0,0,0,1,0,0,0],  # E: I
    [0,1,0,0,0,0,0,0,0,1,0,0],  # F: B, J
    [0,0,1,0,0,0,0,1,0,0,0,0],  # G: C, H
    [0,0,0,1,0,0,1,0,0,0,0,1],  # H: D, G, L
    [0,0,0,0,1,0,0,0,0,1,0,0],  # I: E, J
    [0,0,0,0,0,1,0,0,1,0,1,0],  # J: F, I, K
    [0,0,0,0,0,0,0,0,0,1,0,1],  # K: J, L
    [0,0,0,0,0,0,0,1,0,0,1,0]   # L: H, K
])

## Building the Q-Learning Algorithm

In [0]:
def train(R, Q, n_iterations=1000, discount=None, learning_rate=None):
  """Run tabular Q-learning on a reward matrix and return the Q-table.

  Each iteration samples a state uniformly at random, picks uniformly among
  the actions whose reward is positive, and applies one temporal-difference
  update to the corresponding Q entry.

  Args:
    R: square reward matrix; R[s, a] > 0 marks action a (i.e. moving to
      state a) as playable from state s, and its value is the reward.
    Q: array of Q-values with the same shape as R. It is updated in place.
    n_iterations: number of sampled updates to perform (default 1000).
    discount: discount factor; when None the module-level `gamma` is used.
    learning_rate: step size; when None the module-level `alpha` is used.

  Returns:
    The same Q array that was passed in, after training.
  """
  if discount is None:
    discount = gamma
  if learning_rate is None:
    learning_rate = alpha

  R = np.asarray(R)
  # The number of states comes from the matrix instead of a hard-coded 12.
  n_states = R.shape[0]

  for _ in range(n_iterations):
    # Select a random state.
    current_state = np.random.randint(0, n_states)

    # Only actions with a positive reward can be played from this state.
    playable_actions = np.flatnonzero(R[current_state] > 0)
    if playable_actions.size == 0:
      # Dead-end state: there is no transition to learn from, and
      # np.random.choice would raise on an empty candidate list.
      continue

    # The chosen action index is also the state we land in.
    next_state = np.random.choice(playable_actions)
    current_reward = R[current_state, next_state]

    # Temporal difference: reward plus discounted best future value, minus
    # the current estimate.
    TD = current_reward + discount * Q[next_state].max() - Q[current_state, next_state]

    # Move the estimate towards the target by the learning rate.
    Q[current_state, next_state] += learning_rate * TD

  return Q

## Deploying Q-Learning

In [0]:
def route(starting_location, ending_location):
  """Return the learned path between two locations as a list of names.

  A fresh Q-table is trained on a copy of the global reward matrix `R` in
  which the destination's diagonal entry is set to 1000. The path is then
  read off by repeatedly following the highest-valued action.

  Args:
    starting_location: key of `location_to_state` to start from.
    ending_location: key of `location_to_state` to reach.

  Returns:
    List of location names from start to end, both included. When start and
    end are the same location the list holds just that one name.

  Raises:
    KeyError: if either location is missing from `location_to_state`.
    RuntimeError: if following the learned Q-values revisits a state before
      reaching the destination (the walk would otherwise never terminate).
  """
  starting_state = location_to_state[starting_location]
  ending_state = location_to_state[ending_location]

  path = [starting_location]
  if starting_state == ending_state:
    # Nothing to learn or walk: the result is the start alone.
    return path

  # Work on a copy so the shared reward matrix is never modified.
  n_states = R.shape[0]
  R_new = np.copy(R)
  # Set 1000 as the reward value for the end state.
  R_new[ending_state, ending_state] = 1000

  Q = train(R_new, np.zeros((n_states, n_states)))

  # The greedy policy is deterministic, so seeing a state twice means the
  # walk is stuck in a cycle (e.g. training did not propagate the goal reward).
  visited = {starting_state}
  next_state = starting_state
  while next_state != ending_state:
    next_state = int(np.argmax(Q[next_state]))
    if next_state in visited:
      raise RuntimeError(
          'No route learned from %r to %r: greedy walk loops at %r'
          % (starting_location, ending_location, state_to_location[next_state]))
    visited.add(next_state)
    path.append(state_to_location[next_state])

  return path

In [0]:
# Visit several destinations in priority order by chaining point-to-point routes
def best_route(starting_location, end_locations=None):
  """Print and return one route that visits the destinations in order.

  The route is built by calling route() for each consecutive pair
  (start -> first destination, first -> second, ...) and joining the legs
  without repeating the location where two legs meet.

  Args:
    starting_location: name of the location to start from.
    end_locations: destinations in the order they must be visited. None or
      an empty sequence means there is nothing to visit.

  Returns:
    List of location names for the whole trip, starting with
    `starting_location`; an empty list when no destinations were given.
  """
  if not end_locations:
    print('No end locations')
    return []

  print(end_locations)
  full_route = [starting_location]
  start_loc = starting_location
  for loc in end_locations:
    # Drop the first element of each leg: it is already the last entry of
    # the route built so far.
    full_route.extend(route(start_loc, loc)[1:])
    start_loc = loc
  print(full_route)
  return full_route
    

In [55]:
# Starting from 'A', visit 'B', then 'G', then 'H', in that order.
locations = ['B', 'G', 'H']
best_route('A', end_locations=locations)

['B', 'G', 'H']
['A', 'B', 'C', 'G', 'H']
