In [1]:
# Library imports

import numpy as np
import pandas as pd
import random
from stable_baselines3 import PPO, DQN

from env import WarehouseEnv

In [2]:
def run_episode(env, policy_function):
    """Play one full episode of ``env`` under ``policy_function``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``. It must expose
            ``avg_dos_section_a``, ``avg_dos_section_b`` and
            ``avg_dos_section_c`` attributes, which are read once the episode
            is over.
        policy_function: Callable mapping the current state to an action.

    Returns:
        Tuple ``(final_reward, avg_dos_section_a, avg_dos_section_b,
        avg_dos_section_c)`` where ``final_reward`` is the sum of all step
        rewards and the remaining entries are the env attributes of the same
        name.
    """
    state, _info = env.reset()

    final_reward = 0
    done = False

    while not done:
        action = policy_function(state)
        state, reward, terminated, truncated, _ = env.step(action)
        final_reward += reward
        # Stop on EITHER end-of-episode signal. Unpacking both flags into the
        # same name would keep only `truncated` and keep stepping an env that
        # has already terminated.
        done = bool(terminated) or bool(truncated)

    return (
        final_reward,
        env.avg_dos_section_a,
        env.avg_dos_section_b,
        env.avg_dos_section_c,
    )

#Evaluate the policy
def evaluate_policy(env, policy_function, num_episodes):
    """Run ``policy_function`` for ``num_episodes`` episodes and aggregate the results.

    Before episode ``i`` (0-based) the function calls ``env.update_seed(i)``
    and then delegates to ``run_episode``. A progress line is printed for
    every episode.

    Args:
        env: Environment passed through to ``run_episode``; must provide
            ``update_seed``.
        policy_function: Callable mapping a state to an action.
        num_episodes: Number of episodes to play.

    Returns:
        Tuple ``(mean_reward, std_reward, mean_dos_a, mean_dos_b, mean_dos_c)``
        computed with ``np.mean`` / ``np.std`` over the per-episode values.
    """
    episode_rewards, dos_a_values, dos_b_values, dos_c_values = [], [], [], []

    for i in range(num_episodes):

        print(f"Episode {i+1}/{num_episodes}")

        # Seed with the episode index before every rollout.
        env.update_seed(i)
        reward, dos_a, dos_b, dos_c = run_episode(env, policy_function)

        episode_rewards.append(reward)
        dos_a_values.append(dos_a)
        dos_b_values.append(dos_b)
        dos_c_values.append(dos_c)

    reward_mean = np.mean(episode_rewards)
    reward_std = np.std(episode_rewards)
    section_means = [np.mean(values) for values in (dos_a_values, dos_b_values, dos_c_values)]

    return reward_mean, reward_std, section_means[0], section_means[1], section_means[2]

In [9]:
def export_results():
    """Evaluate every baseline and trained policy and append a summary to CSV.

    Six policies are evaluated on a fresh ``WarehouseEnv`` for 20 episodes
    each via ``evaluate_policy``:

    * ``random``       - uniform choice among sectors with capacity > 0.
    * ``ABC``          - sector preference order chosen by ``current_product``.
    * ``A-first``      - always prefers sector 0, then 1, then 2.
    * ``dqn``          - model loaded from ``rl_models/dqn_models/dqn_model``.
    * ``ppo``          - model loaded from ``rl_models/ppo_models/ppo_model``.
    * ``maskable_ppo`` - model loaded from
      ``rl_models/maskable_ppo_models/maskable_ppo_model``.

    Before each evaluation ``env.get_state`` is replaced with the observation
    builder the policy expects (raw state for heuristics, a normalised float32
    vector for the RL models).

    Side effects:
        Prints progress, and appends one row per policy to
        ``results/tables/table_results.csv`` (the directory is created if
        missing; the header row is written only when the file does not exist
        yet or is empty).

    Raises:
        ValueError: If a policy name has no matching implementation.
    """
    import os

    results = []

    env = WarehouseEnv()

    def get_state(state):
        """Identity observation used by the heuristic policies."""
        return state

    def rl_get_state(state):
        """Build the float32 observation vector fed to the RL models.

        One capacity / max_capacity ratio per sector, followed by the min-max
        normalised ``dos`` value of ``env.current_product``.
        """
        data_rows = [
            state['capacity'][i] / state['static_info']['max_capacity'][i]
            for i in range(env.num_sectors)
        ]

        dos = state['static_info']['dos']
        dos_min = min(dos)
        dos_max = max(dos)

        if dos_max == dos_min:
            # Degenerate range: every product has the same dos, so the
            # normalised value is defined as 0 to avoid dividing by zero.
            normalized_dos = 0
        else:
            normalized_dos = (dos[env.current_product] - dos_min) / (dos_max - dos_min)

        data_rows.append(normalized_dos)

        return np.array(np.nan_to_num(data_rows, nan=0), dtype=np.float32)

    def first_available(capacity, preference):
        """Return the first sector in ``preference`` with capacity > 0.

        The last entry of ``preference`` is the unconditional fallback, so an
        action is always returned even when every sector is full.
        """
        for sector in preference[:-1]:
            if capacity[sector] > 0:
                return sector
        return preference[-1]

    def make_model_policy(model):
        """Wrap a loaded model as a deterministic ``state -> int`` policy."""
        def model_policy(state):
            action, _ = model.predict(state, deterministic=True)
            return int(action)
        return model_policy

    num_episodes = 20
    policy_names = ['random', 'ABC', 'A-first', 'dqn', 'ppo', 'maskable_ppo']

    for policy_name in policy_names:

        print(f"Evaluating policy: {policy_name}")

        if policy_name == "random":

            env.get_state = get_state

            def policy(state):
                capacity = state['capacity']
                possible_actions = [i for i, free in enumerate(capacity) if free > 0]
                if not possible_actions:
                    # Every sector is full: pick among all sectors instead of
                    # crashing, mirroring the heuristic policies which always
                    # return an action.
                    possible_actions = list(range(len(capacity)))
                return random.choice(possible_actions)

        elif policy_name == "ABC":

            env.get_state = get_state

            def policy(state):
                product = state['current_product']
                capacity = state['capacity']
                if product == 0:
                    return first_available(capacity, (0, 1, 2))
                if product in (1, 2):
                    return first_available(capacity, (1, 2, 0))
                if product in (3, 4, 5):
                    return first_available(capacity, (2, 1, 0))
                # Any other product id goes to sector 0.
                return 0

        elif policy_name == "A-first":

            env.get_state = get_state

            def policy(state):
                return first_available(state['capacity'], (0, 1, 2))

        elif policy_name == "dqn":

            env.get_state = rl_get_state
            policy = make_model_policy(DQN.load("rl_models/dqn_models/dqn_model"))

        elif policy_name == "ppo":

            env.get_state = rl_get_state
            policy = make_model_policy(PPO.load("rl_models/ppo_models/ppo_model"))

        elif policy_name == "maskable_ppo":

            # Imported lazily so the other policies can still be evaluated
            # when sb3_contrib is not needed.
            from sb3_contrib import MaskablePPO
            from sb3_contrib.common.maskable.utils import get_action_masks

            masked_model = MaskablePPO.load("rl_models/maskable_ppo_models/maskable_ppo_model")

            env.get_state = rl_get_state

            def policy(state, _model=masked_model):
                # Masks are re-read from the env on every step.
                action_masks = get_action_masks(env)
                action, _ = _model.predict(observation=state, deterministic=True, action_masks=action_masks)
                return int(action)

        else:
            # Without this guard an unknown name would silently reuse the
            # previous policy's metrics (or hit a NameError on the first one).
            raise ValueError(f"Unknown policy name: {policy_name!r}")

        mean_reward, std_reward, avg_a, avg_b, avg_c = evaluate_policy(env, policy, num_episodes)

        results.append({
            'Policy': policy_name,
            'MeanReward': np.round(mean_reward, 2),
            'StdReward': np.round(std_reward, 2),
            'AvgDosSectionA': np.round(avg_a, 2),
            'AvgDosSectionB': np.round(avg_b, 2),
            'AvgDosSectionC': np.round(avg_c, 2)
        })

    # Convert the results list to a DataFrame
    results_df = pd.DataFrame(results)

    # Append to the CSV. The header decision must look at the SAME file that
    # is written, otherwise every run appends a duplicate header row.
    output_path = 'results/tables/table_results.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    results_df.to_csv(output_path, mode='a', header=write_header, index=False)

In [10]:
# Run the evaluation of all policies; progress is printed per episode and the
# summary rows are appended to results/tables/table_results.csv.
export_results()

Evaluating policy: random
Episode 1/20
Episode 2/20
Episode 3/20
Episode 4/20
Episode 5/20
Episode 6/20
Episode 7/20
Episode 8/20
Episode 9/20
Episode 10/20
Episode 11/20
Episode 12/20
Episode 13/20
Episode 14/20
Episode 15/20
Episode 16/20
Episode 17/20
Episode 18/20
Episode 19/20
Episode 20/20
Evaluating policy: ABC
Episode 1/20
Episode 2/20
Episode 3/20
Episode 4/20
Episode 5/20
Episode 6/20
Episode 7/20
Episode 8/20
Episode 9/20
Episode 10/20
Episode 11/20
Episode 12/20
Episode 13/20
Episode 14/20
Episode 15/20
Episode 16/20
Episode 17/20
Episode 18/20
Episode 19/20
Episode 20/20
Evaluating policy: A-first
Episode 1/20
Episode 2/20
Episode 3/20
Episode 4/20
Episode 5/20
Episode 6/20
Episode 7/20
Episode 8/20
Episode 9/20
Episode 10/20
Episode 11/20
Episode 12/20
Episode 13/20
Episode 14/20
Episode 15/20
Episode 16/20
Episode 17/20
Episode 18/20
Episode 19/20
Episode 20/20
Evaluating policy: dqn
Episode 1/20
Episode 2/20
Episode 3/20
Episode 4/20
Episode 5/20
Episode 6/20
Episode 7/