In [19]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments.env1 import Env1

import os

## Config

In [20]:
# Each config maps a parameter name to the collection of values that
# get_realized_data() samples from with np.random.choice.

# Broad sweep: every parameter varies.
DEFAULT_CONFIG = dict(
    alpha_range=range(1, 4),
    # Rounded so the grid holds clean decimals instead of float noise.
    beta_range=[round(step * 0.5, 1) for step in range(2, 4)],
    h_range=[round(step * 0.01, 2) for step in range(6, 61)],
    c_range=range(20, 30),
    total=range(10, 40),
)

# Fixed alpha/beta/h/c; only the number of intervals ('total') varies.
CONSTANT_CONFIG = dict(
    alpha_range=[3],
    beta_range=[1.5],
    h_range=[0.15],
    c_range=[25],
    total=range(17, 40),
)

# Training distribution: alpha and beta vary on a 0.1 grid, h and c stay fixed.
TRAIN_CONFIG = dict(
    alpha_range=np.arange(2, 5, 0.1),
    beta_range=np.arange(1, 2, 0.1),
    h_range=[0.15],
    c_range=[25],
    total=range(17, 40),
)

In [21]:
# Base name of the PPO model; passed to methods.file_path(..., 'models') both
# when saving after training and when loading for evaluation.
PPO_MODEL_NAME = 'PPO_Env1'

## Train PPO

In [22]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Training is expensive and overwrites the saved model, so it is opt-in.
# Set to True to retrain; leave False to reuse the model already on disk.
RETRAIN_PPO = False

if RETRAIN_PPO:

    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single environment instead of passing None as n_envs.
    n_cpus = os.cpu_count() or 1

    print(f"Number of processors: {n_cpus}")
    # One Env1 instance per CPU, each running in its own subprocess.
    env = make_vec_env(lambda: Env1(config=TRAIN_CONFIG), n_envs=n_cpus, vec_env_cls=SubprocVecEnv)
    try:
        model = PPO("MlpPolicy", env, verbose=1, device='cpu')

        print(model.n_envs)

        total_timesteps = 1_500_000
        print(f"Training model for {total_timesteps} timesteps")
        # Start training
        model.learn(total_timesteps=total_timesteps)

        model.save(methods.file_path(PPO_MODEL_NAME, 'models'))
    finally:
        # Shut down the worker processes even if training fails or is interrupted,
        # so they do not linger in the notebook kernel.
        env.close()



## Generate Test Data

In [23]:
def get_realized_data(config, rng=None):
    """Sample one realized episode from the given config.

    Parameters
    ----------
    config : dict
        Must provide 'alpha_range', 'beta_range', 'h_range', 'c_range' and
        'total'; one value is drawn uniformly at random from each.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Defaults to NumPy's global random state
        (``np.random``); pass a seeded generator for reproducible data.

    Returns
    -------
    tuple
        ``(alpha, beta, h, c, total, intervals, travel_time)`` where
        ``intervals`` is an array of ``total`` draws from
        Gamma(shape=alpha, scale=beta) and ``travel_time`` is a uniform random
        fraction of the sum of ``intervals[3:]``, floored at ``alpha * beta``.
    """
    rng = np.random if rng is None else rng

    alpha = rng.choice(config['alpha_range'])
    beta = rng.choice(config['beta_range'])
    h = rng.choice(config['h_range'])
    c = rng.choice(config['c_range'])
    total = rng.choice(config['total'])
    intervals = rng.gamma(shape=alpha, scale=beta, size=total)

    # Earlier travel-time formulas were computed and then immediately
    # overwritten; only the final one is kept, so no random draws are wasted.
    travel_time = intervals[3:].sum() * rng.uniform(0, 1)
    # Floor at alpha * beta, the mean of a single Gamma(alpha, beta) interval.
    travel_time = max(alpha * beta, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

# Collect all episodes first and build the frame once; appending rows with
# df.loc[i] = row re-allocates the frame on every insert and leaves every
# column with object dtype.
records = []
for _ in range(20000):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    records.append({'h': h, 'c': c, 'travel_time': travel_time, 'total': total, 'intervals': intervals, 'alpha': alpha, 'beta': beta})

# One row per episode; 'intervals' holds the per-episode numpy array.
df = pd.DataFrame(records, columns=['h', 'c', 'travel_time', 'total', 'intervals', 'alpha', 'beta'])



## Test

In [24]:
from stable_baselines3 import PPO
model = PPO.load(methods.file_path(PPO_MODEL_NAME, 'models'))

# Replay every pre-generated episode in df through the trained policy.
env = Env1(config=CONSTANT_CONFIG)
rewards = {}  # episode index -> cumulative reward
u_rl = {}     # episode index -> env.cur_time + env.travel_time at episode end
print(env._get_info())
for i in range(len(df)):
    row = df.iloc[i]
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        # NOTE(review): predict() samples actions stochastically by default;
        # pass deterministic=True if a deterministic evaluation is wanted.
        action, _ = model.predict(state)
        state, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        # Gymnasium signals episode end through either flag; ignoring
        # `truncated` would loop forever on a truncated episode.
        done = terminated or truncated
    rewards[i] = total_reward
    u_rl[i] = env.cur_time + env.travel_time
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

# Wrap the dicts in Series so values are aligned on the episode index explicitly.
df['rewards'] = pd.Series(rewards)
df['u_rl'] = pd.Series(u_rl)
print(f"Average reward: {df['rewards'].mean()}")

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'alpha_hat': -1, 'beta_hat': -1, 'u_star_hat': -1, 'last_update': -1}}
Episode 0 reward: -1.027105749238705 | cur_time = 12.93500292137895
Episode 1 reward: -18.726589290982957 | cur_time = 24.84354423506489
Episode 2 reward: -8.276959836443737 | cur_time = 17.09728250940599
Episode 3 reward: -13.719558288017481 | cur_time = 21.58201005296592
Episode 4 reward: -3.6682637144627446 | cur_time = 13.51264260447839
Episode 5 reward: -1.2692836007897548 | cur_time = 10.456311006649917
Episode 6 reward: -4.733902770049963 | cur_time = 15.340402767766456
Episode 7 reward: -0.9424224755820276 | cur_time = 12.275030102529835
Episode 8 reward: -1.936535568975342 | cur_time = 13.054012891022834
Episode 9 reward: -5.018641468410391 | cur_time = 20.02085535106872
Episode 10 reward: -8.158009010197203 | cur_time = 12

## Add Comparison Metrics

In [25]:

from pandarallel import pandarallel

# Initialize pandarallel with progress bar enabled
pandarallel.initialize(progress_bar=True)
for i in [0, 3, 5]:
    print(f"Optimal reward at n = {i}")
    # u{i}: sum of the realized intervals from index i onward. Only the
    # 'intervals' column is needed, so map over it instead of a row-wise apply.
    df[f'u{i}'] = df['intervals'].map(lambda intervals: intervals[i:].sum())
    # u_star{i}: result of the binary-search solver for the remaining (total - i) intervals.
    df[f'u_star{i}'] = df.parallel_apply(lambda row: methods.get_u_star_binary_fast(row['total'] - i, row['alpha'], row['beta'], row['h'], row['c']), axis=1)
    # Reward is the negated cost of u_star{i} against the realized u{i}.
    df[f'optimal_rewards{i}'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row[f'u{i}'], row[f'u_star{i}']), axis=1)

# Baseline: negated cost of the sampled travel_time against the realized sum of intervals[3:].
df['direct_leave_rewards'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row['intervals'][3:].sum(), row['travel_time']), axis=1)
df.head()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Optimal reward at n = 0


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 3


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 5


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,99.648783,27,"[5.853639650562324, 3.0411423981159684, 3.6229...",3,1.5,-1.027106,112.583786,119.431158,97.37899,-3.307825,106.913413,84.939653,-3.296064,98.585153,76.704829,-3.282049,-1.089694
1,0.15,25,17.473884,32,"[6.035641540885896, 8.043479052995735, 2.45376...",3,1.5,-18.726589,42.317429,167.161357,118.299709,-7.329247,150.628468,105.721743,-6.736009,146.53156,97.37899,-7.372885,-19.973188
2,0.15,25,92.78821,29,"[1.7581246566477384, 10.890902411698658, 3.896...",3,1.5,-8.27696,109.885492,165.065224,105.721743,-8.901522,148.519467,93.221939,-8.294629,133.784963,84.939653,-7.326796,-8.359689
3,0.15,25,27.589705,30,"[3.005162129113325, 8.560831942627543, 7.71556...",3,1.5,-13.719558,49.171715,140.635437,109.906371,-4.60936,121.353877,97.37899,-3.596233,117.422821,89.075261,-4.252134,-14.064626
4,0.15,25,70.789788,31,"[1.8687192155354246, 3.889005484961306, 7.3190...",3,1.5,-3.668264,84.30243,108.757522,114.099159,-25.0,95.680771,101.545779,-25.0,91.118141,93.221939,-25.0,-3.733647


## Visualize Results

In [26]:
import plotly.graph_objects as go

# Every column whose name mentions "reward", plus the cost parameters h and c.
all_rewards = [column for column in df.columns if 'reward' in column]
summary_columns = all_rewards + ['h', 'c']

# Per-column summary statistics
means = df[summary_columns].mean()
medians = df[summary_columns].median()

# One bar chart per statistic: mean first, then median.
charts = (
    (means, "Mean Values of Rewards, H, and C", "Mean Value"),
    (medians, "Median Values of Rewards, H, and C", "Median Value"),
)
for stats, chart_title, axis_label in charts:
    fig = go.Figure(data=[go.Bar(x=stats.index, y=stats.values)])
    fig.update_layout(title=chart_title, yaxis_title=axis_label)
    fig.show()

In [27]:
# Inspect the first 50 rows of the results frame.
df.head(50)

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,99.648783,27,"[5.853639650562324, 3.0411423981159684, 3.6229...",3,1.5,-1.027106,112.583786,119.431158,97.37899,-3.307825,106.913413,84.939653,-3.296064,98.585153,76.704829,-3.282049,-1.089694
1,0.15,25,17.473884,32,"[6.035641540885896, 8.043479052995735, 2.45376...",3,1.5,-18.726589,42.317429,167.161357,118.299709,-7.329247,150.628468,105.721743,-6.736009,146.53156,97.37899,-7.372885,-19.973188
2,0.15,25,92.78821,29,"[1.7581246566477384, 10.890902411698658, 3.896...",3,1.5,-8.27696,109.885492,165.065224,105.721743,-8.901522,148.519467,93.221939,-8.294629,133.784963,84.939653,-7.326796,-8.359689
3,0.15,25,27.589705,30,"[3.005162129113325, 8.560831942627543, 7.71556...",3,1.5,-13.719558,49.171715,140.635437,109.906371,-4.60936,121.353877,97.37899,-3.596233,117.422821,89.075261,-4.252134,-14.064626
4,0.15,25,70.789788,31,"[1.8687192155354246, 3.889005484961306, 7.3190...",3,1.5,-3.668264,84.30243,108.757522,114.099159,-25.0,95.680771,101.545779,-25.0,91.118141,93.221939,-25.0,-3.733647
5,0.15,25,105.776604,32,"[2.2384065321851425, 4.552957541443538, 3.3276...",3,1.5,-1.269284,116.232915,124.694805,118.299709,-0.959264,114.575795,105.721743,-1.328108,108.724309,97.37899,-1.701798,-1.319879
6,0.15,25,42.273307,21,"[6.097012291437716, 4.999346528085629, 2.84946...",3,1.5,-4.733903,57.613709,89.173061,72.607468,-2.484839,75.227241,60.409231,-2.222701,63.927098,52.370237,-1.733529,-4.94309
7,0.15,25,130.02474,36,"[2.4979894398222258, 3.840035031896834, 5.5410...",3,1.5,-0.942422,142.299771,148.582587,135.17186,-2.011609,136.703526,122.507602,-2.129389,121.422761,114.099159,-1.09854,-1.001818
8,0.15,25,53.996703,22,"[6.0952314053836965, 2.8735144404396245, 3.664...",3,1.5,-1.936536,67.050716,79.960953,76.704829,-0.488419,67.328038,64.458332,-0.430456,60.19382,56.379238,-0.572187,-1.9997
9,0.15,25,36.569771,24,"[2.7838873444526433, 12.158852245741675, 4.432...",3,1.5,-5.018641,56.590627,90.048236,84.939653,-0.766288,70.673215,72.607468,-25.0,63.714111,64.458332,-25.0,-5.115517
