In [21]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments.env1 import Env1

import os

## Config

In [22]:
DEFAULT_CONFIG = {
    'alpha_range': range(1, 4),
    'beta_range': [round(i * 0.5, 1) for i in range(2, 4)],
    'h_range': [round(i * 0.01, 2) for i in range(6, 61)],
    'c_range': range(20, 30),
    'total': range(10, 40), 
}

CONSTANT_CONFIG = {
    'alpha_range': [2],
    'beta_range': [1],
    'h_range': [0.15],
    'c_range': [25],
    'total': range(17, 40), 
}


## Train PQN

In [23]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

if False:

    n_cpus = os.cpu_count()

    print(f"Number of processors: {n_cpus}")
    env = make_vec_env(lambda: Env1(config=CONSTANT_CONFIG), n_envs=n_cpus, vec_env_cls=SubprocVecEnv)
    model = PPO("MlpPolicy", env, verbose=1, device='cpu')

    print(model.n_envs)

    total_timesteps = 1_000_000
    print(f"Training model for {total_timesteps} timesteps")
    # Start training
    model.learn(total_timesteps=total_timesteps)

    model.save("ppo")



## Generate Test Data

In [24]:
def get_realized_data(config):
    alpha = np.random.choice(config['alpha_range'])
    beta = np.random.choice(config['beta_range'])
    h = np.random.choice(config['h_range'])
    c = np.random.choice(config['c_range'])
    total = np.random.choice(config['total'])
    intervals = np.random.gamma(shape=alpha, scale=beta, size=total)
    # travel_time = sum(intervals[4:]) - np.random.exponential(scale=alpha * beta)
    travel_time = sum(intervals[4:]) - np.random.gamma(shape=5, scale=alpha*beta)
    travel_time = np.random.gamma(shape=5, scale=alpha*beta)
    # travel_time = max(beta * 5, travel_time)
    # travel_time = sum(intervals[3:]) * np.random.uniform(0, 1)
    travel_time = max(alpha * beta, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

df = pd.DataFrame(columns=['h', 'c', 'travel_time', 'total', 'intervals', 'alpha', 'beta'])

for i in range(20000):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    row = {'h': h, 'c': c, 'travel_time': travel_time, 'total': total, 'intervals': intervals, 'alpha': alpha, 'beta': beta}
    df.loc[i] = row



## Test

In [25]:
from stable_baselines3 import PPO
model = PPO.load("PPO")

env = Env1(config=CONSTANT_CONFIG)
rewards = {}
u_rl = {}
print(env._get_info())
for i in range(len(df)):
    row = df.iloc[i]
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(state)  
        state, reward, done, _, info = env.step(action)
        total_reward += reward
    rewards[i] = total_reward
    u_rl[i] = env.cur_time + env.travel_time
    # print(f"Episode {i} reward: {total_reward} | h = {info['state']['h']}, c = {info['state']['c']}")
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

df['rewards'] = rewards
df['u_rl'] = u_rl
print(f"Average reward: {df['rewards'].mean()}")

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'alpha_hat': -1, 'beta_hat': -1, 'u_star_hat': -1, 'last_update': -1}}
Episode 0 reward: -3.9399541468695083 | cur_time = 22.822050202722522
Episode 1 reward: -1.8626467820902504 | cur_time = 53.52301983882057
Episode 2 reward: -0.22374639780658293 | cur_time = 12.667579540136542
Episode 3 reward: -3.067884396831506 | cur_time = 31.186691284388175
Episode 4 reward: -0.5574002798859056 | cur_time = 55.66505762373664
Episode 5 reward: -3.4981997385713157 | cur_time = 34.872334373709286
Episode 6 reward: -1.9098289575621517 | cur_time = 23.62560408557042
Episode 7 reward: -2.899021076951947 | cur_time = 42.46401310641871
Episode 8 reward: -1.6365924290466356 | cur_time = 11.540167654700962
Episode 9 reward: -3.180502163627616 | cur_time = 13.59357111579188
Episode 10 reward: -0.7348893650516686 | cur_time

## Add Comparison Metrics

In [26]:

from pandarallel import pandarallel

# Initialize pandarallel with progress bar enabled
pandarallel.initialize(progress_bar=True)
for i in [0, 3, 5]:
    print(f"Optimal reward at n = {i}")
    df[f'u{i}'] = df.apply(lambda row: row['intervals'][i:].sum(), axis=1)
    df[f'u_star{i}'] = df.parallel_apply(lambda row: methods.get_u_star_binary_fast(row['total'] - i, row['alpha'], row['beta'], row['h'], row['c']), axis=1)
    df[f'optimal_rewards{i}'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row[f'u{i}'], row[f'u_star{i}']), axis=1)

df['direct_leave_rewards'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row['intervals'][3:].sum(), row[f'travel_time']), axis=1)
df.head()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Optimal reward at n = 0


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 3


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 5


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,9.888546,27,"[0.46215557560432674, 1.63234026780688, 3.3275...",2,1,-3.939954,32.710596,58.976957,39.0912,-2.982864,53.554907,33.831967,-2.958441,49.90013,30.362099,-2.930705,-6.549954
1,0.15,25,12.210701,38,"[0.6981127614686615, 3.2654218577916896, 1.459...",2,1,-1.862647,65.73372,78.151366,58.785859,-2.904826,72.728346,53.362258,-2.904913,69.473901,49.766159,-2.956161,-9.077647
2,0.15,25,24.121388,21,"[1.4557300694006245, 1.1260651619210553, 1.785...",2,1,-0.223746,36.788968,38.28061,28.639672,-1.446141,33.913031,23.530883,-1.557322,32.484393,20.182994,-1.84521,-1.468746
3,0.15,25,3.062012,32,"[3.522621248955584, 5.680672455576015, 0.48339...",2,1,-3.067884,34.248703,54.701266,47.974589,-1.009001,45.014574,42.628557,-0.357903,41.283717,39.0912,-0.328878,-6.292884
4,0.15,25,10.60527,39,"[0.6810261090633148, 1.3089679800991698, 7.175...",2,1,-0.5574,66.270327,69.986329,60.600893,-1.407815,60.821272,55.166377,-0.848234,59.411492,51.562122,-1.177405,-7.5324


## Visualize Results

In [27]:
import plotly.graph_objects as go

all_rewards = [x for x in df.columns if 'reward' in x]

# Calculate means
means = df[all_rewards + ['h', 'c']].mean()

# Plot
fig = go.Figure(data=[go.Bar(x=means.index, y=means.values)])
fig.update_layout(title="Mean Values of Rewards, H, and C", yaxis_title="Mean Value")
fig.show()

# Calculate medians
medians = df[all_rewards + ['h', 'c']].median()

# Plot
fig = go.Figure(data=[go.Bar(x=medians.index, y=medians.values)])
fig.update_layout(title="Median Values of Rewards, H, and C", yaxis_title="Median Value")
fig.show()

In [28]:
df.head(50)

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,9.888546,27,"[0.46215557560432674, 1.63234026780688, 3.3275...",2,1,-3.939954,32.710596,58.976957,39.0912,-2.982864,53.554907,33.831967,-2.958441,49.90013,30.362099,-2.930705,-6.549954
1,0.15,25,12.210701,38,"[0.6981127614686615, 3.2654218577916896, 1.459...",2,1,-1.862647,65.73372,78.151366,58.785859,-2.904826,72.728346,53.362258,-2.904913,69.473901,49.766159,-2.956161,-9.077647
2,0.15,25,24.121388,21,"[1.4557300694006245, 1.1260651619210553, 1.785...",2,1,-0.223746,36.788968,38.28061,28.639672,-1.446141,33.913031,23.530883,-1.557322,32.484393,20.182994,-1.84521,-1.468746
3,0.15,25,3.062012,32,"[3.522621248955584, 5.680672455576015, 0.48339...",2,1,-3.067884,34.248703,54.701266,47.974589,-1.009001,45.014574,42.628557,-0.357903,41.283717,39.0912,-0.328878,-6.292884
4,0.15,25,10.60527,39,"[0.6810261090633148, 1.3089679800991698, 7.175...",2,1,-0.5574,66.270327,69.986329,60.600893,-1.407815,60.821272,55.166377,-0.848234,59.411492,51.562122,-1.177405,-7.5324
5,0.15,25,10.812689,23,"[1.5364141405538967, 0.04506969384507508, 0.39...",2,1,-3.4982,45.685024,69.006355,32.093066,-5.536993,67.034021,26.926459,-6.016134,58.714419,23.530883,-5.277531,-8.4332
6,0.15,25,9.754133,21,"[2.2133087717998885, 2.553234950430311, 1.4590...",2,1,-1.909829,33.379737,46.11193,28.639672,-2.620839,39.886326,23.530883,-2.453317,38.022395,20.182994,-2.67591,-4.519829
7,0.15,25,9.594072,38,"[2.970823872498464, 1.0109002906371958, 0.9822...",2,1,-2.899021,52.058085,71.384892,58.785859,-1.889855,66.420879,53.362258,-1.958793,63.532671,49.766159,-2.064977,-8.524021
8,0.15,25,10.584547,19,"[0.6648768071784588, 1.846604125457729, 0.7286...",2,1,-1.636592,22.124714,33.03533,25.223237,-1.171814,29.795163,20.182994,-1.441825,27.722183,16.893084,-1.624365,-2.881592
9,0.15,25,12.286833,22,"[0.029946461590250273, 2.6928300330554795, 0.4...",2,1,-3.180502,25.880404,47.083752,30.362099,-2.508248,43.890181,25.223237,-2.800042,39.886165,21.850409,-2.705363,-4.740502
