In [2]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments.env1 import Env1

## Config

In [2]:
# Sampling grid for randomized scenarios. Each entry is the set of candidate
# values that get_realized_data() picks from uniformly at random:
#   alpha_range / beta_range -> shape / scale of the gamma-distributed intervals
#   h_range / c_range        -> the `h` and `c` arguments later handed to methods.cal_cost
#   total                    -> how many intervals are drawn per scenario
DEFAULT_CONFIG = dict(
    alpha_range=range(1, 8),
    beta_range=[round(0.5 * step, 1) for step in range(2, 9)],    # 1.0, 1.5, ..., 4.0
    h_range=[round(0.01 * step, 2) for step in range(6, 61)],     # 0.06, 0.07, ..., 0.60
    c_range=range(20, 30),
    total=range(10, 40),
)

# Single-point variant: every parameter pinned to one value except `total`.
CONSTANT_CONFIG = dict(
    alpha_range=[1],
    beta_range=[1],
    h_range=[0.15],
    c_range=[25],
    total=range(10, 40),
)

# NOTE: this rebinding makes the single-point dict above unused -- from here on
# CONSTANT_CONFIG is the very same object as DEFAULT_CONFIG. Remove this line
# to run the notebook on the pinned parameters instead.
CONSTANT_CONFIG = DEFAULT_CONFIG

## Train PPO

In [None]:
import stable_baselines3
from stable_baselines3 import PPO

# Build the environment from the active configuration, train a PPO agent with
# an MLP policy on the CPU, and write the trained policy to disk as "ppo"
# (stable-baselines3 appends the ".zip" extension).
env = Env1(config=CONSTANT_CONFIG)
model = PPO(policy="MlpPolicy", env=env, device='cpu', verbose=1)

# Start training
model.learn(total_timesteps=200000)
model.save("ppo")



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.79     |
|    ep_rew_mean     | -6.61    |
| time/              |          |
|    fps             | 63       |
|    iterations      | 1        |
|    time_elapsed    | 32       |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.1          |
|    ep_rew_mean          | -5.79        |
| time/                   |              |
|    fps                  | 67           |
|    iterations           | 2            |
|    time_elapsed         | 60           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0155644715 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.2          |
|    en

## Generate Test Data

In [4]:
def get_realized_data(config, rng=None):
    """Draw one random scenario from the value grids in ``config``.

    Parameters
    ----------
    config : dict
        Must provide the keys ``'alpha_range'``, ``'beta_range'``,
        ``'h_range'``, ``'c_range'`` and ``'total'``; each maps to a sequence
        of candidate values from which one is picked uniformly at random.
    rng : optional
        Source of randomness exposing ``choice`` and ``gamma`` (for example a
        ``numpy.random.Generator``). When omitted, NumPy's global legacy
        generator (``np.random``) is used, which is the historical behaviour
        of this function. Pass a seeded generator for reproducible data.

    Returns
    -------
    tuple
        ``(alpha, beta, h, c, total, intervals, travel_time)`` where
        ``intervals`` is an array of ``total`` draws from
        ``Gamma(shape=alpha, scale=beta)`` and ``travel_time`` is derived from
        those intervals as described in the body.
    """
    # Fall back to the module-level legacy RNG so existing callers (and code
    # that relies on np.random.seed) keep producing exactly the same stream.
    rng = np.random if rng is None else rng

    # The draw order below is part of the contract: changing it would change
    # the values produced for a given seed.
    alpha = rng.choice(config['alpha_range'])
    beta = rng.choice(config['beta_range'])
    h = rng.choice(config['h_range'])
    c = rng.choice(config['c_range'])
    total = rng.choice(config['total'])
    intervals = rng.gamma(shape=alpha, scale=beta, size=total)

    # Travel time = sum of the intervals from index 4 onward, minus a
    # Gamma(shape=2, scale=alpha*beta) perturbation ...
    travel_time = sum(intervals[4:]) - rng.gamma(shape=2, scale=alpha * beta)
    # ... floored at 5 * beta so it can never become small or negative.
    travel_time = max(beta * 5, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

# Draw every test scenario first and build the frame in a single call.
# Appending with `df.loc[i] = row` inside the loop enlarges the frame on every
# iteration, which gets slow at this size and leaves all columns as `object`
# dtype; constructing from a list of records avoids both problems while
# producing the same rows under the same 0..N-1 index.
records = []
for _ in range(20000):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    records.append({
        'h': h,
        'c': c,
        'travel_time': travel_time,
        'total': total,
        'intervals': intervals,  # per-scenario array, stored as one object per cell
        'alpha': alpha,
        'beta': beta,
    })

# `columns=` pins the column order used by the rest of the notebook.
df = pd.DataFrame(records, columns=['h', 'c', 'travel_time', 'total', 'intervals', 'alpha', 'beta'])



## Test

In [None]:
from stable_baselines3 import PPO
# Load the policy written by the training cell. It is saved with
# model.save("ppo"), so the name must match exactly: on case-sensitive
# filesystems "PPO" would not resolve to the saved "ppo.zip".
model = PPO.load("ppo")

env = Env1(config=CONSTANT_CONFIG)
rewards = {}   # episode index -> cumulative reward of that episode
u_rl = {}      # episode index -> env.cur_time + env.travel_time at episode end
print(env._get_info())
for i in range(len(df)):
    # Replay the i-th pre-generated scenario in the environment.
    row = df.iloc[i]
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(state)
        # step() yields five values; under the Gymnasium API the third and
        # fourth are `terminated` and `truncated`. Either one ends the episode,
        # so honouring both prevents looping forever if the env ever truncates.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
    rewards[i] = total_reward
    u_rl[i] = env.cur_time + env.travel_time
    # print(f"Episode {i} reward: {total_reward} | h = {info['state']['h']}, c = {info['state']['c']}")
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

# The dict keys are the episode indices, which coincide with df's index.
df['rewards'] = rewards
df['u_rl'] = u_rl
print(f"Average reward: {df['rewards'].mean()}")

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'alpha_hat': -1, 'beta_hat': -1, 'u_star_hat': -1, 'last_update': -1}}
Episode 0 reward: -25.232201814277477 | cur_time = 36.011962740932766
Episode 1 reward: -5.008544671263144 | cur_time = 76.71143185655946
Episode 2 reward: -9.748115166587326 | cur_time = 38.996928692169
Episode 3 reward: -2.378947819981788 | cur_time = 25.37028852674334
Episode 4 reward: -4.941448460952268 | cur_time = 26.83775432291246
Episode 5 reward: -6.416311209426831 | cur_time = 38.4973687652023
Episode 6 reward: -6.816668421544737 | cur_time = 45.48390518318615
Episode 7 reward: -7.418940324126661 | cur_time = 81.29501233662936
Episode 8 reward: -1.1060124115035976 | cur_time = 3.409993508439823
Episode 9 reward: -8.106551117865731 | cur_time = 20.804755076005108
Episode 10 reward: -2.340278301739599 | cur_time = 38.5674742

## Add Comparison Metrics

In [6]:

from pandarallel import pandarallel

# Start pandarallel so that DataFrame.parallel_apply is available; a progress
# bar is rendered for each parallel call.
pandarallel.initialize(progress_bar=True)

# For each starting index n, add three columns:
#   u{n}               -> sum of the scenario's intervals from index n onward
#   u_star{n}          -> result of methods.get_u_star_binary_fast for the remaining (total - n) intervals
#   optimal_rewards{n} -> negated methods.cal_cost evaluated on the two columns above
for start in (0, 3, 5):
    print(f"Optimal reward at n = {start}")
    realized_col = f'u{start}'
    planned_col = f'u_star{start}'
    reward_col = f'optimal_rewards{start}'

    df[realized_col] = df.apply(lambda row: row['intervals'][start:].sum(), axis=1)
    # Only this step is parallelised.
    df[planned_col] = df.parallel_apply(
        lambda row: methods.get_u_star_binary_fast(
            row['total'] - start, row['alpha'], row['beta'], row['h'], row['c']
        ),
        axis=1,
    )
    df[reward_col] = df.apply(
        lambda row: -methods.cal_cost(row['c'], row['h'], row[realized_col], row[planned_col]),
        axis=1,
    )

# Baseline column: negated cost using the sum of intervals from index 3 onward
# together with the scenario's travel_time.
df['direct_leave_rewards'] = df.apply(
    lambda row: -methods.cal_cost(row['c'], row['h'], row['intervals'][3:].sum(), row['travel_time']),
    axis=1,
)
df.head()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Optimal reward at n = 0


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 3


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 5


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.55,23,135.714097,17,"[15.390915514572413, 1.8681610950690877, 16.05...",3,4.0,-25.232202,171.72606,217.602791,196.643185,-11.527783,184.290828,158.085407,-14.412982,145.725873,132.546509,-7.24865,-26.717202
1,0.08,26,105.022953,10,"[34.80758972946041, 14.993723416446095, 11.710...",7,3.0,-5.008545,181.734385,244.341193,164.662333,-6.374309,182.829761,107.626189,-6.016286,134.254236,70.776648,-5.078207,-6.224545
2,0.5,20,247.500506,26,"[12.28972390495802, 14.385826559822092, 4.3213...",3,4.0,-9.748115,286.497435,305.993665,315.700163,-20.0,274.996737,276.423352,-20.0,255.002871,250.30018,-2.351345,-13.748115
3,0.09,27,252.39418,27,"[8.94379894565293, 9.873185262779728, 4.153304...",3,4.0,-2.378948,277.764469,304.197222,266.595901,-3.384119,281.226934,232.854077,-4.353557,245.934576,210.500854,-3.189035,-2.594948
4,0.18,27,72.783358,21,"[6.178729287041313, 11.178258244347273, 9.0807...",6,1.0,-4.941448,99.621113,127.073604,105.472502,-3.888198,100.63585,88.650097,-2.157436,91.591738,77.519299,-2.533039,-5.013448


## Visualize Results

In [7]:
import plotly.graph_objects as go

# Every column whose name contains 'reward', plus the h and c parameters.
all_rewards = [column for column in df.columns if 'reward' in column]
summary_columns = all_rewards + ['h', 'c']

# Column-wise aggregates to compare.
means = df[summary_columns].mean()
medians = df[summary_columns].median()

# One bar chart per aggregate: (figure title, y-axis label, values).
charts = (
    ("Mean Values of Rewards, H, and C", "Mean Value", means),
    ("Median Values of Rewards, H, and C", "Median Value", medians),
)
for chart_title, axis_label, summary in charts:
    fig = go.Figure(data=[go.Bar(x=summary.index, y=summary.values)])
    fig.update_layout(title=chart_title, yaxis_title=axis_label)
    fig.show()

In [8]:
# Display the first 50 rows of the results frame for a manual look at the columns added above.
df.head(50)

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.55,23,135.714097,17,"[15.390915514572413, 1.8681610950690877, 16.05...",3,4.0,-25.232202,171.72606,217.602791,196.643185,-11.527783,184.290828,158.085407,-14.412982,145.725873,132.546509,-7.24865,-26.717202
1,0.08,26,105.022953,10,"[34.80758972946041, 14.993723416446095, 11.710...",7,3.0,-5.008545,181.734385,244.341193,164.662333,-6.374309,182.829761,107.626189,-6.016286,134.254236,70.776648,-5.078207,-6.224545
2,0.5,20,247.500506,26,"[12.28972390495802, 14.385826559822092, 4.3213...",3,4.0,-9.748115,286.497435,305.993665,315.700163,-20.0,274.996737,276.423352,-20.0,255.002871,250.30018,-2.351345,-13.748115
3,0.09,27,252.39418,27,"[8.94379894565293, 9.873185262779728, 4.153304...",3,4.0,-2.378948,277.764469,304.197222,266.595901,-3.384119,281.226934,232.854077,-4.353557,245.934576,210.500854,-3.189035,-2.594948
4,0.18,27,72.783358,21,"[6.178729287041313, 11.178258244347273, 9.0807...",6,1.0,-4.941448,99.621113,127.073604,105.472502,-3.888198,100.63585,88.650097,-2.157436,91.591738,77.519299,-2.533039,-5.013448
5,0.16,26,117.403085,22,"[3.756921597637104, 12.506547430321655, 17.033...",3,2.5,-6.416311,155.900454,196.002399,133.037568,-10.074373,162.705031,112.089457,-8.098492,146.825361,98.249933,-7.772069,-7.248311
6,0.29,21,179.560539,27,"[11.718662545236644, 8.730032364477662, 16.535...",6,1.5,-6.816668,225.044444,248.550197,221.918648,-7.723149,211.566292,195.331226,-4.708169,197.76277,177.663186,-5.828879,-9.281668
7,0.24,27,612.887513,32,"[36.32979423093962, 16.908825606526186, 9.1563...",6,4.0,-7.41894,694.182525,725.094777,736.093512,-27.0,662.699764,662.990889,-27.0,634.31929,614.341018,-4.794785,-11.95494
8,0.2,23,30.483737,17,"[0.5750910656478342, 0.6841352647148895, 2.050...",2,1.0,-1.106012,33.893731,39.423793,22.604483,-3.363862,36.113799,17.537467,-3.715266,33.166561,14.241884,-3.784935,-1.126012
9,0.31,21,111.033197,22,"[8.886154837110604, 6.269859846386212, 5.54874...",3,2.0,-8.106551,131.837952,157.988117,112.667704,-14.049328,137.283362,95.250432,-13.030208,115.98946,83.722097,-10.002883,-8.137551
