In [10]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments.env3 import Env3

import os

## Config

In [11]:
DEFAULT_CONFIG = {
    'alpha_range': range(1, 4),
    'beta_range': [round(i * 0.5, 1) for i in range(2, 4)],
    'h_range': [round(i * 0.01, 2) for i in range(6, 61)],
    'c_range': range(20, 30),
    'total': range(10, 40), 
}

CONSTANT_CONFIG = {
    'alpha_range': [2],
    'beta_range': [1],
    'h_range': [0.15],
    'c_range': [25],
    'total': range(17, 40), 
}

TRAIN_CONFIG = {
    'alpha_range': np.arange(2, 4, 0.1),
    'beta_range': np.arange(1, 2, 0.1),
    'h_range': [0.15],
    'c_range': [25],
    'total': range(17, 40), 
}


In [12]:
PPO_MODEL_NAME = 'PPO_Env3'

## Train PQN

In [13]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

if False:

    n_cpus = os.cpu_count()

    print(f"Number of processors: {n_cpus}")
    env = make_vec_env(lambda: Env3(config=TRAIN_CONFIG), n_envs=n_cpus, vec_env_cls=SubprocVecEnv)
    model = PPO("MlpPolicy", env, verbose=1, device='cpu')

    print(model.n_envs)

    total_timesteps = 1_500_000
    print(f"Training model for {total_timesteps} timesteps")
    # Start training
    model.learn(total_timesteps=total_timesteps)

    model.save(methods.file_path(PPO_MODEL_NAME, 'models'))



## Generate Test Data

In [14]:
def get_realized_data(config):
    alpha = np.random.choice(config['alpha_range'])
    beta = np.random.choice(config['beta_range'])
    h = np.random.choice(config['h_range'])
    c = np.random.choice(config['c_range'])
    total = np.random.choice(config['total'])
    intervals = np.random.gamma(shape=alpha, scale=beta, size=total)
    # travel_time = sum(intervals[4:]) - np.random.exponential(scale=alpha * beta)
    travel_time = sum(intervals[4:]) - np.random.gamma(shape=4, scale=alpha*beta)
    # travel_time = np.random.gamma(shape=5, scale=alpha*beta)
    # travel_time = max(beta * 5, travel_time)
    travel_time = sum(intervals[3:]) * np.random.uniform(0, 1)
    travel_time = max(alpha * beta, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

df = pd.DataFrame(columns=['h', 'c', 'travel_time', 'total', 'intervals', 'alpha', 'beta'])

for i in range(20000):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    row = {'h': h, 'c': c, 'travel_time': travel_time, 'total': total, 'intervals': intervals, 'alpha': alpha, 'beta': beta}
    df.loc[i] = row



## Test

In [15]:
from stable_baselines3 import PPO
model = PPO.load(methods.file_path(PPO_MODEL_NAME, 'models'))

env = Env3(config=CONSTANT_CONFIG)
rewards = {}
u_rl = {}
print(env._get_info())
for i in range(len(df)):
    row = df.iloc[i]
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(state)  
        state, reward, done, _, info = env.step(action)
        total_reward += reward
    rewards[i] = total_reward
    u_rl[i] = env.cur_time + env.travel_time
    # print(f"Episode {i} reward: {total_reward} | h = {info['state']['h']}, c = {info['state']['c']}")
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

df['rewards'] = rewards
df['u_rl'] = u_rl
print(f"Average reward: {df['rewards'].mean()}")

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'last_update': -1}}
Episode 0 reward: -1.563944033615151 | cur_time = 47.04200450622985
Episode 1 reward: -2.112782364582101 | cur_time = 32.91307540497828
Episode 2 reward: -25 | cur_time = 10.878329548231896
Episode 3 reward: -25 | cur_time = 58.95566331962594
Episode 4 reward: -1.9428883293756458 | cur_time = 35.287907436246954
Episode 5 reward: -2.4909210924122522 | cur_time = 18.120591453798816
Episode 6 reward: -0.7190542281519523 | cur_time = 13.402965455571634
Episode 7 reward: -3.6287368944888514 | cur_time = 40.23694017294848
Episode 8 reward: -3.3030154505477527 | cur_time = 29.006921455738436
Episode 9 reward: -1.2116345635962382 | cur_time = 12.047156879401827
Episode 10 reward: -2.0276253885568534 | cur_time = 24.47798291250747
Episode 11 reward: -3.351700786827408 | cur_time = 42.8730175

## Add Comparison Metrics

In [16]:

from pandarallel import pandarallel

# Initialize pandarallel with progress bar enabled
pandarallel.initialize(progress_bar=True)
for i in [0, 3, 5]:
    print(f"Optimal reward at n = {i}")
    df[f'u{i}'] = df.apply(lambda row: row['intervals'][i:].sum(), axis=1)
    df[f'u_star{i}'] = df.parallel_apply(lambda row: methods.get_u_star_binary_fast(row['total'] - i, row['alpha'], row['beta'], row['h'], row['c']), axis=1)
    df[f'optimal_rewards{i}'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row[f'u{i}'], row[f'u_star{i}']), axis=1)

df['direct_leave_rewards'] = df.apply(lambda row: -methods.cal_cost(row['c'], row['h'], row['intervals'][3:].sum(), row[f'travel_time']), axis=1)
df.head()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Optimal reward at n = 0


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 3


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 5


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,4.199317,32,"[5.539269605387338, 1.9815034154318827, 1.8322...",2,1,-1.563944,51.241321,61.667615,47.974589,-2.053954,52.314565,42.628557,-1.452901,48.443285,39.0912,-1.402813,-7.217287
1,0.15,25,14.164279,25,"[0.4637677605442899, 2.7835827447677546, 1.766...",2,1,-2.112782,47.077354,61.16257,35.578271,-3.837645,56.148543,30.362099,-3.867967,49.981168,26.926459,-3.458206,-6.29764
2,0.15,25,43.315728,28,"[0.3521230923318635, 0.09850590410213199, 4.21...",2,1,-25.0,54.194057,52.18412,40.857006,-1.699067,47.514144,35.578271,-1.790381,42.560685,32.093066,-1.570143,-0.629763
3,0.15,25,24.474516,39,"[4.960015279540726, 0.6168600997551806, 1.0642...",2,1,-25.0,83.430179,82.885697,60.600893,-3.342721,76.244565,55.166377,-3.161728,75.209215,51.562122,-3.547064,-7.765507
4,0.15,25,19.629608,30,"[1.2084786997583494, 1.6130855910150033, 0.939...",2,1,-1.942888,54.917516,67.870105,44.405524,-3.519687,64.109464,39.0912,-3.75274,61.07743,35.578271,-3.824874,-6.671978


## Visualize Results

In [17]:
import plotly.graph_objects as go

all_rewards = [x for x in df.columns if 'reward' in x]

# Calculate means
means = df[all_rewards + ['h', 'c']].mean()

# Plot
fig = go.Figure(data=[go.Bar(x=means.index, y=means.values)])
fig.update_layout(title="Mean Values of Rewards, H, and C", yaxis_title="Mean Value")
fig.show()

# Calculate medians
medians = df[all_rewards + ['h', 'c']].median()

# Plot
fig = go.Figure(data=[go.Bar(x=medians.index, y=medians.values)])
fig.update_layout(title="Median Values of Rewards, H, and C", yaxis_title="Median Value")
fig.show()

In [18]:
df.head(50)

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,4.199317,32,"[5.539269605387338, 1.9815034154318827, 1.8322...",2,1,-1.563944,51.241321,61.667615,47.974589,-2.053954,52.314565,42.628557,-1.452901,48.443285,39.0912,-1.402813,-7.217287
1,0.15,25,14.164279,25,"[0.4637677605442899, 2.7835827447677546, 1.766...",2,1,-2.112782,47.077354,61.16257,35.578271,-3.837645,56.148543,30.362099,-3.867967,49.981168,26.926459,-3.458206,-6.29764
2,0.15,25,43.315728,28,"[0.3521230923318635, 0.09850590410213199, 4.21...",2,1,-25.0,54.194057,52.18412,40.857006,-1.699067,47.514144,35.578271,-1.790381,42.560685,32.093066,-1.570143,-0.629763
3,0.15,25,24.474516,39,"[4.960015279540726, 0.6168600997551806, 1.0642...",2,1,-25.0,83.430179,82.885697,60.600893,-3.342721,76.244565,55.166377,-3.161728,75.209215,51.562122,-3.547064,-7.765507
4,0.15,25,19.629608,30,"[1.2084786997583494, 1.6130855910150033, 0.939...",2,1,-1.942888,54.917516,67.870105,44.405524,-3.519687,64.109464,39.0912,-3.75274,61.07743,35.578271,-3.824874,-6.671978
5,0.15,25,26.625248,33,"[2.1033512353988177, 1.1667231499208484, 1.333...",2,1,-2.490921,44.74584,61.35198,49.766159,-1.737873,56.74874,44.405524,-1.851482,54.09633,40.857006,-1.985899,-4.518524
6,0.15,25,15.602155,18,"[3.3134015774928636, 3.5557882722404295, 2.589...",2,1,-0.719054,29.00512,33.798815,23.530883,-1.54019,24.3397,18.530012,-0.871453,22.442129,15.274153,-1.075196,-1.310632
7,0.15,25,6.682724,36,"[0.3105390286579038, 3.07170043619801, 1.36444...",2,1,-3.628737,46.919665,71.111244,55.166377,-2.39173,66.364559,49.766159,-2.48976,63.697686,46.187628,-2.626509,-8.952275
8,0.15,25,6.066835,26,"[0.4882969877417784, 0.16559736637389078, 0.82...",2,1,-3.303015,35.073756,57.093859,37.331492,-2.964355,55.613532,32.093066,-3.52807,49.73839,28.639672,-3.164808,-7.432005
9,0.15,25,10.934766,20,"[0.717492738087836, 0.5453033577543935, 1.8819...",2,1,-1.211635,22.981922,31.059486,26.926459,-0.619954,27.91475,21.850409,-0.909651,25.642132,18.530012,-1.066818,-2.546998
