In [1]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments.env3 import Env3

import os

## Config

In [2]:
# Broad sampling grid in which every parameter varies.
# alpha/beta are later used as the gamma shape/scale of the intervals and
# 'total' as the number of intervals drawn (see get_realized_data);
# h and c are passed through to the cost helpers.
DEFAULT_CONFIG = dict(
    alpha_range=range(1, 4),
    beta_range=[round(k * 0.5, 1) for k in (2, 3)],          # 1.0, 1.5
    h_range=[round(k * 0.01, 2) for k in range(6, 61)],      # 0.06 .. 0.60
    c_range=range(20, 30),
    total=range(10, 40),
)

# Fixed-parameter setup: alpha, beta, h and c each have a single candidate
# value, so only the number of intervals ('total') varies between samples.
CONSTANT_CONFIG = dict(
    alpha_range=[3],
    beta_range=[1.5],
    h_range=[0.15],
    c_range=[25],
    total=range(17, 40),
)

# Configuration handed to Env3 during training: alpha and beta are swept
# over a fine grid while h and c stay fixed.
_train_alpha_grid = np.arange(2, 5, 0.1)
_train_beta_grid = np.arange(1, 2, 0.1)

TRAIN_CONFIG = dict(
    alpha_range=_train_alpha_grid,
    beta_range=_train_beta_grid,
    h_range=[0.15],
    c_range=[25],
    total=range(17, 40),
)


In [3]:
# Base name of the PPO model file; the training cell saves under it and the
# test cell reloads it, both via methods.file_path(PPO_MODEL_NAME, 'models').
PPO_MODEL_NAME = 'PPO_Env3'

## Train PPO

In [4]:
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Set to False to skip (re)training; the test cell loads the model from disk,
# so a previously saved model can be reused.
TRAIN_MODEL = True

if TRAIN_MODEL:

    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single environment instead of passing None as n_envs.
    n_cpus = os.cpu_count() or 1

    print(f"Number of processors: {n_cpus}")
    # One Env3 instance per CPU, each running in its own subprocess.
    env = make_vec_env(lambda: Env3(config=TRAIN_CONFIG), n_envs=n_cpus, vec_env_cls=SubprocVecEnv)
    try:
        model = PPO("MlpPolicy", env, verbose=1, device='cpu')

        print(model.n_envs)

        total_timesteps = 2_500_000
        print(f"Training model for {total_timesteps} timesteps")
        # Start training
        model.learn(total_timesteps=total_timesteps)

        model.save(methods.file_path(PPO_MODEL_NAME, 'models'))
    finally:
        # Always shut down the worker subprocesses, even if training fails
        # or is interrupted, so they do not linger behind the kernel.
        env.close()



Number of processors: 8
Using cpu device
8
Training model for 2500000 timesteps
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.6      |
|    ep_rew_mean     | -7.84    |
| time/              |          |
|    fps             | 17923    |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.91        |
|    ep_rew_mean          | -9.94       |
| time/                   |             |
|    fps                  | 8850        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.013562638 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37 

## Generate Test Data

In [5]:
def get_realized_data(config, rng=None):
    """Draw one random test scenario from ``config``.

    Parameters
    ----------
    config : dict
        Must provide the keys ``'alpha_range'``, ``'beta_range'``,
        ``'h_range'``, ``'c_range'`` and ``'total'``; each maps to a
        collection of candidate values from which one is picked uniformly.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used (the original behaviour). Pass a seeded generator to obtain
        reproducible samples.

    Returns
    -------
    tuple
        ``(alpha, beta, h, c, total, intervals, travel_time)`` where
        ``intervals`` holds ``total`` gamma(shape=alpha, scale=beta) draws and
        ``travel_time`` is a uniform random fraction of the sum of
        ``intervals[3:]``, floored at ``alpha * beta``.
    """
    source = np.random if rng is None else rng

    alpha = source.choice(np.asarray(config['alpha_range']))
    beta = source.choice(np.asarray(config['beta_range']))
    h = source.choice(np.asarray(config['h_range']))
    c = source.choice(np.asarray(config['c_range']))
    total = source.choice(np.asarray(config['total']))
    intervals = source.gamma(shape=alpha, scale=beta, size=total)

    # A previous gamma-based travel_time assignment was immediately
    # overwritten by the line below; it was dropped because it only burned a
    # random draw without affecting the result.
    travel_time = intervals[3:].sum() * source.uniform(0, 1)
    # Never return a travel time shorter than alpha * beta.
    travel_time = max(alpha * beta, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

# Number of test scenarios to generate.
N_TEST_EPISODES = 20000

# Column order of the resulting frame.
_TEST_COLUMNS = ['h', 'c', 'travel_time', 'total', 'intervals', 'alpha', 'beta']

# Collect all records first and build the frame once: appending row by row
# with df.loc[i] = ... re-allocates on every insert and leaves every column
# as object dtype. Built this way, scalar columns get numeric dtypes and the
# index is still 0..N_TEST_EPISODES-1.
records = []
for _ in range(N_TEST_EPISODES):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    records.append({'h': h, 'c': c, 'travel_time': travel_time, 'total': total, 'intervals': intervals, 'alpha': alpha, 'beta': beta})

df = pd.DataFrame(records, columns=_TEST_COLUMNS)



## Test

In [6]:
from stable_baselines3 import PPO
# Reload the policy saved by the training cell.
model = PPO.load(methods.file_path(PPO_MODEL_NAME, 'models'))

env = Env3(config=CONSTANT_CONFIG)
rewards = {}   # episode index -> cumulative reward
u_rl = {}      # episode index -> env.cur_time + env.travel_time at episode end
print(env._get_info())
for i in range(len(df)):
    # Replay the i-th pre-generated scenario through the environment.
    row = df.iloc[i]
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        # NOTE(review): predict() samples stochastically by default; consider
        # deterministic=True if a deterministic evaluation is intended.
        action, _ = model.predict(state)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Stop on either flag so a truncated episode cannot loop forever.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
    rewards[i] = total_reward
    u_rl[i] = env.cur_time + env.travel_time
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

# The dict keys are the row positions, which match df's default integer index.
df['rewards'] = rewards
df['u_rl'] = u_rl
print(f"Average reward: {df['rewards'].mean()}")

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'last_update': -1}}
Episode 0 reward: -3.8977532093246072 | cur_time = 18.790045982467348
Episode 1 reward: -3.855202098428286 | cur_time = 42.87280325525108
Episode 2 reward: -25 | cur_time = 95.51545703402101
Episode 3 reward: -1.7288234907435651 | cur_time = 49.65431075275633
Episode 4 reward: -25 | cur_time = 135.55504534033332
Episode 5 reward: -4.641201063437715 | cur_time = 46.36989689015878
Episode 6 reward: -5.997482776925123 | cur_time = 62.44484382896494
Episode 7 reward: -5.107506433409655 | cur_time = 69.05952708798635
Episode 8 reward: -3.6214263232502617 | cur_time = 21.48121967454325
Episode 9 reward: -2.718486152610235 | cur_time = 68.59665403361059
Episode 10 reward: -0.217446781112875 | cur_time = 11.3049779940138
Episode 11 reward: -3.2532432983376727 | cur_time = 48.912449949312915

## Add Comparison Metrics

In [7]:

from pandarallel import pandarallel

# Start pandarallel; each parallel_apply below reports a progress bar.
pandarallel.initialize(progress_bar=True)

# For each starting index n, compare the realized remaining time (u{n})
# with the value returned by methods.get_u_star_binary_fast (u_star{n}).
for n in [0, 3, 5]:
    print(f"Optimal reward at n = {n}")
    u_col = f'u{n}'
    u_star_col = f'u_star{n}'
    reward_col = f'optimal_rewards{n}'

    # Sum of the intervals from position n onwards.
    df[u_col] = df.apply(lambda rec: rec['intervals'][n:].sum(), axis=1)
    df[u_star_col] = df.parallel_apply(
        lambda rec: methods.get_u_star_binary_fast(
            rec['total'] - n, rec['alpha'], rec['beta'], rec['h'], rec['c']
        ),
        axis=1,
    )
    # Rewards are stored as negated costs.
    df[reward_col] = df.apply(
        lambda rec: -methods.cal_cost(rec['c'], rec['h'], rec[u_col], rec[u_star_col]),
        axis=1,
    )

# Baseline: negated cost of using the sampled travel_time against the sum of
# the intervals from position 3 onwards.
df['direct_leave_rewards'] = df.apply(
    lambda rec: -methods.cal_cost(rec['c'], rec['h'], rec['intervals'][3:].sum(), rec['travel_time']),
    axis=1,
)
df.head()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Optimal reward at n = 0


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 3


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Optimal reward at n = 5


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,60.621185,27,"[5.878209013634051, 4.373432982915695, 3.23429...",3,1.5,-3.897753,79.411231,105.396252,97.37899,-1.202589,91.910319,84.939653,-1.0456,84.329599,76.704829,-1.143715,-4.69337
1,0.15,25,18.146751,20,"[5.137656988310638, 1.7355802975953494, 1.5864...",3,1.5,-3.855202,61.019554,86.720902,68.524882,-2.729403,78.261255,56.379238,-3.282302,69.883023,48.384471,-3.224783,-9.017176
2,0.15,25,79.29292,19,"[8.382835695685449, 3.872076567299527, 2.50512...",3,1.5,-25.0,174.808377,95.084732,64.458332,-4.59396,80.324699,52.370237,-4.193169,70.953394,44.424561,-3.979325,-0.154767
3,0.15,25,20.870627,18,"[2.500494417660291, 5.965747440208398, 3.18854...",3,1.5,-1.728823,70.524938,82.050428,60.409231,-3.246179,70.395636,48.384471,-3.301675,58.264811,40.49366,-2.665673,-7.428751
4,0.15,25,83.597319,33,"[14.03371684311505, 2.4216381490912866, 2.5400...",3,1.5,-25.0,219.152364,135.242861,122.507602,-1.910289,116.247476,109.906371,-0.951166,109.573166,101.545779,-1.204108,-4.897524


## Visualize Results

In [8]:
import plotly.graph_objects as go

# Every column whose name contains 'reward', summarized together with h and c.
all_rewards = [col for col in df.columns if 'reward' in col]
summary_cols = all_rewards + ['h', 'c']

# Column-wise aggregates
means = df[summary_cols].mean()
medians = df[summary_cols].median()

# One bar chart per aggregate, mean first and then median.
for label, stats in (("Mean", means), ("Median", medians)):
    fig = go.Figure(data=[go.Bar(x=stats.index, y=stats.values)])
    fig.update_layout(title=f"{label} Values of Rewards, H, and C", yaxis_title=f"{label} Value")
    fig.show()

In [9]:
# Show the first 50 rows of df with all columns computed so far.
df.head(50)

Unnamed: 0,h,c,travel_time,total,intervals,alpha,beta,rewards,u_rl,u0,u_star0,optimal_rewards0,u3,u_star3,optimal_rewards3,u5,u_star5,optimal_rewards5,direct_leave_rewards
0,0.15,25,60.621185,27,"[5.878209013634051, 4.373432982915695, 3.23429...",3,1.5,-3.897753,79.411231,105.396252,97.37899,-1.202589,91.910319,84.939653,-1.0456,84.329599,76.704829,-1.143715,-4.69337
1,0.15,25,18.146751,20,"[5.137656988310638, 1.7355802975953494, 1.5864...",3,1.5,-3.855202,61.019554,86.720902,68.524882,-2.729403,78.261255,56.379238,-3.282302,69.883023,48.384471,-3.224783,-9.017176
2,0.15,25,79.29292,19,"[8.382835695685449, 3.872076567299527, 2.50512...",3,1.5,-25.0,174.808377,95.084732,64.458332,-4.59396,80.324699,52.370237,-4.193169,70.953394,44.424561,-3.979325,-0.154767
3,0.15,25,20.870627,18,"[2.500494417660291, 5.965747440208398, 3.18854...",3,1.5,-1.728823,70.524938,82.050428,60.409231,-3.246179,70.395636,48.384471,-3.301675,58.264811,40.49366,-2.665673,-7.428751
4,0.15,25,83.597319,33,"[14.03371684311505, 2.4216381490912866, 2.5400...",3,1.5,-25.0,219.152364,135.242861,122.507602,-1.910289,116.247476,109.906371,-0.951166,109.573166,101.545779,-1.204108,-4.897524
5,0.15,25,12.788282,22,"[1.4463989040253769, 0.8880045332733784, 5.213...",3,1.5,-4.641201,59.158179,90.099519,76.704829,-2.009204,82.551955,64.458332,-2.714043,74.893486,56.379238,-2.777137,-10.464551
6,0.15,25,4.5,19,"[8.06800518899282, 7.641310518832496, 4.418841...",3,1.5,-5.997483,66.944844,106.928062,64.458332,-6.37046,86.799905,52.370237,-5.16445,75.866513,44.424561,-4.716293,-12.344986
7,0.15,25,28.251344,30,"[1.8692939008482097, 0.683205563824656, 5.3766...",3,1.5,-5.107506,97.310871,131.360914,109.906371,-3.218181,123.431778,97.37899,-3.907918,115.527862,89.075261,-3.96789,-14.277065
8,0.15,25,46.941486,24,"[4.131036185947485, 3.124594461505142, 5.59705...",3,1.5,-3.621426,68.422706,92.565548,84.939653,-1.143884,79.712867,72.607468,-1.06581,73.8586,64.458332,-1.41004,-4.915707
9,0.15,25,24.390325,30,"[7.116759975782401, 2.707899424621283, 8.82758...",3,1.5,-2.718486,92.986979,111.11022,109.906371,-0.180577,92.457976,97.37899,-25.0,87.823151,89.075261,-25.0,-10.210148
