In [1]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Sequence, Dict
import numpy as np
from utils import methods
import statistics

from utils.rl_environments import GammaIntervalsEnv1

In [2]:
from stable_baselines3.common.env_checker import check_env
# Manual smoke test of the environment: reset it, print the hidden episode
# parameters it reports, then step until the episode ends while printing the
# reported state and reward after every step.
c = GammaIntervalsEnv1()
# check_env(c, warn=True)
obs, info = c.reset()
print(info['hidden'])
done = False
while not done:
    # Every candidate in the list is 0, so the sampled action is always 0.
    action = np.random.choice([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    _, reward, terminated, truncated, info = c.step(action)
    # The Gymnasium step API ends an episode through either flag; looking only
    # at `terminated` would spin forever on an episode that is truncated.
    done = bool(terminated or truncated)
    print(info['state'])
    print(reward)

{'alpha': 2, 'beta': 1.0, 'interval': array([1.4326973 , 1.075113  , 1.33615675, 0.78019561, 0.32611095,
       2.24287583, 3.03213719, 2.18641761, 3.05459471, 2.011798  ,
       3.40552239, 1.2138881 , 1.49425536, 0.74879057, 1.84613807,
       1.91477024, 2.13669289, 2.282278  , 1.45581652, 4.79350104,
       4.14274943, 0.23991083, 3.06504517, 3.68023393, 2.7677787 ,
       2.80193152, 3.28099843, 0.07762041, 0.6568814 , 2.79273485,
       0.52008491, 2.34152724]), 'cum_sum_intervals': array([ 1.4326973 ,  2.5078103 ,  3.84396705,  4.62416266,  4.95027361,
        7.19314944, 10.22528663, 12.41170424, 15.46629895, 17.47809695,
       20.88361934, 22.09750744, 23.5917628 , 24.34055337, 26.18669143,
       28.10146167, 30.23815456, 32.52043256, 33.97624908, 38.76975012,
       42.91249955, 43.15241038, 46.21745556, 49.89768949, 52.66546819,
       55.4673997 , 58.74839813, 58.82601854, 59.48289994, 62.27563478,
       62.79571969, 65.13724693])}
{'n': 3, 'N': 29, 'h': 0.57, 'c': 20, '

In [3]:
# Candidate values for each sampled quantity; one value per key is drawn
# whenever a scenario is generated.
DEFAULT_CONFIG = dict(
    alpha_range=range(1, 8),
    beta_range=[round(i * 0.5, 1) for i in range(2, 9)],   # 1.0, 1.5, ..., 4.0
    h_range=[round(i * 0.01, 2) for i in range(6, 61)],    # 0.06, 0.07, ..., 0.60
    c_range=range(20, 30),
    total=range(10, 40),
)

# Single-valued variant: every parameter pinned except the interval count.
CONSTANT_CONFIG = dict(
    alpha_range=[1],
    beta_range=[1],
    h_range=[0.15],
    c_range=[25],
    total=range(10, 40),
)

# The pinned variant is replaced right away by an alias of DEFAULT_CONFIG, so
# everything that reads CONSTANT_CONFIG below actually sees the full ranges.
CONSTANT_CONFIG = DEFAULT_CONFIG

In [4]:
import stable_baselines3
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv



# Train a PPO agent on the environment, round-trip the model through disk, and
# roll out one episode with the reloaded policy while accumulating its reward.
env = GammaIntervalsEnv1(config=CONSTANT_CONFIG)

model = PPO("MlpPolicy", env, verbose=1, device='cpu')

# Start training
model.learn(total_timesteps=80000)

# Save and immediately reload so the rollout below uses the persisted model.
model.save("ppo")
model = PPO.load("ppo")

state, _ = env.reset()
done = False
total_reward = 0

while not done:
    action, _ = model.predict(state)
    state, reward, terminated, truncated, _ = env.step(action)
    # An episode is over when it is terminated OR truncated; checking only
    # `terminated` would never leave the loop on a truncated episode.
    done = bool(terminated or truncated)
    total_reward += reward
    env.render()

print("Total reward:", total_reward)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.78     |
|    ep_rew_mean     | -4.06    |
| time/              |          |
|    fps             | 61       |
|    iterations      | 1        |
|    time_elapsed    | 33       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.17        |
|    ep_rew_mean          | -4.7        |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 2           |
|    time_elapsed         | 64          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014599012 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [5]:
def get_realized_data(config, rng=None):
    """Draw one random scenario from the value ranges in ``config``.

    Parameters
    ----------
    config : mapping
        Must provide the keys ``'alpha_range'``, ``'beta_range'``,
        ``'h_range'``, ``'c_range'`` and ``'total'``; one value is drawn from
        each with ``choice``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (the original behaviour); pass a seeded generator to make the
        draw reproducible.

    Returns
    -------
    tuple
        ``(alpha, beta, h, c, total, intervals, travel_time)`` where
        ``intervals`` holds ``total`` gamma draws with shape ``alpha`` and
        scale ``beta``, and ``travel_time`` is at least ``2 * beta``.
    """
    # The np.random module, RandomState and Generator all expose the
    # choice/gamma/exponential calls used below.
    rng = np.random if rng is None else rng

    alpha = rng.choice(config['alpha_range'])
    beta = rng.choice(config['beta_range'])
    h = rng.choice(config['h_range'])
    c = rng.choice(config['c_range'])
    total = rng.choice(config['total'])
    intervals = rng.gamma(shape=alpha, scale=beta, size=total)

    # Sum of every interval after the first four, minus an exponential offset.
    # NOTE(review): the reason for skipping exactly four intervals is not
    # visible in this notebook - confirm against the environment definition.
    travel_time = intervals[4:].sum() - rng.exponential(scale=beta)
    # Floor the result so it never drops below twice the scale parameter.
    travel_time = max(beta * 2, travel_time)

    return alpha, beta, h, c, total, intervals, travel_time

import pandas as pd

# Sample 10000 scenarios and keep the fields stored per scenario (alpha and
# beta are drawn but not stored). Rows are collected in a list and the frame is
# built once: growing a DataFrame with `df.loc[i] = row` re-allocates on every
# insert and leaves every column with object dtype.
records = []
for i in range(10000):
    alpha, beta, h, c, total, intervals, travel_time = get_realized_data(CONSTANT_CONFIG)
    row = {'h': h, 'c': c, 'travel_time': travel_time, 'total': total, 'intervals': intervals}
    records.append(row)

df = pd.DataFrame(records, columns=['h', 'c', 'travel_time', 'total', 'intervals'])



In [6]:
from stable_baselines3 import PPO
# Evaluate the saved policy on every pre-generated scenario in `df` and record
# the total reward of each episode.
# The training cell saves the model as "ppo"; loading "PPO" only works on
# case-insensitive filesystems, so use the exact saved name.
model = PPO.load("ppo")

env = GammaIntervalsEnv1(config=CONSTANT_CONFIG)
rewards = {}
print(env._get_info())
for i in range(len(df)):
    row = df.iloc[i]
    # Replay the stored scenario through the environment's `row` reset argument.
    state, _ = env.reset(row=row)
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(state)
        state, reward, terminated, truncated, info = env.step(action)
        # Stop on either end-of-episode flag of the Gymnasium step API.
        done = bool(terminated or truncated)
        total_reward += reward
    rewards[i] = total_reward
    # print(f"Episode {i} reward: {total_reward} | h = {info['state']['h']}, c = {info['state']['c']}")
    print(f"Episode {i} reward: {total_reward} | cur_time = {info['state']['cur_time']}")

# Align explicitly on the row labels instead of relying on how a plain dict is
# interpreted when assigned as a column.
df['rewards'] = pd.Series(rewards)
print(f"Average reward: {df['rewards'].mean()}")
df.head()

{'hidden': {'alpha': -1, 'beta': -1, 'interval': -1, 'cum_sum_intervals': -1}, 'state': {'n': -1, 'N': -1, 'h': -1, 'c': -1, 'travel_time': -1, 'cur_time': -1, 'mean_n': -1, 'std_n': -1, 'alpha_hat': -1, 'beta_hat': -1, 'u_star_hat': -1, 'last_update': -1}}
Episode 0 reward: -0.3137199446315283 | cur_time = 3.6259115686991543
Episode 1 reward: -1.8260777020145724 | cur_time = 93.37281594417043
Episode 2 reward: -5.186847499444898 | cur_time = 10.884968145326772
Episode 3 reward: -0.8258657438910885 | cur_time = 3.130100346166207
Episode 4 reward: -0.31536295938980247 | cur_time = 9.962543776533836
Episode 5 reward: -9.56576415613671 | cur_time = 23.623935760770507
Episode 6 reward: -4.855341799874975 | cur_time = 48.32081289870473
Episode 7 reward: -1.780454074122834 | cur_time = 6.882057636594226
Episode 8 reward: -2.2592269245675807 | cur_time = 23.20722627286505
Episode 9 reward: -0.0929312494262436 | cur_time = 5.132318010475235
Episode 10 reward: -1.1602149804625037 | cur_time = 4

Unnamed: 0,h,c,travel_time,total,intervals,rewards
0,0.46,21,8.646211,11,"[2.312758234122157, 0.1645467425795917, 1.0486...",-0.31372
1,0.11,20,168.929431,12,"[26.957855914077548, 29.623774208563827, 33.59...",-1.826078
2,0.34,20,95.852827,32,"[8.400520307854446, 1.3883303642439124, 0.9961...",-5.186847
3,0.58,24,9.048812,13,"[0.05551032124894172, 2.358761033249645, 0.615...",-0.825866
4,0.18,29,30.748894,12,"[4.035882580490654, 3.799990407221247, 2.02667...",-0.315363


In [7]:
import plotly.graph_objects as go

# Summarise rewards, h and c by mean and by median, then draw one bar chart
# per statistic.
summary_columns = ['rewards', 'h', 'c']
means = df[summary_columns].mean()
medians = df[summary_columns].median()

# (statistic values, figure title, y-axis title) for each chart, in display order.
chart_specs = [
    (means, "Mean Values of Rewards, H, and C", "Mean Value"),
    (medians, "Median Values of Rewards, H, and C", "Median Value"),
]

for stats, chart_title, axis_title in chart_specs:
    fig = go.Figure(data=[go.Bar(x=stats.index, y=stats.values)])
    fig.update_layout(title=chart_title, yaxis_title=axis_title)
    fig.show()