In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
from poly_matrix import polynomial, create_poly_matrix
from city_env import CityEnv  # make sure the env class is renamed properly if needed
from stable_baselines3 import PPO
import gymnasium as gym
from stable_baselines3.common.env_util import make_vec_env
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Environment setup: problem dimensions, the shared polynomial matrix, and the
# vectorized training environment built from them.
N, T = 40, 24  # N and T are forwarded to CityEnv as `N` and `time_horizon`
poly_matrix = create_poly_matrix(N, T)


def make_env():
    """Return a new CityEnv configured with the module-level poly_matrix, N and T.

    Used as the factory handed to ``make_vec_env``; every call constructs a
    separate CityEnv instance that shares the same ``poly_matrix`` object.
    """
    city_env = CityEnv(poly_matrix=poly_matrix, N=N, time_horizon=T)
    return city_env


# 50 environment copies created through the factory above.
env = make_vec_env(make_env, n_envs=50)

In [5]:
# Initialize benchmark set and savefile for model
benchmark_size = 100

# The benchmark must be the same set of scenarios on every run, otherwise scores
# logged in different kernel sessions (while the checkpoint in `savefile_name`
# persists on disk) are not comparable. A dedicated seeded Generator keeps the
# benchmark fixed without touching the global NumPy random state.
benchmark_seed = 0
benchmark_rng = np.random.default_rng(benchmark_seed)

# One 0/1 vector of length N per scenario, passed to the env as "destinations".
destination_set = [benchmark_rng.choice([0, 1], size=N) for _ in range(benchmark_size)]
# Start node index in [0, N) for each scenario.
start_node_set = benchmark_rng.choice(np.arange(N), size=benchmark_size)
# Start time drawn uniformly from [0, 24) for each scenario.
# NOTE(review): 24 equals the current value of T; confirm whether this bound is
# meant to track T if the time horizon is ever changed.
start_time_set = benchmark_rng.uniform(low=0, high=24, size=benchmark_size)

# Base name of the file the model is saved to / loaded from.
savefile_name = 'N40_test'

In [10]:
# define helper function for model evaluation
def eval_func(model):
    """Evaluate ``model`` on every benchmark scenario and return averaged metrics.

    Each of the ``benchmark_size`` scenarios is described by the module-level
    ``destination_set``, ``start_time_set`` and ``start_node_set``. Every
    scenario is played to completion in a single ``CityEnv`` instance (reset
    between scenarios) using deterministic actions from ``model.predict``.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that returns
            an ``(action, state)`` pair, e.g. the PPO agent trained in this
            notebook.

    Returns:
        tuple: ``(average_time, avg_reward)`` where ``average_time`` is the mean
        of ``eval_env.current_time`` read at the end of each episode and
        ``avg_reward`` is the mean of the per-episode reward sums.

    Raises:
        ValueError: If ``benchmark_size`` is not positive, since no average can
            be computed over an empty benchmark.
    """
    if benchmark_size <= 0:
        raise ValueError("benchmark_size must be positive to compute averages")

    eval_env = CityEnv(poly_matrix=poly_matrix, N=N, time_horizon=T)
    time_sum = 0.0
    reward_sum = 0.0

    for i in range(benchmark_size):
        benchmark_element = {
            # Copy so the env cannot modify the stored benchmark array.
            "destinations": destination_set[i].copy(),
            "current_time": start_time_set[i],
            "current_node": start_node_set[i],
        }
        obs, _ = eval_env.reset(eval_params=benchmark_element)
        done = False
        truncated = False
        episode_reward = 0.0

        while not (done or truncated):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = eval_env.step(action)
            episode_reward += reward

        # The loop exits only once done or truncated is set, so this check
        # fires exactly when the episode was cut off without finishing.
        if truncated and not done:
            print(f'WARNING: TRUNCATION on benchmark example {i}')

        time_sum += eval_env.current_time
        # Accumulate across episodes; averaging only the final episode's
        # reward would misreport the benchmark score.
        reward_sum += episode_reward

    average_time = time_sum / benchmark_size
    avg_reward = reward_sum / benchmark_size
    return average_time, avg_reward

In [7]:
def visualize_poly_matrix(poly_matrix):
    """Draw every entry of ``poly_matrix`` as a curve on one matplotlib figure.

    For each index pair ``(row, col)`` in ``range(N) x range(N)`` the entry's
    ``eval(t)`` is sampled at ``t = 0, 1, ..., T - 1`` and plotted. ``N`` and
    ``T`` are read from module scope rather than from the argument. Each curve
    is given a label, but no legend is drawn here.

    Args:
        poly_matrix: Doubly-indexable container whose elements expose ``eval(t)``.
    """
    for row in range(N):
        for col in range(N):
            samples = []
            for t in range(T):
                samples.append(poly_matrix[row][col].eval(t))
            plt.plot(samples, label=f"poly_matrix[{row}][{col}]")
    plt.show()

In [8]:
# Build a fresh PPO agent with the "MlpPolicy" policy on the vectorized env and
# write it to `savefile_name` right away, so the training loop has a checkpoint
# to load on its first iteration.
model = PPO(policy="MlpPolicy", env=env, verbose=0)
model.save(savefile_name)

In [None]:
# Per-epoch benchmark history: one entry is appended after every training epoch.
benchmark_average_travel_times, benchmark_average_rewards = [], []

for i in range(50):
    print('Training epoch: ', i)

    # Pick up from the most recent checkpoint on disk, then train further.
    model = PPO.load(savefile_name, env=env)
    model.learn(total_timesteps=10_000)

    # Score the updated model on the benchmark set and record both metrics.
    benchmark_time, average_reward = eval_func(model)
    print('Benchmark time', benchmark_time, 'Average reward', average_reward)
    benchmark_average_travel_times += [benchmark_time]
    benchmark_average_rewards += [average_reward]

    # Persist after evaluation so the next epoch resumes from this state.
    model.save(savefile_name)

Training epoch:  0
