In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
from poly_matrix import polynomial, create_poly_matrix # custome type we implement
from city_env import CityEnv  # gym environment we implement
# CITATION:
# Raffin, A., Hill, A., Gleave, A., Kanervisto, A., Dormann, N., et al. (2021). Stable-Baselines3:
# Reliable reinforcement learning implementations. https://github.com/DLR-RM/stable-baselines3.
from stable_baselines3 import PPO, DQN # out-of-the-box PPO model 
import gymnasium as gym # off-the-shelf generic environment
from stable_baselines3.common.env_util import make_vec_env
import matplotlib.pyplot as plt
import pickle as pkl


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Initialize environment
# problem size: number of city locations (N) and the time horizon (T)
# handed to both create_poly_matrix and CityEnv
N, T = 80, 24
poly_matrix = create_poly_matrix(N, T)


def make_env():
    """Factory for make_vec_env: build one CityEnv from the notebook-level poly_matrix, N and T."""
    city_env = CityEnv(poly_matrix=poly_matrix, N=N, time_horizon=T)
    return city_env


# 50 copies of the environment wrapped into a single vectorized environment.
# NOTE(review): make_vec_env falls back to DummyVecEnv (sequential, single
# process) when vec_env_cls is not given -- confirm whether SubprocVecEnv was
# intended if parallelization across CPU cores is actually required.
env = make_vec_env(make_env, n_envs=50)

In [None]:
# Base name shared by the saved model and every pickled artifact
savefile_name = 'N80_prod_run'

# Fixed benchmark set used to score the model after each training epoch.
# NOTE(review): no random seed is set, so a fresh kernel draws a different
# benchmark set; reproducibility relies on the pickled copies.
benchmark_size = 100
# per scenario: a 0/1 vector of length N (destinations), ...
destination_set = [
    np.random.choice([0, 1], size=N)
    for _ in range(benchmark_size)
]
# ... a starting node index drawn from 0 .. N-1, ...
start_node_set = np.random.choice(np.arange(N), size=benchmark_size)
# ... and a starting time drawn uniformly from [0, 24)
start_time_set = np.random.uniform(low=0, high=24, size=benchmark_size)

In [None]:
# save critical environment parameters and benchmark set for future analysis.
# Each object is pickled to '<label>_<savefile_name>.pkl'.
# Fix: previously poly_matrix was dumped into all four files, so the
# destination / start-node / start-time benchmark sets were never persisted.
artifacts = {
    'poly_matrix': poly_matrix,
    'destination_set': destination_set,
    'start_node_set': start_node_set,
    'start_time_set': start_time_set,
}
for label, payload in artifacts.items():
    # the context manager closes the file even if pickling raises
    with open(f'{label}_{savefile_name}.pkl', 'wb') as f:
        pkl.dump(payload, f)


In [None]:
# define helper function for model evaluation
def eval_func(model):
    """Roll the model out deterministically on every benchmark scenario.

    Each scenario fixes the destinations, start time and start node taken from
    the notebook-level benchmark sets and is played on a freshly created
    CityEnv until the environment reports ``done`` or ``truncated``.

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair (e.g. the PPO model).

    Returns:
        Tuple ``(average_time, average_reward)``: the mean over the benchmark
        set of ``eval_env.current_time - start_time`` and of the summed
        per-step reward of each trajectory.
    """
    eval_env = CityEnv(poly_matrix=poly_matrix, N=N, time_horizon=T)
    average_time = 0
    average_reward = 0
    # iterate through each element of the benchmark set
    for i in range(benchmark_size):
        scenario = {
            "destinations": destination_set[i].copy(),  # copy: keep the benchmark set pristine
            "current_time": start_time_set[i],
            "current_node": start_node_set[i],
        }
        obs, _ = eval_env.reset(eval_params=scenario)
        episode_return = 0.0
        # at least one step is always taken, then stop on done/truncated
        while True:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = eval_env.step(action)
            episode_return += reward
            if truncated and not done:
                # warn when the trajectory was cut off rather than finished
                print(f'WARNING: TRUNCATION on benchmark example {i}')
            if done or truncated:
                break
        elapsed = eval_env.current_time - start_time_set[i]
        average_time += elapsed / benchmark_size
        average_reward += episode_return / benchmark_size

    return average_time, average_reward

In [None]:
# helper function for poly_matrix visualization
def visualize_poly_matrix(poly_matrix):
    """Plot every entry of ``poly_matrix`` on one shared matplotlib figure.

    For each index pair (i, j) with i, j in 0 .. N-1, the entry's ``eval(t)``
    is sampled at t = 0 .. T-1 and drawn as one line. N and T are read from
    the notebook-level globals.

    Args:
        poly_matrix: 2-D indexable whose entries expose ``eval(t)``.
    """
    index_pairs = ((i, j) for i in range(N) for j in range(N))
    time_points = range(T)
    for i, j in index_pairs:
        entry = poly_matrix[i][j]
        vals = []
        for t in time_points:
            vals.append(entry.eval(t))
        # labels are attached to each line, but no legend is drawn
        plt.plot(vals, label=f"poly_matrix[{i}][{j}]")
    plt.show()

In [28]:
# create a fresh PPO agent on the vectorized environment and write the initial
# checkpoint; the training loop reloads it from savefile_name.
# NOTE: re-running this cell overwrites any checkpoint saved under that name.
model = PPO(policy="MlpPolicy", env=env, verbose=0)
model.save(savefile_name)

In [None]:
# core training loop
# Runs 50 epochs of 500_000 timesteps each. After every epoch the model is
# scored on the fixed benchmark set and checkpointed to savefile_name.
# The two lists below collect one entry per epoch (reset whenever this cell is re-run).
benchmark_average_travel_times = []
benchmark_average_rewards = []
for i in range(50):
    print('Training epoch: ', i)
    # resume from the checkpoint on disk (initial save or previous epoch's save)
    model = PPO.load(savefile_name, env = env)
    model.learn(total_timesteps=500_000)
    # eval_func returns (average travel time, average trajectory reward)
    benchmark_time, average_reward = eval_func(model)
    print('Benchmark time', benchmark_time, 'Average reward', average_reward)
    benchmark_average_travel_times.append(benchmark_time)
    benchmark_average_rewards.append(average_reward)
    # overwrite the checkpoint so the next epoch (or a later session) continues from here
    model.save(savefile_name)

In [None]:
# extract runtimes for histogram creation
# One entry per benchmark scenario: the environment clock at the end of the
# rollout minus the scenario's start time, using the current `model`.
times = []
eval_env = CityEnv(poly_matrix=poly_matrix, N=N, time_horizon=T)
for i in range(benchmark_size):
    benchmark_element = {
        "destinations": destination_set[i].copy(),  # copy: keep the benchmark set pristine
        "current_time": start_time_set[i],
        "current_node": start_node_set[i],
    }
    obs, _ = eval_env.reset(eval_params=benchmark_element)
    # at least one step is always taken, then stop on done/truncated
    while True:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, _ = eval_env.step(action)
        if truncated and not done:
            print(f'WARNING: TRUNCATION on benchmark example {i}')
        if done or truncated:
            break
    times.append(eval_env.current_time - start_time_set[i])