## Testing of the Graph Environment

Imports

In [22]:
import numpy as np
import pandas as pd
import json
import os
import shutil
import sys
import gym

import ray


In [3]:
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG, APEX_DEFAULT_CONFIG

Import the Environment

In [24]:
sys.path.insert(0,"")

from ManhattanGraph import ManhattanGraph
from gym_graphenv.envs.GraphworldManhattan import GraphEnv

In [25]:
env=GraphEnv()
env.render()

Run random steps without Ray Trainer

In [None]:
def run_one_episode (env):
    env.reset()
    sum_reward = 0
    for i in range(30):
        #print(env.action_space)
        env.available_actions = env.get_available_actions()
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        sum_reward+=reward
        #env.render()
        if done:
            print("DELIVERY DONE! sum_reward: ",sum_reward, " time: ",env.time,  "deadline time: ", env.deadline,"pickup time: ", env.pickup_time )
            break

        print("sum_reward: ",sum_reward, " time: ",env.time, "deadline time: ", env.deadline, "pickup time: ", env.pickup_time)
    return sum_reward

for i in range(1):
    run_one_episode (env)

get_available_actions() Execution time: 0.25038623809814453 seconds
get_available_actions() Execution time: 0.24793601036071777 seconds
action ==  ownRide 
Compute_reward() Execution time: 9.5367431640625e-07 seconds
Step() Execution time: 0.014837980270385742 seconds
DELIVERY DONE! sum_reward:  100  time:  2016-01-01 07:35:09.700000 deadline time:  2016-01-01 10:15:00 pickup time:  2016-01-01 07:15:00


Initialize Ray

In [None]:
ray.init()

2022-04-22 11:35:33,075	INFO services.py:1374 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:6379',
 'object_store_address': '/tmp/ray/session_2022-04-22_11-35-31_020066_16783/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-04-22_11-35-31_020066_16783/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2022-04-22_11-35-31_020066_16783',
 'metrics_export_port': 63548,
 'gcs_address': '127.0.0.1:60191',
 'node_id': '4e7e401e4a6a22dc61bb9d8e6c3857d41500f231cffdbae15804984f'}

Set DQN trainer configuration

In [None]:
dqn_config = DEFAULT_CONFIG.copy()
dqn_config['num_workers'] = 1
dqn_config["train_batch_size"] = 400
dqn_config["gamma"] = 0.95
dqn_config["n_step"] = 10
dqn_config["framework"] = "torch"
#num_gpus and other gpu parameters in order to train with gpu
#dqn_config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0")) 

Set DQN - Rainbow configuration

In [None]:
rainbow_config = DEFAULT_CONFIG.copy()

rainbow_config = DEFAULT_CONFIG.copy()
rainbow_config['num_workers'] = 1
rainbow_config["train_batch_size"] = 400
rainbow_config["gamma"] = 0.95
rainbow_config["n_step"] = 10
rainbow_config["framework"] = "torch"
#num_gpus and other gpu parameters in order to train with gpu
#rainbow_config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0")) 

#rainbow parameters

# N-step Q learning
rainbow_config["n_step"]= 5 #[between 1 and 10]
# Whether to use noisy network
rainbow_config["noisy"] = True
# Number of atoms for representing the distribution of return. When
# this is greater than 1, distributional Q-learning is used.
# the discrete supports are bounded by v_min and v_max
rainbow_config["num_atoms"] =2 #[more than 1]
rainbow_config["v_min"] =-100
rainbow_config["v_max"]=100 # (set v_min and v_max according to your expected range of returns).

Set DQN - Ape-X configuration 

In [None]:
apex_config=APEX_DEFAULT_CONFIG.copy()

Initialize the DQN trainer, Rainbow DQN and DQN Ape-X

In [None]:
trainer_dqn = DQNTrainer(dqn_config,GraphEnv )

2022-04-22 11:35:34,693	INFO simple_q.py:153 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
2022-04-22 11:35:34,694	INFO trainer.py:790 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.2540102005004883 seconds
[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.24943995475769043 seconds


2022-04-22 11:35:58,630	INFO trainable.py:125 -- Trainable.setup took 23.940 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [None]:
trainer_rainbow=DQNTrainer(rainbow_config,GraphEnv )

2022-04-23 14:21:04,767	INFO services.py:1374 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(RolloutWorker pid=30508)[0m get_available_actions() Execution time: 0.2821159362792969 seconds
[2m[36m(RolloutWorker pid=30508)[0m get_available_actions() Execution time: 0.3127598762512207 seconds


2022-04-23 14:21:37,475	INFO trainable.py:125 -- Trainable.setup took 35.528 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [None]:
#trainer_apex=DQNTrainer(apex_config,GraphEnv )

Define the path where the results of the trainer should be saved

In [9]:
checkpoint_root = "tmp/dqn/graphworld"
shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)   # clean up old runs

Run the trainer

In [10]:
results = []
episode_data = []
episode_json = []
n_iter=10

for n in range(n_iter):
    result = trainer_dqn.train()
    results.append(result)
    print("TEST",n)
    
    episode = {'n': n, 
               'episode_reward_min': result['episode_reward_min'], 
               'episode_reward_mean': result['episode_reward_mean'], 
               'episode_reward_max': result['episode_reward_max'],  
               'episode_len_mean': result['episode_len_mean']
              }
    
    episode_data.append(episode)
    episode_json.append(json.dumps(episode))
    file_name = trainer_dqn.save(checkpoint_root)
    
    print(f'{n+1:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}, len mean: {result["episode_len_mean"]:8.4f}. Checkpoint saved to {file_name}')

[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.2527930736541748 seconds
[2m[36m(RolloutWorker pid=16857)[0m action == wait 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 9.5367431640625e-07 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 7.009506225585938e-05 seconds
[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.2461402416229248 seconds
[2m[36m(RolloutWorker pid=16857)[0m action ==  ownRide 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 1.9073486328125e-06 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 0.010963916778564453 seconds
[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.24532389640808105 seconds
[2m[36m(RolloutWorker pid=16857)[0m action ==  ownRide 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 0.0 seconds
[2m[36m(RolloutWorker pid=16857)[0m St



[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.25438904762268066 seconds
[2m[36m(RolloutWorker pid=16857)[0m action ==  ownRide 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 9.5367431640625e-07 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 0.005262136459350586 seconds
[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.2569620609283447 seconds
[2m[36m(RolloutWorker pid=16857)[0m action == wait 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 3.814697265625e-06 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 8.0108642578125e-05 seconds
[2m[36m(RolloutWorker pid=16857)[0m action == wait 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 1.9073486328125e-06 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 4.291534423828125e-05 seconds
[2m[36m(RolloutWorker pid=16857)[0m action 



[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.24400091171264648 seconds
[2m[36m(RolloutWorker pid=16857)[0m action == wait 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 2.86102294921875e-06 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 5.1975250244140625e-05 seconds
TEST 0
  1: Min/Mean/Max reward:  99.7500/ 99.9723/100.0000, len mean:   1.9980. Checkpoint saved to tmp/dqn/graphworld/checkpoint_000001/checkpoint-1
[2m[36m(RolloutWorker pid=16857)[0m action ==  ownRide 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 2.1457672119140625e-06 seconds
[2m[36m(RolloutWorker pid=16857)[0m Step() Execution time: 0.011554241180419922 seconds
[2m[36m(RolloutWorker pid=16857)[0m get_available_actions() Execution time: 0.25821995735168457 seconds
[2m[36m(RolloutWorker pid=16857)[0m action ==  ownRide 
[2m[36m(RolloutWorker pid=16857)[0m Compute_reward() Execution time: 9.

In [11]:
results

[{'episode_reward_max': 100.0,
  'episode_reward_min': 99.75,
  'episode_reward_mean': 99.97227777777778,
  'episode_len_mean': 1.998,
  'episode_media': {},
  'episodes_this_iter': 500,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [100.0,
    100.0,
    100.0,
    100.0,
    100.0,
    100.0,
    100.0,
    99.97222222222223,
    100.0,
    100.0,
    99.88888888888889,
    100.0,
    100.0,
    99.86111111111111,
    100.0,
    99.97222222222223,
    99.97222222222223,
    100.0,
    100.0,
    100.0,
    99.94444444444444,
    100.0,
    100.0,
    99.97222222222223,
    99.97222222222223,
    99.97222222222223,
    99.97222222222223,
    100.0,
    99.97222222222223,
    99.97222222222223,
    100.0,
    99.88888888888889,
    99.97222222222223,
    99.97222222222223,
    99.94444444444444,
    100.0,
    99.75,
    99.97222222222223,
    99.88888888888889,
    100.0,
    100.0,
    100.

Testing of the trainer

In [12]:
trainer.restore(file_name)
env = gym.make("graphworld-v0")
state = env.reset()

2022-04-22 12:09:13,152	INFO trainable.py:472 -- Restored on 127.0.0.1 from checkpoint: tmp/dqn/graphworld/checkpoint_000010/checkpoint-10
2022-04-22 12:09:13,153	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': 4000, '_time_total': 1993.247197151184, '_episodes_total': 7414}


get_available_actions() Execution time: 0.2622261047363281 seconds
get_available_actions() Execution time: 0.2613539695739746 seconds


In [13]:
sum_reward = 0
n_step = 20
for step in range(n_step):
    action = trainer_dqn.compute_action(state)
    state, reward, done, info = env.step(action)
    sum_reward += reward
    #env.render()
    if done == 1:
        print("cumulative reward", sum_reward)
        state = env.reset()
        sum_reward = 0



action ==  ownRide 
Compute_reward() Execution time: 9.5367431640625e-07 seconds
Step() Execution time: 0.01560068130493164 seconds
cumulative reward 100
get_available_actions() Execution time: 0.2573566436767578 seconds
action ==  ownRide 
Compute_reward() Execution time: 2.1457672119140625e-06 seconds
Step() Execution time: 0.007878780364990234 seconds
cumulative reward 100
get_available_actions() Execution time: 0.2552003860473633 seconds
action ==  ownRide 
Compute_reward() Execution time: 9.5367431640625e-07 seconds
Step() Execution time: 0.0023260116577148438 seconds
cumulative reward 100
get_available_actions() Execution time: 0.2516210079193115 seconds
action ==  ownRide 
Compute_reward() Execution time: 9.5367431640625e-07 seconds
Step() Execution time: 0.010392904281616211 seconds
cumulative reward 100
get_available_actions() Execution time: 0.2516932487487793 seconds
action ==  ownRide 
Compute_reward() Execution time: 0.0 seconds
Step() Execution time: 0.005363941192626953 

In [14]:
ray.shutdown()