In [None]:
!pip install ray[rllib] tensorflow
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [2]:
!pip install numpy --upgrade



In [9]:
!pip install -U tensorflow-probability==0.19.0

Collecting tensorflow-probability==0.19.0
  Downloading tensorflow_probability-0.19.0-py2.py3-none-any.whl (6.7 MB)
     ---------------------------------------- 6.7/6.7 MB 3.5 MB/s eta 0:00:00
Installing collected packages: tensorflow-probability
Successfully installed tensorflow-probability-0.19.0


In [15]:
import argparse
import gymnasium as gym
import os

import numpy as np
#import ray
from ray.air import Checkpoint
from ray.air.config import RunConfig
from ray.train.rl.rl_predictor import RLPredictor
from ray.train.rl.rl_trainer import RLTrainer
from ray.air.config import ScalingConfig
from ray.air.result import Result
from ray.rllib.algorithms.bc import BC
from ray.tune.tuner import Tuner

In [4]:
def train_rl_ppo_online(num_workers: int, use_gpu: bool = False) -> Result:
    print("Starting online training")
    trainer = RLTrainer(
        run_config=RunConfig(stop={"training_iteration": 5}),
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        algorithm="PPO",
        config={
            "env": "CartPole-v1",
            "framework": "tf",
        },
    )
    # Todo (krfricke/xwjiang): Enable checkpoint config in RunConfig
    # result = trainer.fit()
    tuner = Tuner(
        trainer,
        _tuner_kwargs={"checkpoint_at_end": True},
    )
    result = tuner.fit()[0]
    return result

In [5]:
def evaluate_using_checkpoint(checkpoint: Checkpoint, num_episodes) -> list:
    predictor = RLPredictor.from_checkpoint(checkpoint)

    env = gym.make("CartPole-v1")

    rewards = []
    for i in range(num_episodes):
        obs = env.reset()
        reward = 0.0
        done = False
        while not done:
            action = predictor.predict(np.array([obs]))
            obs, r, done, _ = env.step(action[0])
            reward += r
        rewards.append(reward)

    return rewards

In [10]:
result = train_rl_ppo_online(num_workers=2, use_gpu=False)

Starting online training


2023-02-28 16:33:19,992	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Current time:,2023-02-28 16:34:16
Running for:,00:00:52.57
Memory:,11.4/15.9 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
AIRPPO_3cf6c_00000,TERMINATED,127.0.0.1:23572,5,27.5592,20000,128.79,500,15,128.79


2023-02-28 16:33:24,143	INFO algorithm_config.py:2899 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2023-02-28 16:33:24,179	INFO algorithm_config.py:2899 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(AIRPPO pid=23572)[0m 2023-02-28 16:33:32,526	INFO algorithm_config.py:2899 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(AIRPPO pid=23572)[0m 2023-02-28 16:33:3

Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
AIRPPO_3cf6c_00000,20000,"{'ObsPreprocessorConnector_ms': 0.009012222290039062, 'StateBufferConnector_ms': 0.010036945343017578, 'ViewRequirementAgentConnector_ms': 0.12197756767272949}","{'num_env_steps_sampled': 20000, 'num_env_steps_trained': 20000, 'num_agent_steps_sampled': 20000, 'num_agent_steps_trained': 20000}",{},2023-02-28_16-34-16,True,128.79,{},500,128.79,15,13,346,fdb77e61f1684ee0b76397ace8c72295,LAPTOP-P1JP2TCI,"{'learner': {'default_policy': {'learner_stats': {'cur_kl_coeff': 0.30000001192092896, 'cur_lr': 4.999999873689376e-05, 'total_loss': 9.772552, 'policy_loss': -0.019828321, 'vf_loss': 9.791359, 'vf_explained_var': -0.013446819, 'kl': 0.0033996392, 'entropy': 0.54818004, 'entropy_coeff': 0.0, 'model': {}}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 4185.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 20000, 'num_env_steps_trained': 20000, 'num_agent_steps_sampled': 20000, 'num_agent_steps_trained': 20000}",5,127.0.0.1,20000,20000,20000,4000,20000,4000,0,2,0,0,4000,"{'cpu_util_percent': 28.483333333333334, 'ram_util_percent': 71.31666666666668}",23572,{},{},{},"{'mean_raw_obs_processing_ms': 0.34509124464907015, 'mean_inference_ms': 0.8267897491329514, 'mean_action_processing_ms': 0.14310844735188502, 'mean_env_wait_ms': 0.04206508927980401, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 500.0, 'episode_reward_min': 15.0, 'episode_reward_mean': 128.79, 'episode_len_mean': 128.79, 'episode_media': {}, 'episodes_this_iter': 13, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [17.0, 132.0, 27.0, 17.0, 66.0, 30.0, 29.0, 40.0, 57.0, 29.0, 61.0, 28.0, 129.0, 35.0, 50.0, 29.0, 53.0, 27.0, 65.0, 27.0, 28.0, 45.0, 45.0, 50.0, 30.0, 64.0, 17.0, 69.0, 23.0, 37.0, 74.0, 230.0, 117.0, 262.0, 137.0, 27.0, 79.0, 48.0, 139.0, 178.0, 76.0, 65.0, 154.0, 81.0, 134.0, 106.0, 15.0, 89.0, 135.0, 84.0, 69.0, 32.0, 93.0, 50.0, 51.0, 131.0, 43.0, 92.0, 153.0, 100.0, 120.0, 118.0, 130.0, 110.0, 96.0, 98.0, 39.0, 117.0, 65.0, 177.0, 87.0, 255.0, 110.0, 169.0, 270.0, 211.0, 130.0, 123.0, 244.0, 232.0, 182.0, 149.0, 213.0, 303.0, 232.0, 273.0, 281.0, 267.0, 252.0, 500.0, 331.0, 415.0, 345.0, 178.0, 376.0, 228.0, 201.0, 227.0, 236.0, 389.0], 'episode_lengths': [17, 132, 27, 17, 66, 30, 29, 40, 57, 29, 61, 28, 129, 35, 50, 29, 53, 27, 65, 27, 28, 45, 45, 50, 30, 64, 17, 69, 23, 37, 74, 230, 117, 262, 137, 27, 79, 48, 139, 178, 76, 65, 154, 81, 134, 106, 15, 89, 135, 84, 69, 32, 93, 50, 51, 131, 43, 92, 153, 100, 120, 118, 130, 110, 96, 98, 39, 117, 65, 177, 87, 255, 110, 169, 270, 211, 130, 123, 244, 232, 182, 149, 213, 303, 232, 273, 281, 267, 252, 500, 331, 415, 345, 178, 376, 228, 201, 227, 236, 389]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.34509124464907015, 'mean_inference_ms': 0.8267897491329514, 'mean_action_processing_ms': 0.14310844735188502, 'mean_env_wait_ms': 0.04206508927980401, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.009012222290039062, 'StateBufferConnector_ms': 0.010036945343017578, 'ViewRequirementAgentConnector_ms': 0.12197756767272949}}",27.5592,4.40509,27.5592,"{'training_iteration_time_ms': 5504.588, 'load_time_ms': 0.2, 'load_throughput': 20025323.466, 'learn_time_ms': 2891.284, 'learn_throughput': 1383.468, 'synch_weights_time_ms': 3.101}",1677598456,0,20000,5,3cf6c_00000,15.9019


2023-02-28 16:34:17,436	INFO tune.py:798 -- Total run time: 53.45 seconds (52.55 seconds for the tuning loop).


In [11]:
result

Result(metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'cur_kl_coeff': 0.30000001192092896, 'cur_lr': 4.999999873689376e-05, 'total_loss': 9.772552, 'policy_loss': -0.019828321, 'vf_loss': 9.791359, 'vf_explained_var': -0.013446819, 'kl': 0.0033996392, 'entropy': 0.54818004, 'entropy_coeff': 0.0, 'model': {}}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 4185.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 20000, 'num_env_steps_trained': 20000, 'num_agent_steps_sampled': 20000, 'num_agent_steps_trained': 20000}, 'sampler_results': {'episode_reward_max': 500.0, 'episode_reward_min': 15.0, 'episode_reward_mean': 128.79, 'episode_len_mean': 128.79, 'episode_media': {}, 'episodes_this_iter': 13, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [17.0, 132.0, 27.0, 17.0, 66.0, 30

In [16]:
num_eval_episodes = 3

rewards = evaluate_using_checkpoint(result.checkpoint, num_episodes=num_eval_episodes)
print(f"Average reward over {num_eval_episodes} episodes: " f"{np.mean(rewards)}")

2023-02-28 16:35:49,000	INFO policy.py:1214 -- Policy (worker=local) running on CPU.
2023-02-28 16:35:49,001	INFO tf_policy.py:171 -- Found 0 visible cuda devices.
2023-02-28 16:35:49,171	INFO dynamic_tf_policy_v2.py:710 -- Adding extra-action-fetch `action_prob` to view-reqs.
2023-02-28 16:35:49,172	INFO dynamic_tf_policy_v2.py:710 -- Adding extra-action-fetch `action_logp` to view-reqs.
2023-02-28 16:35:49,173	INFO dynamic_tf_policy_v2.py:710 -- Adding extra-action-fetch `action_dist_inputs` to view-reqs.
2023-02-28 16:35:49,175	INFO dynamic_tf_policy_v2.py:710 -- Adding extra-action-fetch `vf_preds` to view-reqs.
2023-02-28 16:35:49,175	INFO dynamic_tf_policy_v2.py:722 -- Testing `postprocess_trajectory` w/ dummy batch.


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.