In [1]:
import ray
from ray.rllib.agents import cql, Trainer, pg
import gym
from gym.wrappers import RecordVideo
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np
from IPython.display import HTML, Video

In [2]:
config = {
    "env": "Pendulum-v1",
    "framework": "torch",
}

In [3]:
VIDEOS_PATH = Path.cwd().parent / "videos"
EXPERIMENT_PATH = Path.cwd().parent / "experience"
VIDEOS_PATH.mkdir(exist_ok=True, parents=True)
EXPERIMENT_PATH.mkdir(exist_ok=True, parents=True)

In [4]:
def evaluation(env: gym.Env, agent = None, show_video = False):
    env = RecordVideo(env, VIDEOS_PATH / config["env"])
    obs = env.reset()
    done = False
    total_steps = 1 
    while not done:
        if agent:
            action = agent.compute_single_action(obs)
        else:
            action = env.action_space.sample()

        obs, reward, done, info = env.step(action)
    env.close()

    if show_video:
        from IPython.display import Video
        video_path = list((VIDEOS_PATH / config["env"]).glob("*.mp4"))[0]
        return Video(video_path)

In [6]:
from ray.tune.logger import pretty_print
def collect_experience(num_episodes):
    config["output"] = str(EXPERIMENT_PATH)
    env = gym.make(config["env"])
    agent = pg.PGTrainer(config=config)
    for i in range(num_episodes):
        print(pretty_print(agent.train()))

collect_experience(5)

agent_timesteps_total: 200
counters:
  num_agent_steps_sampled: 200
  num_agent_steps_trained: 200
  num_env_steps_sampled: 200
  num_env_steps_trained: 200
custom_metrics: {}
date: 2022-06-23_16-03-21
done: false
episode_len_mean: 200.0
episode_media: {}
episode_reward_max: -878.1567108044175
episode_reward_mean: -878.1567108044175
episode_reward_min: -878.1567108044175
episodes_this_iter: 1
episodes_total: 1
experiment_id: 1a1ab261c6fd422eb88a2a97d8787eef
hostname: G990FXA8370E
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        allreduce_latency: 0.0
        policy_loss: -358.562744140625
      model: {}
  num_agent_steps_sampled: 200
  num_agent_steps_trained: 200
  num_env_steps_sampled: 200
  num_env_steps_trained: 200
iterations_since_restore: 1
node_ip: 172.29.40.230
num_agent_steps_sampled: 200
num_agent_steps_trained: 200
num_env_steps_sampled: 200
num_env_steps_sampled_this_iter: 200
num_env_steps_trained: 200
num_env_steps_trained_this

In [10]:
config["input"] = [str(path) for path in list(EXPERIMENT_PATH.glob("*.json"))]
config["output"] = None
config["input"]

['/home/napnel/experience/output-2022-06-23_16-03-21_worker-0_0.json']

In [11]:
import os
print("data_file={} exists={}".format(EXPERIMENT_PATH, os.path.isfile(EXPERIMENT_PATH)))

data_file=/home/napnel/experience exists=False


In [12]:
agent = cql.CQLTrainer(config=config, env=config["env"])
env = gym.make(config["env"])

for i in range(10):
    results = agent.train()
    episode = results.get("episodes_total")
    step = results.get("timesteps_total")
    reward = results.get("episode_reward_mean")
    print(f"Iter {i} | Episode {episode} | Step {step} | Reward {reward}")
    if step >= 100000:
        break


Loaded 5 batches (1000 ts) into the replay buffer, which has capacity 1000000.


AssertionError: step() needs to return a dict.

In [None]:
evaluation(env, agent)