In [7]:
"""This is a simple example demonstrating how to clone the behavior of an expert.

Refer to the jupyter notebooks for more detailed examples of how to use the algorithms.
"""
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env

In [8]:
# Seeded generator shared by env construction (below) and rollout sampling.
rng = np.random.default_rng(0)

# Each environment is wrapped in RolloutInfoWrapper, which is needed for
# computing rollouts later in the notebook.
env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=rng,
    post_wrappers=[lambda base_env, _: RolloutInfoWrapper(base_env)],
)

In [9]:
def train_expert(total_timesteps=1_000):
    """Train a PPO expert on the notebook's `env` and return it.

    Args:
        total_timesteps: Value passed to `PPO.learn`. The default of 1_000
            keeps the example fast; pass 100_000 to train a decent expert.

    Returns:
        The PPO model after `learn` has been called on it.
    """
    # note: use `download_expert` instead to download a pretrained, competent expert
    print("Training a expert.")
    expert = PPO(
        policy=MlpPolicy,
        env=env,
        seed=0,
        batch_size=64,
        ent_coef=0.0,
        learning_rate=0.0003,
        n_epochs=10,
        n_steps=64,
    )
    expert.learn(total_timesteps)
    return expert


def download_expert():
    """Load a pretrained expert policy through `load_policy`.

    Uses the "ppo-huggingface" loader with the "HumanCompatibleAI"
    organization and the "seals-CartPole-v0" env name, bound to the
    notebook's `env`.

    Returns:
        Whatever `load_policy` returns for those arguments.
    """
    print("Downloading a pretrained expert.")
    hub_kwargs = {
        "organization": "HumanCompatibleAI",
        "env_name": "seals-CartPole-v0",
    }
    return load_policy("ppo-huggingface", venv=env, **hub_kwargs)


def sample_expert_transitions():
    """Roll out the downloaded expert in `env` and return the collected rollouts.

    Sampling is bounded by `rollout.make_sample_until(min_episodes=50)` and
    draws randomness from the notebook-level `rng`.

    Returns:
        The result of `rollout.rollout`, unmodified. NOTE(review): despite the
        function name, nothing here flattens the rollouts into transitions.
    """
    # expert = train_expert()  # uncomment to train your own expert
    expert = download_expert()

    print("Sampling expert transitions.")
    stop_condition = rollout.make_sample_until(min_timesteps=None, min_episodes=50)
    return rollout.rollout(expert, env, stop_condition, rng=rng)

In [10]:
# Download the expert and collect its rollouts (prints two progress messages).
transitions = sample_expert_transitions()

Downloading a pretrained expert.
Sampling expert transitions.


In [11]:
# Shape of the observation array of the first collected rollout.
print(transitions[0].obs.shape)


(501, 4)


In [None]:
# from imitation.data.types import Transitions
# import copy
# no_of_transitions = 10
# obs=np.ones(shape=(no_of_transitions,3))*3
# acts=np.ones(shape=(no_of_transitions-1,1))*5
# next_obs=np.ones(shape=(no_of_transitions,3))*7
# dones=np.zeros(shape=(no_of_transitions,1))
# infos= [{} for i in range(no_of_transitions-1)]
# next_obs=copy.deepcopy(obs)[1:]
# custom_dataset = Transitions(obs=obs, acts=acts,infos=infos,next_obs=next_obs,dones=dones)


# # rollout2= rollout.flatten_trajectories(custom_dataset)
# print(custom_dataset.obs.shape)
# print(custom_dataset[0])

In [None]:
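# NOTE: `bc` is not imported in this notebook's import cell; add
# `from imitation.algorithms import bc` before re-enabling this cell.
# bc_trainer = bc.BC(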
#     observation_space=env.observation_space,
#     action_space=env.action_space,
#     demonstrations=transitions,
#     rng=rng,
# )

# evaluation_env = make_vec_env(
#     "seals:seals/CartPole-v0",
#     rng=rng,
#     env_make_kwargs={"render_mode": "human"},  # for rendering
# )

# print("Evaluating the untrained policy.")
# reward, _ = evaluate_policy(
#     bc_trainer.policy,  # type: ignore[arg-type]
#     evaluation_env,
#     n_eval_episodes=3,
#     render=True,  # comment out to speed up
# )
# print(f"Reward before training: {reward}")

# print("Training a policy using Behavior Cloning")
# bc_trainer.train(n_epochs=1)

# print("Evaluating the trained policy.")
# reward, _ = evaluate_policy(
#     bc_trainer.policy,  # type: ignore[arg-type]
#     evaluation_env,
#     n_eval_episodes=3,
#     render=True,  # comment out to speed up
# )
# print(f"Reward after training: {reward}")

In [None]:
import random

# Candidate grid: x and y each step from 1 up to (but excluding) 10 in
# increments of 0.1.
x_range = np.arange(1, 10, 0.1)
y_range = np.arange(1, 10, 0.1)
# Keep only the grid points whose Euclidean distance from (2, 1) is at most 10.
goal_poses = [
    (x, y)
    for x in x_range
    for y in y_range
    if np.sqrt((x - 2) ** 2 + (y - 1) ** 2) <= 10
]

# Randomly select a goal pose from the list.
# A dedicated generator seeded with 0 (the seed used elsewhere in this
# notebook) makes the pick reproducible under Restart & Run All and leaves
# the global `random` state untouched.
goal_x, goal_y = random.Random(0).choice(goal_poses)