In [2]:
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
import imitation
import imitation.data.types
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
#from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env

This notebook starts from the baseline in the imitation "First Steps" guide: https://imitation.readthedocs.io/en/latest/getting-started/first_steps.html

In [2]:
# One seeded generator shared by environment creation, rollout sampling and
# the BC trainer, so a fresh "Restart & Run All" is reproducible.
rng = np.random.default_rng(0)


def _with_rollout_info(single_env, _env_index):
    """Wrap one sub-environment in RolloutInfoWrapper; the second argument is ignored."""
    return RolloutInfoWrapper(single_env)  # for computing rollouts


env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=rng,
    post_wrappers=[_with_rollout_info],
)

In [None]:
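# TODO: load the extracted demonstrations into a DataFrame named `df` here.
# The next cell reads its "obs", "act", "rew", "next_obs" and "done" columns.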

In [None]:
# Convert the demonstration DataFrame into imitation's batched transition
# container. The previous version referenced `imitation.types.Transition`,
# which does not exist: the containers live in `imitation.data.types` and
# hold whole arrays (one entry per step) rather than one object per row.
#
# `df` must already be defined by the loading cell above.
# Assumes each row is a single environment step and that the "obs" /
# "next_obs" cells hold equal-length array-likes -- TODO confirm against the
# extraction output.
demonstrations = imitation.data.types.TransitionsWithRew(
    obs=np.asarray(df["obs"].tolist()),
    acts=np.asarray(df["act"].tolist()),
    # No per-step info dicts are stored in `df`, so supply empty ones.
    infos=np.array([{}] * len(df)),
    next_obs=np.asarray(df["next_obs"].tolist()),
    # The container validates dtypes: `dones` must be bool, `rews` floating.
    dones=np.asarray(df["done"].tolist(), dtype=bool),
    rews=np.asarray(df["rew"].tolist(), dtype=float),
)

In [3]:
def train_expert(total_timesteps=1_000):
    """Train a PPO agent on the notebook-level ``env`` to serve as the expert.

    Args:
        total_timesteps: Number of environment steps passed to ``PPO.learn``.
            The default of 1_000 keeps the notebook quick to run but yields a
            weak expert; use about 100_000 to train a decent one.

    Returns:
        The trained ``PPO`` model.
    """
    # note: use `download_expert` instead to download a pretrained, competent expert
    print("Training a expert.")
    expert = PPO(
        policy=MlpPolicy,
        env=env,
        seed=0,
        batch_size=64,
        ent_coef=0.0,
        learning_rate=0.0003,
        n_epochs=10,
        n_steps=64,
    )
    expert.learn(total_timesteps)
    return expert

In [4]:
def sample_expert_transitions():
    """Train an expert, roll it out in ``env`` and return the flattened transitions.

    The expert comes from ``train_expert()``. Rollouts are collected with the
    shared ``rng`` until at least 50 episodes have finished, then flattened
    with ``rollout.flatten_trajectories``.
    """
    expert_policy = train_expert()

    print("Sampling expert transitions.")
    # Stop on episode count only; no minimum number of timesteps is imposed.
    stop_condition = rollout.make_sample_until(min_timesteps=None, min_episodes=50)
    trajectories = rollout.rollout(expert_policy, env, stop_condition, rng=rng)
    return rollout.flatten_trajectories(trajectories)

In [10]:
# Gather the expert demonstrations (this trains the expert, then rolls it out).
transitions = sample_expert_transitions()

# Build the behaviour-cloning trainer on those demonstrations, using the
# environment's spaces and the shared random generator.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
)

Training a expert.
Sampling expert transitions.


In [11]:

# Baseline: score the BC policy before any training step has been taken.
# Only the mean reward is reported; the second returned value is discarded.
reward, _ = evaluate_policy(
    model=bc_trainer.policy,  # type: ignore[arg-type]
    env=env,
    n_eval_episodes=3,
    render=True,  # set to False (or remove) to speed up
)
print(f"Reward before training: {reward}")



Reward before training: 40.666666666666664




In [12]:
# Fit the BC policy for a single pass over the demonstrations.
print("Training a policy using Behavior Cloning")
bc_trainer.train(n_epochs=1)

# Re-score the policy with the same settings as the pre-training evaluation
# so the two numbers are directly comparable.
reward, _ = evaluate_policy(
    model=bc_trainer.policy,  # type: ignore[arg-type]
    env=env,
    n_eval_episodes=3,
    render=True,  # set to False (or remove) to speed up
)
print(f"Reward after training: {reward}")

Training a policy using Behavior Cloning


0batch [00:00, ?batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 0         |
|    ent_loss       | -0.000693 |
|    entropy        | 0.693     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 72.5      |
|    loss           | 0.694     |
|    neglogp        | 0.694     |
|    prob_true_act  | 0.499     |
|    samples_so_far | 32        |
---------------------------------


500batch [00:01, 306.23batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000643 |
|    entropy        | 0.643     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 72.8      |
|    loss           | 0.578     |
|    neglogp        | 0.579     |
|    prob_true_act  | 0.58      |
|    samples_so_far | 16032     |
---------------------------------


875batch [00:02, 304.17batch/s]


Reward after training: 7.333333333333333


