[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/8a_train_sqil_sac.ipynb)
# Train an Agent using Soft Q Imitation Learning with SAC

In the previous tutorial, we used Soft Q Imitation Learning ([SQIL](https://arxiv.org/abs/1905.11108)) on top of the DQN base algorithm. In fact, SQIL can be combined with any off-policy algorithm from `stable_baselines3`. Here, we train a Pendulum agent using SQIL + SAC.

First, we need some expert trajectories in our environment (`Pendulum-v1`).
Note that you can use other environments, but the action space must be continuous.

In [27]:
import datasets
import numpy as np
from datasets import Dataset, Features, Value, ClassLabel, Sequence
from imitation.data import types
from imitation.data import huggingface_utils

# Download some expert trajectories from the HuggingFace Datasets Hub.
dataset = datasets.load_dataset("HumanCompatibleAI/ppo-Pendulum-v1")

# Convert the dataset to a format usable by the imitation library.
expert_trajectories = huggingface_utils.TrajectoryDatasetSequence(dataset["train"])

In [34]:
dataset['train']

Dataset({
    features: ['obs', 'acts', 'infos', 'terminal', 'rews'],
    num_rows: 200
})

In [4]:
import os
import zipfile
import pickle

def load_data_from_zip(dir_path):
    # List all zip files in the directory
    zip_files = [f for f in os.listdir(dir_path) if f.endswith('.zip')]
    
    # Dictionary to hold the data extracted from each zip file
    data_buffer = {}

    # Iterate over each zip file
    for zip_file_name in zip_files:
        # Construct the full path to the zip file
        file_path = os.path.join(dir_path, zip_file_name)
        
        # Open the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_file:
            # Extract data.pkl
            with zip_file.open('data.pkl') as file:
                # Deserialize the data
                data = pickle.load(file)
                
                # Use the file name without extension as the key
                step = zip_file_name.replace('.zip', '')
                data_buffer[step] = data

    return data_buffer


Check the loaded trajectories

In [5]:
trajectories = load_data_from_zip(dir_path="data/demo_seed_0/2024-04-15_13:41:33/traces")
trajectories

{'trace': [([(array([0.   , 0.02 , 0.835, ..., 0.   , 0.   , 0.   ]),
     [0, 0, 1, 0],
     array([ 0.        ,  0.02      ,  0.82345681, ..., -0.48633561,
             1.05214276, -0.4651814 ]),
     -1.0,
     False),
    (array([ 0.        ,  0.02      ,  0.82345681, ..., -0.48633561,
             1.05214276, -0.4651814 ]),
     [0, 0, 1, 0],
     array([ 1.16472885e-21,  2.00000000e-02,  8.22343948e-01, ...,
            -4.98359963e-02,  5.12415895e-03, -6.06934319e-02]),
     -1.0,
     False),
    (array([ 1.16472885e-21,  2.00000000e-02,  8.22343948e-01, ...,
            -4.98359963e-02,  5.12415895e-03, -6.06934319e-02]),
     [0, 0, 1, 0],
     array([ 4.14548887e-21,  2.00000000e-02,  8.23633896e-01, ...,
            -3.89591959e-02,  3.45008773e-02, -5.70800324e-02]),
     -1.0,
     False),
    (array([ 4.14548887e-21,  2.00000000e-02,  8.23633896e-01, ...,
            -3.89591959e-02,  3.45008773e-02, -5.70800324e-02]),
     [0, 0, 1, 0],
     array([ 5.36837800e-21,  2.

In [8]:
trajectories['trace'][1]

([(None,
   [0, 0, 1, 0],
   array([-0.00229494,  0.2118331 ,  0.82464294, ..., -0.00103151,
           0.03973852, -0.02992962]),
   -1.0,
   False),
  (array([-0.00229494,  0.2118331 ,  0.82464294, ..., -0.00103151,
           0.03973852, -0.02992962]),
   [0, 0, 1, 0],
   array([-0.00229368,  0.21183744,  0.82471434, ...,  0.00240825,
           0.03528124, -0.02426058]),
   -1.0,
   False),
  (array([-0.00229368,  0.21183744,  0.82471434, ...,  0.00240825,
           0.03528124, -0.02426058]),
   [0, 0, 1, 0],
   array([-0.00229306,  0.21183957,  0.82474958, ...,  0.00239152,
           0.03183717, -0.02173084]),
   -1.0,
   False),
  (array([-0.00229306,  0.21183957,  0.82474958, ...,  0.00239152,
           0.03183717, -0.02173084]),
   [0, 0, 1, 0],
   array([-0.00229275,  0.21184063,  0.82476709, ...,  0.00212174,
           0.02869532, -0.01961085]),
   -1.0,
   False),
  (array([-0.00229275,  0.21184063,  0.82476709, ...,  0.00212174,
           0.02869532, -0.01961085]),
   

In [33]:



features = Features({
    'observations': Sequence(Value('float32')),  # Assuming observations are vectors of floats
    'actions': Sequence(Value('float32')),  # Assuming actions are vectors of floats
    'next_observations': Sequence(Value('float32')),  # Same as observations
    'rewards': Sequence(Value('float32')),  # Rewards are floats
    'dones': Sequence(ClassLabel(names=["false", "true"]))  # Boolean done flags
})


def prepare_data_for_dataset(trajectories):
    trajectory_objects = []

    for trajectory in trajectories:
        if not trajectory:
            continue
        episode = trajectory[0]
        # Assuming each step has observations, actions, next_obs, rewards, and done flags
        obs = np.array([step[0] for step in episode])  
        acts = np.array([step[1] for step in episode])
        infos = np.array([{} for _ in episode])  # Assuming empty dicts for infos
        terminal = trajectory[-1][4]  # The 'done' flag of the last step
        
        traj_obj = types.Trajectory(obs=obs, acts=acts, infos=infos, terminal=terminal)
        trajectory_objects.append(traj_obj)

    return trajectory_objects


demo_auto_trajectories = prepare_data_for_dataset(trajectories['trace'])
print(demo_auto_trajectories)
#dataset = Dataset.from_dict(data, features=features)

#print(dataset)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (250,) + inhomogeneous part.

In [None]:
from imitation.data import rollout

trajectory_stats = rollout.rollout_stats(expert_trajectories)

print(
    f"We have {trajectory_stats['n_traj']} trajectories. "
    f"The average length of each trajectory is {trajectory_stats['len_mean']}. "
    f"The average return of each trajectory is {trajectory_stats['return_mean']}."
)

After we collected our expert trajectories, it's time to set up our imitation algorithm.

In [None]:
from imitation.algorithms import sqil
from imitation.util.util import make_vec_env
import numpy as np
from stable_baselines3 import sac

SEED = 42

venv = make_vec_env(
    "Pendulum-v1",
    rng=np.random.default_rng(seed=SEED),
)

sqil_trainer = sqil.SQIL(
    venv=venv,
    demonstrations=expert_trajectories,
    policy="MlpPolicy",
    rl_algo_class=sac.SAC,
    rl_kwargs=dict(seed=SEED),
)

As you can see the untrained policy only gets poor rewards (< 0):

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

reward_before_training, _ = evaluate_policy(sqil_trainer.policy, venv, 100)
print(f"Reward before training: {reward_before_training}")

After training, we can observe that agent is quite improved (> 1000), although it does not reach the expert performance in this case.

In [None]:
sqil_trainer.train(
    total_timesteps=1000,
)  # Note: set to 300_000 to obtain good results
reward_after_training, _ = evaluate_policy(sqil_trainer.policy, venv, 100)
print(f"Reward after training: {reward_after_training}")