# Setup Environment and Dependencies
Import required libraries including NumPy, sklearn with Intel extension, and pi_optimal utilities. Configure warning suppression.

In [1]:
# Setup Environment and Dependencies

import os
import numpy as np
#from sklearnex import patch_sklearn
import warnings

# Change directory to the parent directory
os.chdir("..")

# Apply Intel extension to sklearn
#patch_sklearn()

# Suppress warnings
warnings.filterwarnings('ignore')

# Import pi_optimal utilities
from pi_optimal.utils.data_generators.gym_data_generator import GymDataGenerator
from pi_optimal.datasets.timeseries_dataset import TimeseriesDataset
from pi_optimal.models.sklearn.random_forest_model import RandomForest
from pi_optimal.models.sklearn.mlp import NeuralNetwork
from pi_optimal.evaluators.base_evaluator import BaseEvaluator
from pi_optimal.evaluators.plotting import plot_n_step_evaluation, plot_n_step_episode_rollout

# Create Gym Data Generator
Initialize GymDataGenerator with LunarLander environment and collect training and test data with specified parameters.

In [2]:
# Create Gym Data Generator

# Initialize GymDataGenerator with BipedalWalker environment
data_collector = GymDataGenerator(env_name="BipedalWalker")

# Collect training data
df_train = data_collector.collect(n_steps=10000, max_steps_per_episode=200, env_seed=None, action_seed=None)
df_test = data_collector.collect(n_steps=5000, max_steps_per_episode=200, env_seed=None, action_seed=None)

Collecting steps: 100%|██████████| 10000/10000 [00:00<00:00, 11255.87it/s]
Collecting steps: 100%|██████████| 5000/5000 [00:00<00:00, 11329.03it/s]


In [3]:
df_train

Unnamed: 0,episode,step,reward,done,state_0,state_1,state_2,state_3,state_4,state_5,...,state_18,state_19,state_20,state_21,state_22,state_23,action_0,action_1,action_2,action_3
0,0,0,0.000000,False,0.002747,-0.000014,0.001106,-0.016000,0.091956,-0.001459,...,0.534103,0.602461,0.709149,0.885932,1.0,1.0,0.362195,0.817553,0.204113,0.879735
1,0,1,-0.030488,False,0.008967,-0.014899,0.007804,0.027074,-0.279903,-0.417461,...,0.552951,0.623721,0.734174,0.917196,1.0,1.0,-0.913504,0.399155,-0.606546,-0.324907
2,0,2,-0.172784,False,0.021874,0.036472,0.020428,0.002459,-0.062662,-0.830744,...,0.567385,0.640003,0.753339,0.941138,1.0,1.0,-0.827086,0.525922,0.971741,-0.676759
3,0,3,-0.159519,False,0.034931,0.026900,-0.006921,0.052227,-0.131982,-1.019528,...,0.571541,0.644691,0.758857,0.948032,1.0,1.0,0.598664,-0.950631,0.050628,0.476462
4,0,4,-0.081774,False,0.036350,0.002703,-0.021020,0.033742,-0.161395,-0.056679,...,0.574264,0.647762,0.762473,0.952549,1.0,1.0,0.657584,0.088714,-0.202445,0.023098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,77,36,-0.218375,False,0.426909,0.060835,-0.021596,-0.138653,0.262896,-1.000000,...,0.492241,0.555242,0.653568,0.816495,1.0,1.0,0.065793,0.203441,0.036619,-0.305542
9996,77,37,-0.171305,False,0.454696,0.055480,-0.019627,-0.169166,0.188369,-0.915699,...,0.482711,0.544492,0.640914,0.800687,1.0,1.0,-0.056706,-0.819509,0.335117,-0.472653
9997,77,38,-0.175264,False,0.475991,0.043072,-0.026802,-0.196466,0.243394,0.082573,...,0.471564,0.531918,0.626114,0.782197,1.0,1.0,-0.510836,-0.663696,0.301046,-0.918718
9998,77,39,-0.209152,False,0.501708,0.051331,-0.017314,-0.214434,0.210570,-0.349046,...,0.459457,0.518261,0.610039,0.762114,1.0,1.0,0.253496,0.237579,-0.054549,-0.943965


# Configure Dataset Parameters
Set up dataset configuration dictionary defining features, processors, and evaluation metrics for states, actions, and rewards.

In [4]:
# Configure Dataset Parameters

# Define dataset configuration dictionary
dataset_config = {
    "episode_column": "episode",
    "timestep_column": "step",
    "states": {
        0: {"name": "state_0", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        1: {"name": "state_1", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        2: {"name": "state_2", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        3: {"name": "state_3", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        4: {"name": "state_4", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        5: {"name": "state_5", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        6: {"name": "state_6", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        7: {"name": "state_7", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        8: {"name": "state_8", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        9: {"name": "state_9", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        10: {"name": "state_10", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        11: {"name": "state_11", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        12: {"name": "state_12", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        13: {"name": "state_13", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        14: {"name": "state_14", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        15: {"name": "state_15", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        16: {"name": "state_16", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        17: {"name": "state_17", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        18: {"name": "state_18", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        19: {"name": "state_19", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        20: {"name": "state_20", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        21: {"name": "state_21", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        22: {"name": "state_22", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        23: {"name": "state_23", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        24: {"name": "done", "type": "binary", "processor": None, "evaluation_metric": "f1_binary"},
        25: {"name": "reward", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
    
        },
    "actions": {
        0: {"name": "action_0", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        1: {"name": "action_1", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        2: {"name": "action_2", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        3: {"name": "action_3", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
    },
    "reward_feature_idx": 9,
    "reward_vector_idx": 9,
    "reward_column": "reward",
}

# Create Training and Test Datasets
Initialize TimeseriesDataset objects with collected data, applying the configuration and setting lookback/forecast windows.

In [5]:
# Create Training and Test Datasets

# Define lookback and forecast timesteps
LOOKBACK_TIMESTEPS = 5
FORECAST_TIMESTEPS = 1

# Initialize TimeseriesDataset objects for training and test data
dataset_train = TimeseriesDataset(
    df=df_train,
    dataset_config=dataset_config,
    lookback_timesteps=LOOKBACK_TIMESTEPS,
    forecast_timesteps=FORECAST_TIMESTEPS,
    train_processors=True
)


dataset_test = TimeseriesDataset(
    df=df_test,
    dataset_config=dataset_config,
    lookback_timesteps=LOOKBACK_TIMESTEPS,
    forecast_timesteps=FORECAST_TIMESTEPS,
    train_processors=False
)

# Train Neural Network Model
Create and train a Neural Network model with specified hyperparameters on the training dataset.

In [6]:

# Initialize Neural Network model with specified hyperparameters
nn_model1 = NeuralNetwork(params={
    "hidden_layer_sizes": (128, 128),
    "alpha": 0.01,
    "learning_rate_init": 0.001,}
)
    
# Train the Neural Network model on the first training dataset
nn_model1.fit(dataset_train)

Training models...:   0%|          | 0/26 [00:00<?, ?it/s]

In [7]:

# Initialize Neural Network model with specified hyperparameters
nn_model2 = NeuralNetwork(params={
    "hidden_layer_sizes": (128, 128),
    "alpha": 0.01,
    "learning_rate_init": 0.001}
)
    
# Train the Neural Network model on the first training dataset
nn_model2.fit(dataset_train)

Training models...:   0%|          | 0/26 [00:00<?, ?it/s]

In [8]:

# Initialize Neural Network model with specified hyperparameters
nn_model3 = NeuralNetwork(params={
    "hidden_layer_sizes": (128, 128),
    "alpha": 0.01,
    "learning_rate_init": 0.001}
)
    
# Train the Neural Network model on the first training dataset
nn_model3.fit(dataset_train)

Training models...:   0%|          | 0/26 [00:00<?, ?it/s]

# Low Level Workflow

Here you could see how it works under the hood.

In [9]:
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

sim_env = ModelBasedEnv(models=[nn_model1, nn_model2, nn_model3], dataset=dataset_train, max_episode_steps=50)

In [10]:
obs, _ = sim_env.reset()
total_reward = 0
for _ in range(200):
    action = sim_env.action_space.sample()
    obs, reward, done, done, info = sim_env.step(action)
    total_reward += reward
    sim_env.render("human")
    if done:
        break
sim_env.close()
print(total_reward)

Step: 110, State history: [ 0.10917307 -0.03028735  0.11007358 -0.00603868 -0.85159158 -0.42067238
  0.12535425  1.10235948  0.57490481  1.10159591 -0.28652132  0.91063872
  0.63494533  0.04704826  0.22945851  0.23521662  0.24439466  0.25533637
  0.27977127  0.31497341  0.37040988  0.46278045  0.63376983  1.00105393]
Step: 111, State history: [ 0.0772852  -0.03288959  0.07016263 -0.01197309 -0.8281429   0.31645677
  0.12015212  0.01790893  0.53604227  1.14413022  0.46617253  0.85588927
 -0.43038876 -0.32862685  0.22913386  0.23283266  0.24489528  0.2534031
  0.27912322  0.31416133  0.36514968  0.46010266  0.63365245  1.00069178]
Step: 112, State history: [ 0.09343471 -0.00163625  0.06434112 -0.00664197 -0.80073906  0.12971076
 -0.04222848 -0.99856567  0.11055739  1.10078093 -0.43983124  1.01253641
  1.01877128  0.24373973  0.22993612  0.23355617  0.24148511  0.25181005
  0.2776507   0.31425796  0.37289747  0.45769046  0.63058997  0.99921568]
Step: 113, State history: [ 0.08184601 -0.01

In [11]:
#!/usr/bin/env python
import os
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
# Monkey-patch gym to include a __version__ attribute if it's missing.

# Set up log folder for monitoring
log_dir = "./logs_dir/"
os.makedirs(log_dir, exist_ok=True)

# Create the training environment and wrap it with a Monitor to log rewards.
train_env = sim_env
train_env = Monitor(train_env, log_dir)

# Create a separate evaluation environment.
eval_env = gymnasium.make("BipedalWalker-v3")
eval_env = Monitor(eval_env, log_dir)

# Set up the evaluation callback. This will evaluate the model every 5000 timesteps,
# and save the model if it achieves a new best mean reward.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
    log_path=log_dir,
    eval_freq=5000,
    n_eval_episodes=50,
    deterministic=False,
    render=False
)


model = PPO("MlpPolicy",
            env=train_env,
            n_steps=2048,
            gamma=0.99,
            n_epochs=10,
            clip_range=0.999,
            verbose=1)

# Train the model and use the evaluation callback to save the best model.
model.learn(total_timesteps=300000, callback=eval_callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -67.4    |
| time/              |          |
|    fps             | 106      |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 49.6       |
|    ep_rew_mean          | -47.7      |
| time/                   |            |
|    fps                  | 105        |
|    iterations           | 2          |
|    time_elapsed         | 38         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.07046617 |
|    clip_fraction        | 0.00425    |
|    clip_range           | 0.999      |
|    entropy_loss         | -5.67      |
|    explained_variance   | -0.001

KeyboardInterrupt: 

In [12]:
best_model = PPO.load(os.path.join(log_dir, 'best_model/best_model.zip'))

In [14]:

# Test the trained agent by running one episode and rendering it.
env = gymnasium.make("BipedalWalker", render_mode="human")   
obs, _ = env.reset()
done = False
terminated = False
total_reward = 0
step = 0 
while not (done or terminated):
    # Predict the next action using the trained policy.
    action, _ = best_model.predict(obs, deterministic=False)
    obs, reward, done, terminated ,_ = env.step(action)
    total_reward += reward
    step += 1
    env.render()

env.close()
print(f"Total reward: {total_reward}")

KeyboardInterrupt: 

In [15]:

# Test the trained agent by running one episode and rendering it.
env = gymnasium.make("BipedalWalker", render_mode="rgb_array")  
all_rewards = []
for i in range(100):
    obs, _ = env.reset()
    done = False
    terminated = False
    total_reward = 0
    while not (done or terminated):
        # Predict the next action using the trained policy.
        action, _ = best_model.predict(obs, deterministic=False)
        obs, reward, done, terminated ,_ = env.step(action)
        total_reward += reward
        env.render()
    all_rewards.append(total_reward)
    print(f"Total reward: {total_reward}")
env.close()


Total reward: -164.47833408422713
Total reward: -163.4879424148611
Total reward: -162.96492318751058
Total reward: -162.48233613970763
Total reward: -164.40617770835726


KeyboardInterrupt: 

In [29]:
import numpy as np
# 1 model
np.mean(all_rewards)

36.18399900038778

# High Level Application

In [29]:
from pi_optimal.planners.online_planner import OnlinePlanner
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

sim_env = ModelBasedEnv(models=[nn_model, nn_model2, nn_model3], dataset=dataset_train, max_episode_steps=200)
eval_env = gymnasium.make("LunarLander-v3")

online_planner = OnlinePlanner(env=sim_env, eval_env=None, train_params={"total_timesteps": 6000}, eval_params={"n_eval_episodes": 50, "eval_freq": 5000})

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 147       |
|    ep_rew_mean     | -2.01e+06 |
| time/              |           |
|    fps             | 177       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 161           |
|    ep_rew_mean          | -1.2e+06      |
| time/                   |               |
|    fps                  | 167           |
|    iterations           | 2             |
|    time_elapsed         | 24            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.2118835e-07 |
|    clip_fraction        | 0             |
|    clip_range           | 

### Doing Inference from a dataset class

For details  look in to the predict function of the planer. It takes the last observation and uses it as observation. Ensure that the dataset is set to **is_inference == True** .

In [48]:
df_inf = data_collector.collect(n_steps=50, max_steps_per_episode=200, env_seed=None, action_seed=None)

dataset_inf = dataset_test = TimeseriesDataset(
    df=df_inf,
    dataset_config=dataset_config,
    train_processors=False,
    is_inference=True
)

Collecting steps: 100%|██████████| 50/50 [00:00<00:00, 5438.95it/s]


In [52]:
online_planner.plan(dataset_inf)

array([1])