# Setup Environment and Dependencies
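Import the required libraries (NumPy and the pi_optimal utilities), change the working directory to the parent directory, and suppress warnings. The Intel extension for scikit-learn (`sklearnex`) is optional and currently commented out.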

In [1]:
# Setup Environment and Dependencies
#
# Imports, working-directory change and global warning suppression used by
# the rest of the notebook.

import os
import numpy as np
# Optional Intel extension for scikit-learn; currently disabled.
#from sklearnex import patch_sklearn
import warnings

# Change directory to the parent directory
# NOTE(review): the path is relative to the current working directory, so
# re-running this cell without restarting the kernel moves up one more level.
os.chdir("..")

# Apply Intel extension to sklearn (disabled together with its import above)
#patch_sklearn()

# Suppress warnings
# NOTE(review): with no category given this silences every warning for the
# whole session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Import pi_optimal utilities
# NOTE(review): these imports come after os.chdir("..") — presumably so the
# package is resolved from the parent directory; confirm before reordering.
from pi_optimal.utils.data_generators.gym_data_generator import GymDataGenerator
from pi_optimal.datasets.timeseries_dataset import TimeseriesDataset
from pi_optimal.models.random_forest_model import RandomForest
from pi_optimal.models.mlp import NeuralNetwork
from pi_optimal.evaluators.base_evaluator import BaseEvaluator
from pi_optimal.evaluators.plotting import plot_n_step_evaluation, plot_n_step_episode_rollout

# Create Gym Data Generator
Initialize GymDataGenerator with LunarLander environment and collect training and test data with specified parameters.

In [None]:
# Create Gym Data Generator

# One generator for the LunarLander-v3 environment produces both splits.
data_collector = GymDataGenerator(env_name="LunarLander-v3")

# Settings shared by both collection runs.
# NOTE(review): both seeds are None — presumably the collected data then
# differs from run to run; confirm against GymDataGenerator.collect.
collect_kwargs = dict(max_steps_per_episode=200, env_seed=None, action_seed=None)

# 10000 steps for training, 5000 steps for testing.
df_train = data_collector.collect(n_steps=10000, **collect_kwargs)
df_test = data_collector.collect(n_steps=5000, **collect_kwargs)

Collecting steps: 100%|██████████| 1000/1000 [00:00<00:00, 36515.39it/s]
Collecting steps: 100%|██████████| 5000/5000 [00:00<00:00, 50735.13it/s]


# Configure Dataset Parameters
Set up dataset configuration dictionary defining features, processors, and evaluation metrics for states, actions, and rewards.

In [3]:
# Configure Dataset Parameters

# State features that share an identical description apart from their name.
_robust_scaled_states = ("state_1", "state_2", "state_3", "state_4", "state_5")
_binary_states = ("state_6", "state_7", "done")

# Feature index -> feature description, inserted in index order 0..9.
_state_features = {
    0: {"name": "state_0", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
}

# Indices 1-5: numerical features scaled with a 5%-95% RobustScaler.
for _idx, _name in enumerate(_robust_scaled_states, start=1):
    _state_features[_idx] = {
        "name": _name,
        "type": "numerical",
        "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}},
        "evaluation_metric": "mae",
    }

# Indices 6-8: binary features, left unprocessed and scored with binary F1.
for _idx, _name in enumerate(_binary_states, start=6):
    _state_features[_idx] = {
        "name": _name,
        "type": "binary",
        "processor": None,
        "evaluation_metric": "f1_binary",
    }

# Index 9: the reward, which the reward_* entries below point at.
_state_features[9] = {
    "name": "reward",
    "type": "numerical",
    "processor": {"name": "PowerTransformer"},
    "evaluation_metric": "mae",
}

# Full configuration consumed by TimeseriesDataset.
dataset_config = {
    "episode_column": "episode",
    "timestep_column": "step",
    "states": _state_features,
    "actions": {
        # The "categorial" spelling is kept exactly as written.
        # NOTE(review): confirm this is the literal the library expects.
        0: {"name": "action_0", "type": "categorial", "processor": {"name": "OneHotEncoder"}},
    },
    "reward_feature_idx": 9,
    "reward_vector_idx": 9,
    "reward_column": "reward",
}

# Create Training and Test Datasets
Initialize TimeseriesDataset objects with collected data, applying the configuration and setting lookback/forecast windows.

In [4]:
# Create Training and Test Datasets

# Window sizes shared by both datasets.
LOOKBACK_TIMESTEPS = 10
FORECAST_TIMESTEPS = 1

# Keyword arguments that are identical for the training and the test dataset.
shared_dataset_kwargs = {
    "dataset_config": dataset_config,
    "lookback_timesteps": LOOKBACK_TIMESTEPS,
    "forecast_timesteps": FORECAST_TIMESTEPS,
}

# Training split: processors are trained here (train_processors=True).
dataset_train = TimeseriesDataset(df=df_train, train_processors=True, **shared_dataset_kwargs)

# Test split: train_processors=False.
# NOTE(review): presumably this reuses the processors fitted on the training
# split through the shared dataset_config — confirm in TimeseriesDataset.
dataset_test = TimeseriesDataset(df=df_test, train_processors=False, **shared_dataset_kwargs)

# Train Neural Network Model
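Create and train three identically configured Neural Network models (`nn_model1`, `nn_model2`, `nn_model3`) on the training dataset. All three are later passed together to the model-based environment.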

In [None]:

# First of the notebook's three identically configured neural-network models.
nn_model1 = NeuralNetwork(hidden_layer_sizes=(128, 128), alpha=0.01, learning_rate_init=0.001)

# Fit it on the training dataset.
nn_model1.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:

# Second of the notebook's three identically configured neural-network models.
nn_model2 = NeuralNetwork(hidden_layer_sizes=(128, 128), alpha=0.01, learning_rate_init=0.001)

# Fit it on the training dataset.
nn_model2.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [7]:

# Third of the notebook's three identically configured neural-network models.
nn_model3 = NeuralNetwork(hidden_layer_sizes=(128, 128), alpha=0.01, learning_rate_init=0.001)

# Fit it on the training dataset.
nn_model3.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

# Low Level Workflow

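Here you can see how it works under the hood: the trained models are wrapped in a simulated environment, which is then used to roll out episodes and to train an agent.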

In [8]:
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# The three trained networks are handed to the environment together.
trained_models = [nn_model1, nn_model2, nn_model3]

# Simulated environment built from the models and the training dataset,
# limited to 200 steps per episode.
sim_env = ModelBasedEnv(
    models=trained_models,
    dataset=dataset_train,
    max_episode_steps=200,
)

In [9]:
# Roll out one episode in the simulated environment using random actions and
# print the accumulated reward.
obs, _ = sim_env.reset()
total_reward = 0
for _ in range(200):
    action = sim_env.action_space.sample()
    # step() returns five values; in the Gymnasium API the third and fourth
    # are the `terminated` and `truncated` flags. They need distinct names:
    # unpacking both into one variable keeps only the last one, so the loop
    # would never stop on termination.
    obs, reward, terminated, truncated, info = sim_env.step(action)
    total_reward += reward
    sim_env.render("human")
    # `done` stays defined for any later code that reads it.
    done = terminated or truncated
    if done:
        break
# NOTE(review): sim_env is reused for training further down after this
# close() call — confirm that ModelBasedEnv tolerates use after close.
sim_env.close()
print(total_reward)

Step: 23, State history: [ 0.09052636  1.30892165  0.45619737 -0.41948996 -0.11548094 -0.23644299
  0.          0.        ]
Step: 24, State history: [ 0.10001343  1.2771997   0.47070809 -0.35508364 -0.12485919 -0.1554039
  0.          0.        ]
Step: 25, State history: [ 0.05861678  1.26663382  0.3602799  -0.50082229 -0.27455575 -0.17481056
  0.          0.        ]
Step: 26, State history: [ 0.07945819  1.20570933  0.4363852  -0.41628732 -0.15712475 -0.15456128
  0.          0.        ]
Step: 27, State history: [ 0.0958831   1.25613371  0.57484813 -0.49003895 -0.13376869 -0.36245045
  0.          0.        ]
Step: 28, State history: [ 0.07742128  1.1195561   0.57236157 -0.51228361 -0.2498309  -0.1256556
  0.          0.        ]
Step: 29, State history: [ 0.12695883  1.14332263  0.42385142 -0.39039625 -0.20690076 -0.16220965
  0.          0.        ]
Step: 30, State history: [ 0.07692196  1.1128045   0.61450024 -0.46367135 -0.26626612 -0.20658028
  0.          0.        ]
Step: 31, 

In [10]:
import os
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

# Set up log folder for monitoring
log_dir = "./logs_dir/"
os.makedirs(log_dir, exist_ok=True)

# Train on the simulated (model-based) environment and log its episode rewards.
# Each Monitor gets its own file prefix: when two Monitors are both pointed at
# the same directory they both write "<dir>/monitor.csv" and clobber each
# other's log. With a prefix the files become "train.monitor.csv" and
# "eval.monitor.csv" inside log_dir.
train_env = Monitor(sim_env, os.path.join(log_dir, "train"))

# Evaluate on the real LunarLander environment, kept separate from training.
eval_env = Monitor(gymnasium.make("LunarLander-v3"), os.path.join(log_dir, "eval"))

# Evaluation callback: with eval_freq=5000 it runs 50 evaluation episodes at
# that interval and saves the model under <log_dir>/best_model whenever the
# mean reward reaches a new best. Actions are sampled (deterministic=False).
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
    log_path=log_dir,
    eval_freq=5000,
    n_eval_episodes=50,
    deterministic=False,
    render=False
)

# PPO agent with an MLP policy, trained on the monitored simulated environment.
model = PPO("MlpPolicy", train_env, verbose=1)

# Train the model and use the evaluation callback to save the best model.
model.learn(total_timesteps=300000, callback=eval_callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 145       |
|    ep_rew_mean     | -1.24e+05 |
| time/              |           |
|    fps             | 184       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 151           |
|    ep_rew_mean          | -8.2e+06      |
| time/                   |               |
|    fps                  | 183           |
|    iterations           | 2             |
|    time_elapsed         | 22            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 2.7193775e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         

KeyboardInterrupt: 

In [11]:
# Load the best checkpoint stored under <log_dir>/best_model.
best_model_path = os.path.join(log_dir, 'best_model/best_model.zip')
best_model = PPO.load(best_model_path)

In [12]:

# Run one rendered episode of the real LunarLander environment with the loaded
# policy and print the episode's total reward.
env = gymnasium.make("LunarLander-v3", render_mode="human")
try:
    obs, _ = env.reset()
    # Names follow the Gymnasium step() order: (obs, reward, terminated,
    # truncated, info). The episode ends when either flag is set.
    terminated = False
    truncated = False
    total_reward = 0
    while not (terminated or truncated):
        # Predict the next action using the trained policy (sampled, not greedy).
        action, _ = best_model.predict(obs, deterministic=False)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        env.render()
finally:
    # Close the render window even if the rollout raises.
    env.close()
print(f"Total reward: {total_reward}")

2025-03-25 14:48:51.615 Python[9586:276873] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-25 14:48:51.615 Python[9586:276873] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Total reward: -125.62913699720562


In [None]:

# Evaluate the loaded policy over 100 episodes of the real LunarLander
# environment, printing each episode's total reward and collecting them in
# all_rewards for later aggregation.
env = gymnasium.make("LunarLander-v3", render_mode="rgb_array")
all_rewards = []
try:
    for _episode in range(100):
        obs, _ = env.reset()
        # Names follow the Gymnasium step() order: (obs, reward, terminated,
        # truncated, info). The episode ends when either flag is set.
        terminated = False
        truncated = False
        total_reward = 0
        while not (terminated or truncated):
            # Predict the next action using the trained policy (sampled, not greedy).
            action, _ = best_model.predict(obs, deterministic=False)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # No env.render() here: in rgb_array mode it only returns a frame,
            # which was discarded, so it cost time without any effect.
        all_rewards.append(total_reward)
        print(f"Total reward: {total_reward}")
finally:
    # Release the environment even if an episode raises.
    env.close()


Total reward: -47.27233590475288
Total reward: -49.26818347728339
Total reward: -31.09010193558929
Total reward: 24.62084582662014
Total reward: 26.930244123126627
Total reward: 20.390678249649582
Total reward: 26.231009479917205
Total reward: 141.87562290808742
Total reward: -21.523899384709907
Total reward: 28.71361591344933
Total reward: -33.49156277028581
Total reward: 14.307746183233292
Total reward: 25.068558480531536
Total reward: 3.8256308610058056
Total reward: -5.88428720794407
Total reward: 140.46385962086885
Total reward: -41.16876828950086
Total reward: -49.145904509079074
Total reward: -1.3697761881111603
Total reward: 1.4950432277937296
Total reward: -33.076456065520006
Total reward: -50.51191341061657
Total reward: -24.485016168297463
Total reward: -56.033083499235254
Total reward: -2.7709723496333396
Total reward: -21.41303550699675
Total reward: 13.71695829171523
Total reward: -53.05022353912637
Total reward: -67.1398640232491
Total reward: -7.322877967328111
Total re

In [13]:
import numpy as np

# Average episode reward over the evaluation runs collected in all_rewards
# (this figure was labelled "1 model" in the original cell).
mean_reward = np.mean(all_rewards)
mean_reward

NameError: name 'all_rewards' is not defined

# High Level Application

In [29]:
from pi_optimal.planners.online_planner import OnlinePlanner
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# Rebuild the simulated environment from the three trained networks.
# The first entry is nn_model1: no variable called `nn_model` is defined
# anywhere in this notebook, so the old name fails on a fresh kernel.
sim_env = ModelBasedEnv(models=[nn_model1, nn_model2, nn_model3], dataset=dataset_train, max_episode_steps=200)
eval_env = gymnasium.make("LunarLander-v3")

# NOTE(review): eval_env is created above but eval_env=None is passed here even
# though eval_params are supplied — confirm whether eval_env=eval_env was meant.
online_planner = OnlinePlanner(env=sim_env, eval_env=None, train_params={"total_timesteps": 6000}, eval_params={"n_eval_episodes": 50, "eval_freq": 5000})

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 147       |
|    ep_rew_mean     | -2.01e+06 |
| time/              |           |
|    fps             | 177       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 161           |
|    ep_rew_mean          | -1.2e+06      |
| time/                   |               |
|    fps                  | 167           |
|    iterations           | 2             |
|    time_elapsed         | 24            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.2118835e-07 |
|    clip_fraction        | 0             |
|    clip_range           | 

### Doing Inference from a dataset class

For details, look into the predict function of the planner. It takes the last observation in the dataset and uses it as the current observation. Ensure that the dataset is created with **is_inference=True**.

In [48]:
# Collect a short, fresh trajectory to run inference on.
df_inf = data_collector.collect(n_steps=50, max_steps_per_episode=200, env_seed=None, action_seed=None)

# Build the inference dataset. Only dataset_inf is bound here: the former
# chained assignment (dataset_inf = dataset_test = ...) silently replaced the
# held-out test dataset with this 50-step inference dataset.
# NOTE(review): lookback_timesteps / forecast_timesteps are not passed, so the
# class defaults apply — confirm they match the values used for training.
dataset_inf = TimeseriesDataset(
    df=df_inf,
    dataset_config=dataset_config,
    train_processors=False,
    is_inference=True
)

Collecting steps: 100%|██████████| 50/50 [00:00<00:00, 5438.95it/s]


In [52]:
# Ask the planner for its plan on the inference dataset; the value is shown as
# the cell's output.
plan_result = online_planner.plan(dataset_inf)
plan_result

array([1])