# Setup Environment and Dependencies
Import required libraries including NumPy, sklearn with Intel extension, and pi_optimal utilities. Configure warning suppression.

In [1]:
# Setup Environment and Dependencies

import os
import numpy as np
#from sklearnex import patch_sklearn
import warnings

# Change directory to the parent directory
os.chdir("..")

# Apply Intel extension to sklearn
#patch_sklearn()

# Suppress warnings
warnings.filterwarnings('ignore')

# Import pi_optimal utilities
from pi_optimal.utils.data_generators.gym_data_generator import GymDataGenerator
from pi_optimal.datasets.timeseries_dataset import TimeseriesDataset
from pi_optimal.models.sklearn.random_forest_model import RandomForest
from pi_optimal.models.sklearn.mlp import NeuralNetwork
from pi_optimal.evaluators.base_evaluator import BaseEvaluator
from pi_optimal.evaluators.plotting import plot_n_step_evaluation, plot_n_step_episode_rollout

# Create Gym Data Generator
Initialize GymDataGenerator with LunarLander environment and collect training and test data with specified parameters.

In [2]:
# Create Gym Data Generator

# Initialize GymDataGenerator with LunarLander environment
data_collector = GymDataGenerator(env_name="LunarLander-v3")

# Collect training data
df_train = data_collector.collect(n_steps=10000, max_steps_per_episode=200, env_seed=None, action_seed=None)
df_test = data_collector.collect(n_steps=5000, max_steps_per_episode=200, env_seed=None, action_seed=None)

Collecting steps: 100%|██████████| 10000/10000 [00:00<00:00, 41759.05it/s]
Collecting steps: 100%|██████████| 5000/5000 [00:00<00:00, 47382.13it/s]


In [8]:
import pandas as pd

df_train = pd.read_csv("tutorial_gym/dataset_20250401_160426.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'tutorial_gym/dataset_20250401_160426.csv'

In [27]:
df_train["reward"] = df_train.groupby("episode_id").reward.shift(1, fill_value=0)

KeyError: 'episode_id'

# Configure Dataset Parameters
Set up dataset configuration dictionary defining features, processors, and evaluation metrics for states, actions, and rewards.

In [3]:
# Configure Dataset Parameters

# Define dataset configuration dictionary
dataset_config = {
    "episode_column": "episode",
    "timestep_column": "step",
    "states": {
        0: {"name": "state_0", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
        1: {"name": "state_1", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
        2: {"name": "state_2", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
        3: {"name": "state_3", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
        4: {"name": "state_4", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
        5: {"name": "state_5", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
        6: {"name": "state_6", "type": "binary", "processor": None, "evaluation_metric": "f1_binary"},
        7: {"name": "state_7", "type": "binary", "processor": None, "evaluation_metric": "f1_binary"},
        8: {"name": "done", "type": "binary", "processor": None, "evaluation_metric": "f1_binary"},
        9: {"name": "reward", "type": "numerical", "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}}, "evaluation_metric": "mae"},
    },
    "actions": {
        0: {"name": "action_0", "type": "categorial", "processor": {"name": "OneHotEncoder"}},
    },
    "reward_feature_idx": 9,
    "reward_vector_idx": 9,
    "reward_column": "reward",
}

# Create Training and Test Datasets
Initialize TimeseriesDataset objects with collected data, applying the configuration and setting lookback/forecast windows.

In [4]:
# Create Training and Test Datasets

# Define lookback and forecast timesteps
LOOKBACK_TIMESTEPS = 10
FORECAST_TIMESTEPS = 1

# Initialize TimeseriesDataset objects for training and test data
dataset_train = TimeseriesDataset(
    df=df_train,
    dataset_config=dataset_config,
    lookback_timesteps=LOOKBACK_TIMESTEPS,
    forecast_timesteps=FORECAST_TIMESTEPS,
    train_processors=True
)


# Train Neural Network Model
Create and train a Neural Network model with specified hyperparameters on the training dataset.

In [5]:

from pi_optimal.models.torch.mlp import NeuralNetworkTorch
models = []
for i in range(1):
# Initialize Neural Network model with specified hyperparameters
    nn_model1 = NeuralNetworkTorch(params={
        "hidden_layer_sizes": (16, 16),
        "alpha": 0.01,
        "learning_rate_init": 0.001}
    )
        
    # Train the Neural Network model on the first training dataset
    nn_model1.fit(dataset_train)
    models.append(nn_model1)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

# Low Level Workflow

Here you could see how it works under the hood.

In [6]:
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

sim_env = ModelBasedEnv(models=models, dataset=dataset_train, max_episode_steps=200, use_start_states=False)

In [7]:
obs, _ = sim_env.reset()
total_reward = 0
for _ in range(200):
    action = sim_env.action_space.sample()
    obs, reward, done, done, info = sim_env.step(action)
    total_reward += reward
    sim_env.render("human")
    if done:
        break
sim_env.close()
print(total_reward)

Step: 24, State history: [ 0.13584271  1.4454533   0.50406759 -0.04353851 -0.10645362 -0.05044999
  0.          0.        ]
Step: 25, State history: [ 0.13948215  1.4454533   0.49664563 -0.0505648  -0.11756632 -0.06748867
  0.          0.        ]
Step: 26, State history: [ 0.14363463  1.4454533   0.48446734 -0.07982541 -0.11896245  0.01715076
  0.          0.        ]
Step: 27, State history: [ 0.14799994  1.4454533   0.47276958 -0.1063061  -0.10479766  0.04932263
  0.          0.        ]
Step: 28, State history: [ 0.15311162  1.4454533   0.47790869 -0.12601496 -0.10870639  0.07045847
  0.          0.        ]
Step: 29, State history: [ 0.15726526  1.4454533   0.47446665 -0.14419209 -0.1022042   0.02351007
  0.          0.        ]
Step: 30, State history: [ 0.16126725  1.44440108  0.47670515 -0.13536394 -0.10906601 -0.00806893
  0.          0.        ]
Step: 31, State history: [ 0.16683203  1.44179361  0.4668234  -0.17482216 -0.11482928  0.05028876
  0.          0.        ]
Step: 32

In [9]:
#!/usr/bin/env python
import os
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
# Monkey-patch gym to include a __version__ attribute if it's missing.

# Set up log folder for monitoring
log_dir = "./logs_dir/"
os.makedirs(log_dir, exist_ok=True)

# Create the training environment and wrap it with a Monitor to log rewards.
train_env = sim_env
train_env = Monitor(train_env, log_dir)

# Create a separate evaluation environment.
eval_env = gymnasium.make("LunarLander-v3")
eval_env = Monitor(eval_env, log_dir)

# Set up the evaluation callback. This will evaluate the model every 5000 timesteps,
# and save the model if it achieves a new best mean reward.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
    log_path=log_dir,
    eval_freq=5000,
    n_eval_episodes=50,
    deterministic=False,
    render=False
)

model = PPO("MlpPolicy",
            env=train_env,
            n_steps=2000,
            batch_size=64,
            gamma=0.99,
            n_epochs=10,
            clip_range=0.2,
            verbose=1)

# 3 Layers with 64 neurons each policy network both
# 

# Train the model and use the evaluation callback to save the best model.
model.learn(total_timesteps=300000, callback=eval_callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -3.1e+04 |
| time/              |          |
|    fps             | 383      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2000     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 200           |
|    ep_rew_mean          | -1.91e+05     |
| time/                   |               |
|    fps                  | 377           |
|    iterations           | 2             |
|    time_elapsed         | 10            |
|    total_timesteps      | 4000          |
| train/                  |               |
|    approx_kl            | 0.00031341295 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.39   

KeyboardInterrupt: 

In [9]:
best_model = PPO.load(os.path.join(log_dir, 'best_model/best_model.zip'))

In [10]:

# Test the trained agent by running one episode and rendering it.
env = gymnasium.make("LunarLander-v3", render_mode="human")   
obs, _ = env.reset()
done = False
terminated = False
total_reward = 0
while not (done or terminated):
    # Predict the next action using the trained policy.
    action, _ = best_model.predict(obs, deterministic=False)
    obs, reward, done, terminated ,_ = env.step(action)
    total_reward += reward
    env.render()

env.close()
print(f"Total reward: {total_reward}")

2025-04-09 17:08:34.886 Python[1576:23869] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-09 17:08:34.887 Python[1576:23869] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Total reward: 49.49009550974978


In [11]:

# Test the trained agent by running one episode and rendering it.
env = gymnasium.make("LunarLander-v3", render_mode="rgb_array")  
all_rewards = []
for i in range(100):
    obs, _ = env.reset()
    done = False
    terminated = False
    total_reward = 0
    while not (done or terminated):
        # Predict the next action using the trained policy.
        action, _ = best_model.predict(obs, deterministic=False)
        obs, reward, done, terminated ,_ = env.step(action)
        total_reward += reward
        env.render()
    all_rewards.append(total_reward)
    print(f"Total reward: {total_reward}")
env.close()


Total reward: -5.288970231821338
Total reward: 26.286912093857694
Total reward: -12.856226673468612
Total reward: 12.946497926210256
Total reward: 38.60115129334602
Total reward: 18.78524829945077
Total reward: 35.285303981524294
Total reward: 14.526711042461073
Total reward: 17.797928607848448
Total reward: 42.06150815568901
Total reward: 28.097011554536124
Total reward: 36.0323057969143
Total reward: 62.95725373502157
Total reward: 22.9625340032443
Total reward: 9.385850641281905
Total reward: 85.13013116307755
Total reward: 56.36120356538578
Total reward: 27.501199298959094
Total reward: 35.1031880145066
Total reward: -8.861749677979049
Total reward: 10.024359557117094


KeyboardInterrupt: 

In [13]:
import numpy as np
# 1 model
np.mean(all_rewards)

56.31434120791784

# High Level Application

In [29]:
from pi_optimal.planners.online_planner import OnlinePlanner
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

sim_env = ModelBasedEnv(models=[nn_model, nn_model2, nn_model3], dataset=dataset_train, max_episode_steps=200)
eval_env = gymnasium.make("LunarLander-v3")

online_planner = OnlinePlanner(env=sim_env, eval_env=None, train_params={"total_timesteps": 6000}, eval_params={"n_eval_episodes": 50, "eval_freq": 5000})

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 147       |
|    ep_rew_mean     | -2.01e+06 |
| time/              |           |
|    fps             | 177       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 161           |
|    ep_rew_mean          | -1.2e+06      |
| time/                   |               |
|    fps                  | 167           |
|    iterations           | 2             |
|    time_elapsed         | 24            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.2118835e-07 |
|    clip_fraction        | 0             |
|    clip_range           | 

### Doing Inference from a dataset class

For details  look in to the predict function of the planer. It takes the last observation and uses it as observation. Ensure that the dataset is set to **is_inference == True** .

In [48]:
df_inf = data_collector.collect(n_steps=50, max_steps_per_episode=200, env_seed=None, action_seed=None)

dataset_inf = dataset_test = TimeseriesDataset(
    df=df_inf,
    dataset_config=dataset_config,
    train_processors=False,
    is_inference=True
)

Collecting steps: 100%|██████████| 50/50 [00:00<00:00, 5438.95it/s]


In [52]:
online_planner.plan(dataset_inf)

array([1])