# Setup Environment and Dependencies
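Import the required libraries (NumPy and the pi_optimal utilities), change the working directory to the parent directory, and suppress warnings. The Intel extension for scikit-learn (`sklearnex`) is optional and is currently commented out.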

In [1]:
# Setup Environment and Dependencies

import os
import numpy as np
#from sklearnex import patch_sklearn
import warnings

# Change directory to the parent directory.
# The directory the notebook started in is remembered on the first run, so
# re-running this cell does not climb one more level each time.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))

# Apply Intel extension to sklearn (optional; needs the sklearnex import above)
#patch_sklearn()

# Suppress warnings
warnings.filterwarnings('ignore')

# Import pi_optimal utilities
from pi_optimal.utils.data_generators.gym_data_generator import GymDataGenerator
from pi_optimal.datasets.timeseries_dataset import TimeseriesDataset
from pi_optimal.models.sklearn.random_forest_model import RandomForest
from pi_optimal.models.sklearn.mlp import NeuralNetwork
from pi_optimal.evaluators.base_evaluator import BaseEvaluator
from pi_optimal.evaluators.plotting import plot_n_step_evaluation, plot_n_step_episode_rollout

# Create Gym Data Generator
Initialize GymDataGenerator with LunarLander environment and collect training and test data with specified parameters.

In [2]:
# Create Gym Data Generator

# Settings shared by both collection runs; both seeds are left unset (None).
collect_kwargs = dict(max_steps_per_episode=200, env_seed=None, action_seed=None)

# Data generator bound to the LunarLander environment
data_collector = GymDataGenerator(env_name="LunarLander-v3")

# Collect the training split first, then the test split
df_train = data_collector.collect(n_steps=10000, **collect_kwargs)
df_test = data_collector.collect(n_steps=5000, **collect_kwargs)

Collecting steps: 100%|██████████| 10000/10000 [00:00<00:00, 49769.26it/s]
Collecting steps: 100%|██████████| 5000/5000 [00:00<00:00, 48232.34it/s]


# Configure Dataset Parameters
Set up dataset configuration dictionary defining features, processors, and evaluation metrics for states, actions, and rewards.

In [3]:
# Configure Dataset Parameters

def _numerical_state(name, scaler="RobustScaler"):
    """Return the config entry for a numerical state feature evaluated with MAE.

    A fresh processor dict is built on every call so that no two entries
    share the same object.
    """
    processor = {"name": scaler}
    if scaler == "RobustScaler":
        processor["params"] = {"quantile_range": (5.0, 95.0)}
    return {"name": name, "type": "numerical", "processor": processor, "evaluation_metric": "mae"}


def _binary_state(name):
    """Return the config entry for an unprocessed binary state feature evaluated with binary F1."""
    return {"name": name, "type": "binary", "processor": None, "evaluation_metric": "f1_binary"}


# State entries in index order (their position becomes the key in "states").
_state_entries = [_numerical_state("state_0", scaler="StandardScaler")]
_state_entries += [
    _numerical_state(name)
    for name in ("state_1", "state_2", "state_3", "state_4", "state_5")
]
_state_entries += [_binary_state(name) for name in ("state_6", "state_7", "done")]
_state_entries.append(_numerical_state("reward"))

# Define dataset configuration dictionary
dataset_config = {
    "episode_column": "episode",
    "timestep_column": "step",
    "states": dict(enumerate(_state_entries)),
    "actions": {
        0: {"name": "action_0", "type": "categorial", "processor": {"name": "OneHotEncoder"}},
    },
    "reward_feature_idx": 9,
    "reward_vector_idx": 9,
    "reward_column": "reward",
}

# Create Training and Test Datasets
Initialize TimeseriesDataset objects with collected data, applying the configuration and setting lookback/forecast windows.

In [4]:
# Create Training and Test Datasets

# Window sizes used for both splits
LOOKBACK_TIMESTEPS = 10
FORECAST_TIMESTEPS = 1

# Arguments that are identical for the training and the test dataset
shared_dataset_kwargs = dict(
    dataset_config=dataset_config,
    lookback_timesteps=LOOKBACK_TIMESTEPS,
    forecast_timesteps=FORECAST_TIMESTEPS,
)

# train_processors is enabled for the training split only
dataset_train = TimeseriesDataset(df=df_train, train_processors=True, **shared_dataset_kwargs)
dataset_test = TimeseriesDataset(df=df_test, train_processors=False, **shared_dataset_kwargs)

# Train Neural Network Model
Create and train ten Neural Network models with identical hyperparameters on the training dataset. All ten are later passed to `ModelBasedEnv`.

In [5]:

# Model 1 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model1 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model1.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:

# Model 2 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model2 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model2.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [7]:

# Model 3 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model3 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model3.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [8]:

# Model 4 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model4 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model4.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [9]:

# Model 5 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model5 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model5.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:

# Model 6 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model6 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model6.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:

# Model 7 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model7 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model7.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [12]:

# Model 8 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model8 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model8.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [13]:

# Model 9 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model9 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model9.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [14]:

# Model 10 of 10: NeuralNetwork with hidden_layer_sizes=(128, 128)
nn_model10 = NeuralNetwork(
    params=dict(
        hidden_layer_sizes=(128, 128),
        alpha=0.01,
        learning_rate_init=0.001,
    )
)

# Fit the model on the training dataset
nn_model10.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

# Low Level Workflow

Here you can see how it works under the hood.

In [15]:
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# All ten trained models back the simulated environment.
trained_models = [
    nn_model1, nn_model2, nn_model3, nn_model4, nn_model5,
    nn_model6, nn_model7, nn_model8, nn_model9, nn_model10,
]
sim_env = ModelBasedEnv(
    models=trained_models,
    dataset=dataset_train,
    max_episode_steps=200,
    use_start_states=True,
)

In [16]:
# Roll out one episode (at most 200 steps) in the simulated environment
# using randomly sampled actions, and print the accumulated reward.
obs, _ = sim_env.reset()
total_reward = 0
for _ in range(200):
    action = sim_env.action_space.sample()
    # step() returns five values; the original bound both flags to the same
    # name (`done, done`), so the first flag was discarded and only the
    # second one could end the loop.
    obs, reward, terminated, truncated, info = sim_env.step(action)
    total_reward += reward
    sim_env.render("human")
    done = terminated or truncated
    if done:
        break
sim_env.close()
print(total_reward)

Step: 12, State history: [-0.00470073  1.34486113 -0.07679896 -0.38653565  0.05215874  0.08994773
  0.          0.        ]
Step: 13, State history: [-0.01410913  1.32018666 -0.04364895 -0.41056232  0.05746433  0.1645839
  0.          0.        ]
Step: 14, State history: [-0.00573451  1.34256692 -0.03274342 -0.41341624  0.05369352  0.1382038
  0.          0.        ]
Step: 15, State history: [-0.00135175  1.31640741 -0.05056884 -0.43816871  0.06746983  0.12415782
  0.          0.        ]
Step: 16, State history: [-0.00618959  1.30754704 -0.0563117  -0.44799178  0.05307124  0.10002305
  0.          0.        ]
Step: 17, State history: [-0.01331905  1.29178258 -0.07776237 -0.42933553  0.06300507  0.12323796
  0.          0.        ]
Step: 18, State history: [-0.00783595  1.28365262 -0.04696333 -0.46053702  0.07305327  0.14354231
  0.          0.        ]
Step: 19, State history: [-0.0084333   1.27871331 -0.05872541 -0.47863985  0.09516768  0.13996049
  0.          0.        ]
Step: 20, 

In [17]:
#!/usr/bin/env python
import os
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
# Set up log folder for monitoring
log_dir = "./logs_dir/"
os.makedirs(log_dir, exist_ok=True)

# Wrap the simulated environment with a Monitor to log training rewards.
# Each Monitor gets its own file prefix: when both are given the bare
# directory they write to the same "<log_dir>/monitor.csv" and overwrite
# each other's episode logs.
train_env = Monitor(sim_env, os.path.join(log_dir, "train"))

# Create a separate evaluation environment (the real LunarLander).
eval_env = Monitor(gymnasium.make("LunarLander-v3"), os.path.join(log_dir, "eval"))

# Set up the evaluation callback. This will evaluate the model every 5000 timesteps,
# and save the model if it achieves a new best mean reward.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
    log_path=log_dir,
    eval_freq=5000,
    n_eval_episodes=50,
    deterministic=False,
    render=False
)

# NOTE(review): clip_range=0.999 is far above the commonly used 0.2; the
# training log reports clip_fraction 0 — confirm this value is intentional.
# TODO(review): an earlier note here mentioned "3 layers with 64 neurons each"
# for the policy networks; no policy_kwargs are passed, so the MlpPolicy
# defaults are used.
model = PPO("MlpPolicy",
            env=train_env,
            n_steps=2048,
            gamma=0.99,
            n_epochs=10,
            clip_range=0.999,
            verbose=1)

# Train the model and use the evaluation callback to save the best model.
model.learn(total_timesteps=300000, callback=eval_callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 86.8     |
|    ep_rew_mean     | -171     |
| time/              |          |
|    fps             | 187      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 88.3        |
|    ep_rew_mean          | -377        |
| time/                   |             |
|    fps                  | 172         |
|    iterations           | 2           |
|    time_elapsed         | 23          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014061442 |
|    clip_fraction        | 0           |
|    clip_range           | 0.999       |
|    entropy_loss         | -1.38       |
|    explained_varia

KeyboardInterrupt: 

In [18]:
# Load the saved best model from the 'best_model' folder inside the log directory.
best_model_path = os.path.join(log_dir, 'best_model/best_model.zip')
best_model = PPO.load(best_model_path)

In [19]:

# Test the trained agent by running one episode in the real environment and rendering it.
env = gymnasium.make("LunarLander-v3", render_mode="human")
obs, _ = env.reset()
# step() yields the terminated flag first and the truncated flag second;
# the names below follow that order (the original had them shifted by one).
terminated = False
truncated = False
total_reward = 0
try:
    while not (terminated or truncated):
        # Predict the next action using the trained policy.
        action, _ = best_model.predict(obs, deterministic=False)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        env.render()
finally:
    # Close the render window even if the episode is interrupted.
    env.close()
print(f"Total reward: {total_reward}")

2025-03-28 16:44:08.856 Python[7548:401244] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-28 16:44:08.856 Python[7548:401244] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Total reward: 276.47738249932047


In [20]:

# Evaluate the trained agent over 100 episodes in the real environment and
# collect each episode's total reward in `all_rewards`.
env = gymnasium.make("LunarLander-v3", render_mode="rgb_array")
all_rewards = []
try:
    for i in range(100):
        obs, _ = env.reset()
        # step() yields the terminated flag first and the truncated flag second;
        # the names below follow that order (the original had them shifted by one).
        terminated = False
        truncated = False
        total_reward = 0
        while not (terminated or truncated):
            # Predict the next action using the trained policy.
            action, _ = best_model.predict(obs, deterministic=False)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # NOTE(review): the returned frame is discarded; if no frames are
            # needed, consider dropping render_mode and this call together.
            env.render()
        all_rewards.append(total_reward)
        print(f"Total reward: {total_reward}")
finally:
    # Release the environment even if the evaluation is interrupted.
    env.close()


Total reward: 274.01828930464137
Total reward: 23.67992769688901
Total reward: 16.005903790004222
Total reward: 271.4731485612306
Total reward: 222.98293610548535
Total reward: 271.04334930186394
Total reward: 59.97551668656732
Total reward: 234.0408235007612
Total reward: 290.0745296086543
Total reward: 226.18771030630424
Total reward: 101.59305895282387
Total reward: 289.3743976757225
Total reward: 40.37370258956648
Total reward: 258.8458143657387
Total reward: 45.0012761598185
Total reward: 74.40971576479845
Total reward: 0.6525354259194933
Total reward: 258.19833158768483
Total reward: 70.02118823812222
Total reward: 272.85355399447565
Total reward: 17.713849126552276
Total reward: 219.66683689786572
Total reward: 279.9356451213092
Total reward: 281.39704523955686
Total reward: -148.46578093843607
Total reward: 38.370883120157835
Total reward: 48.535443095708345
Total reward: 239.2377544720301
Total reward: 60.58952525979842
Total reward: 19.006608262262063
Total reward: 43.1000395

In [22]:
import numpy as np
# Mean total reward over the evaluation episodes (original annotation: "1 model")
np.asarray(all_rewards).mean()

173.28859287085692

# High Level Application

In [29]:
from pi_optimal.planners.online_planner import OnlinePlanner
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# Build a simulated environment from three of the trained models.
# The original list started with `nn_model`, a name that is never defined in
# this notebook; the trained models are nn_model1 ... nn_model10.
sim_env = ModelBasedEnv(models=[nn_model1, nn_model2, nn_model3], dataset=dataset_train, max_episode_steps=200)
eval_env = gymnasium.make("LunarLander-v3")

# NOTE(review): eval_env is created above but eval_env=None is passed here,
# although eval_params are supplied — confirm whether evaluation was intended.
online_planner = OnlinePlanner(env=sim_env, eval_env=None, train_params={"total_timesteps": 6000}, eval_params={"n_eval_episodes": 50, "eval_freq": 5000})

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 147       |
|    ep_rew_mean     | -2.01e+06 |
| time/              |           |
|    fps             | 177       |
|    iterations      | 1         |
|    time_elapsed    | 11        |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 161           |
|    ep_rew_mean          | -1.2e+06      |
| time/                   |               |
|    fps                  | 167           |
|    iterations           | 2             |
|    time_elapsed         | 24            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 1.2118835e-07 |
|    clip_fraction        | 0             |
|    clip_range           | 

### Doing Inference from a dataset class

For details, look into the predict function of the planner. It takes the last observation in the dataset and uses it as the current observation. Ensure that the dataset is created with **is_inference=True**.

In [48]:
# Collect a short trajectory to use as inference input.
df_inf = data_collector.collect(n_steps=50, max_steps_per_episode=200, env_seed=None, action_seed=None)

# Build the inference dataset with the existing configuration.
# The original used a chained assignment (`dataset_inf = dataset_test = ...`)
# that also overwrote the held-out test dataset; only dataset_inf is bound now.
# NOTE(review): no lookback/forecast window is passed here, unlike for the
# training and test datasets — confirm the defaults are what the planner expects.
dataset_inf = TimeseriesDataset(
    df=df_inf,
    dataset_config=dataset_config,
    train_processors=False,
    is_inference=True
)

Collecting steps: 100%|██████████| 50/50 [00:00<00:00, 5438.95it/s]


In [52]:
# Ask the planner for its output on the inference dataset; the result is
# displayed as the cell output.
online_planner.plan(dataset_inf)

array([1])