# Setup Environment and Dependencies
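Import the required libraries, including NumPy and the pi_optimal utilities (the optional Intel extension for scikit-learn is left commented out). Change the working directory to the parent directory and configure warning suppression.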

In [4]:
# Setup Environment and Dependencies

import os
import numpy as np
#from sklearnex import patch_sklearn
import warnings

# Change directory to the parent directory.
# A bare os.chdir("..") is relative to the *current* directory, so re-running
# this cell would climb one level further every time. Remember the starting
# directory on the first run and always resolve the parent from it, which
# gives the same result on first execution and makes the cell safe to re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, ".."))

# Apply Intel extension to sklearn
#patch_sklearn()

# Suppress warnings (blanket filter: every warning category is silenced)
warnings.filterwarnings('ignore')

# Import pi_optimal utilities
from pi_optimal.utils.data_generators.gym_data_generator import GymDataGenerator
from pi_optimal.datasets.timeseries_dataset import TimeseriesDataset
from pi_optimal.models.sklearn.random_forest_model import RandomForest
from pi_optimal.models.sklearn.mlp import NeuralNetwork
from pi_optimal.evaluators.base_evaluator import BaseEvaluator
from pi_optimal.evaluators.plotting import plot_n_step_evaluation, plot_n_step_episode_rollout

# Create Gym Data Generator
Initialize GymDataGenerator with LunarLander environment and collect training and test data with specified parameters.

In [5]:
# Create Gym Data Generator

# Generator bound to the LunarLander environment.
data_collector = GymDataGenerator(env_name="LunarLander-v3")

# Settings shared by the training and the test collection run.
# NOTE(review): both seeds are None, so the collected data presumably differs
# between runs - confirm against GymDataGenerator.collect.
shared_collect_kwargs = dict(max_steps_per_episode=200, env_seed=None, action_seed=None)

# Collect 10000 steps for training and 5000 steps for testing.
df_train = data_collector.collect(n_steps=10000, **shared_collect_kwargs)
df_test = data_collector.collect(n_steps=5000, **shared_collect_kwargs)

Collecting steps:   7%|▋         | 736/10000 [00:00<00:01, 7358.22it/s]

Collecting steps: 100%|██████████| 10000/10000 [00:00<00:00, 10852.98it/s]
Collecting steps: 100%|██████████| 5000/5000 [00:00<00:00, 13159.02it/s]


# Configure Dataset Parameters
Set up dataset configuration dictionary defining features, processors, and evaluation metrics for states, actions, and rewards.

In [6]:
# Configure Dataset Parameters


def _robust_scaled_state(feature_name):
    """Build the entry for a numerical state scaled by RobustScaler over the (5, 95) quantile range."""
    return {
        "name": feature_name,
        "type": "numerical",
        "processor": {"name": "RobustScaler", "params": {"quantile_range": (5.0, 95.0)}},
        "evaluation_metric": "mae",
    }


def _binary_state(feature_name):
    """Build the entry for an unprocessed binary state evaluated with the binary F1 score."""
    return {
        "name": feature_name,
        "type": "binary",
        "processor": None,
        "evaluation_metric": "f1_binary",
    }


# State features are keyed by their index; insertion order follows the index.
state_features = {
    0: {"name": "state_0", "type": "numerical", "processor": {"name": "StandardScaler"}, "evaluation_metric": "mae"},
}
for state_idx in range(1, 6):
    state_features[state_idx] = _robust_scaled_state(f"state_{state_idx}")
for state_idx, binary_name in ((6, "state_6"), (7, "state_7"), (8, "done")):
    state_features[state_idx] = _binary_state(binary_name)
state_features[9] = {
    "name": "reward",
    "type": "numerical",
    "processor": {"name": "PowerTransformer"},
    "evaluation_metric": "mae",
}

# Single categorical action, one-hot encoded.
action_features = {
    0: {"name": "action_0", "type": "categorial", "processor": {"name": "OneHotEncoder"}},
}

# Define dataset configuration dictionary
dataset_config = {
    "episode_column": "episode",
    "timestep_column": "step",
    "states": state_features,
    "actions": action_features,
    "reward_feature_idx": 9,
    "reward_vector_idx": 9,
    "reward_column": "reward",
}

# Create Training and Test Datasets
Initialize TimeseriesDataset objects with collected data, applying the configuration and setting lookback/forecast windows.

In [7]:
# Create Training and Test Datasets

# Window sizes: how many past timesteps are fed in and how many are forecast.
LOOKBACK_TIMESTEPS = 10
FORECAST_TIMESTEPS = 1

# Arguments that are identical for the training and the test dataset.
shared_dataset_kwargs = dict(
    dataset_config=dataset_config,
    lookback_timesteps=LOOKBACK_TIMESTEPS,
    forecast_timesteps=FORECAST_TIMESTEPS,
)

# Training dataset: processors are trained here (train_processors=True).
dataset_train = TimeseriesDataset(df=df_train, train_processors=True, **shared_dataset_kwargs)

# Test dataset: processors are not trained again (train_processors=False).
# NOTE(review): presumably it reuses the processors fitted for dataset_train
# through the shared dataset_config - confirm in TimeseriesDataset.
dataset_test = TimeseriesDataset(df=df_test, train_processors=False, **shared_dataset_kwargs)

# Train Neural Network Model
Create and train three Neural Network models with identical hyperparameters on the training dataset. All three are later passed together to the model-based environment.

In [8]:
# Hyperparameters of the first neural network.
nn_model1_params = dict(
    hidden_layer_sizes=(128, 128),
    alpha=0.01,
    learning_rate_init=0.001,
)

# Build the model from its hyperparameters and fit it on the training dataset.
nn_model1 = NeuralNetwork(nn_model1_params)
nn_model1.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Hyperparameters of the second neural network.
nn_model2_params = dict(
    hidden_layer_sizes=(128, 128),
    alpha=0.01,
    learning_rate_init=0.001,
)

# Build the model from its hyperparameters and fit it on the training dataset.
nn_model2 = NeuralNetwork(nn_model2_params)
nn_model2.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
# Hyperparameters of the third neural network.
nn_model3_params = dict(
    hidden_layer_sizes=(128, 128),
    alpha=0.01,
    learning_rate_init=0.001,
)

# Build the model from its hyperparameters and fit it on the training dataset.
nn_model3 = NeuralNetwork(nn_model3_params)
nn_model3.fit(dataset_train)

Training models...:   0%|          | 0/10 [00:00<?, ?it/s]

# Low Level Workflow

Here you can see how it works under the hood.

In [11]:
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# Simulated environment driven by the three trained networks, built on the
# training dataset and limited to 200 steps per episode.
trained_models = [nn_model1, nn_model2, nn_model3]
sim_env = ModelBasedEnv(
    models=trained_models,
    dataset=dataset_train,
    max_episode_steps=200,
)

In [12]:
# Roll out one episode in the simulated environment using random actions and
# print the accumulated reward.
obs, _ = sim_env.reset()
total_reward = 0
for _ in range(200):
    action = sim_env.action_space.sample()
    # step() yields five values; the third and fourth are separate end-of-episode
    # flags (terminated, truncated in the Gymnasium API). Binding both to the
    # same name kept only the last one, so the first flag was silently ignored.
    obs, reward, terminated, truncated, info = sim_env.step(action)
    total_reward += reward
    sim_env.render("human")
    # Stop as soon as either flag signals the end of the episode.
    if terminated or truncated:
        break
sim_env.close()
print(total_reward)

Step: 4, State history: [-0.00799365  1.43109277 -0.43077302  0.36261339 -0.02471587  0.07597064
  0.          0.        ]
Step: 5, State history: [-0.020135    1.42896844 -0.47974141  0.34353693 -0.04914006  0.12348212
  0.          0.        ]
Step: 6, State history: [-0.02063704  1.49531107 -0.5347775   0.36369197  0.00942229  0.06014732
  0.          0.        ]
Step: 7, State history: [-0.0441081   1.4740501  -0.53140469  0.34319195  0.0581589   0.05496297
  0.          0.        ]
Step: 8, State history: [-0.03686068  1.43536405 -0.46924847  0.30872668  0.04788847  0.13928711
  0.          0.        ]
Step: 9, State history: [-0.04486147  1.45026552 -0.45808422  0.24156467  0.05471485  0.16023316
  0.          0.        ]
Step: 10, State history: [-0.05183768  1.51144444 -0.4933244   0.26615997  0.08147543  0.11819338
  0.          0.        ]
Step: 11, State history: [-0.04323971  1.48465514 -0.51441555  0.18004638  0.06444473  0.13596467
  0.          0.        ]
Step: 12, Stat

In [13]:
import os
import gymnasium
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import TD3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

# Set up log folder for monitoring
log_dir = "./logs_dir/"
os.makedirs(log_dir, exist_ok=True)

# Create the training environment and wrap it with a Monitor to log rewards.
# Monitor turns a directory path into "<dir>/monitor.csv", so handing the bare
# log_dir to both monitors would make them write to the same file. Each monitor
# gets its own file prefix instead ("train.monitor.csv" / "eval.monitor.csv").
train_env = sim_env
train_env = Monitor(train_env, os.path.join(log_dir, "train"))

# Create a separate evaluation environment (the real LunarLander, not the
# simulated one) with its own monitor file.
eval_env = gymnasium.make("LunarLander-v3")
eval_env = Monitor(eval_env, os.path.join(log_dir, "eval"))

# Set up the evaluation callback. This will evaluate the model every 5000 timesteps,
# and save the model if it achieves a new best mean reward.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
    log_path=log_dir,
    eval_freq=5000,
    n_eval_episodes=5,
    deterministic=False,
    render=False
)

# PPO agent with an MLP policy, trained on the monitored simulated environment.
model = PPO("MlpPolicy", train_env, verbose=1)


# Train the model and use the evaluation callback to save the best model.
model.learn(total_timesteps=30000, callback=eval_callback)

Using cuda device
Wrapping the env in a DummyVecEnv.


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49       |
|    ep_rew_mean     | -171     |
| time/              |          |
|    fps             | 161      |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 48.9        |
|    ep_rew_mean          | -155        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 2           |
|    time_elapsed         | 30          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005098087 |
|    clip_fraction        | 0.015       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 0.00506     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x7de8df360c70>

In [14]:
# Load the checkpoint stored under <log_dir>/best_model.
best_model_path = os.path.join(log_dir, 'best_model/best_model.zip')
best_model = PPO.load(best_model_path)

In [15]:

# Test the trained agent by running one episode and rendering it.
env = gymnasium.make("LunarLander-v3", render_mode="human")
try:
    obs, _ = env.reset()
    # env.step() returns (obs, reward, terminated, truncated, info); the flags
    # are named after what they actually hold.
    terminated = False
    truncated = False
    total_reward = 0
    while not (terminated or truncated):
        # Predict the next action using the trained policy (sampled, not greedy).
        action, _ = best_model.predict(obs, deterministic=False)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # No explicit env.render() call: with render_mode="human" Gymnasium
        # renders during step(), so an extra call is redundant.
finally:
    # Release the window even if the rollout raises.
    env.close()
print(f"Total reward: {total_reward}")

Total reward: -251.36562273417067


In [16]:

# Evaluate the trained agent over 100 episodes on the real environment and
# collect the total reward of each episode in all_rewards.
N_EVAL_EPISODES = 100

env = gymnasium.make("LunarLander-v3", render_mode="rgb_array")
all_rewards = []
try:
    for i in range(N_EVAL_EPISODES):
        obs, _ = env.reset()
        # env.step() returns (obs, reward, terminated, truncated, info).
        terminated = False
        truncated = False
        total_reward = 0
        while not (terminated or truncated):
            # Predict the next action using the trained policy (sampled, not greedy).
            action, _ = best_model.predict(obs, deterministic=False)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # env.render() is intentionally not called: in "rgb_array" mode it
            # only returns a frame, and that frame was never used.
        all_rewards.append(total_reward)
        print(f"Total reward: {total_reward}")
finally:
    # Always release the environment, even if an episode raises.
    env.close()


Total reward: -356.7004370579849
Total reward: -205.0672176445213
Total reward: -185.42442994538737
Total reward: -180.583259349532
Total reward: -74.3874245609765
Total reward: -131.11515657040428
Total reward: -328.54980789341494
Total reward: -15.775288461501205
Total reward: -250.72430317604957
Total reward: -344.161095402511
Total reward: -378.00936461893
Total reward: -163.0313261288647
Total reward: -344.6415081425979
Total reward: -193.39735169148756
Total reward: -84.7045404038461
Total reward: -87.1937146048694
Total reward: -51.40288570194947
Total reward: -120.21858425173662
Total reward: -259.98012877021154
Total reward: -186.32341640938367
Total reward: -114.7653707964826
Total reward: -275.9906541269082
Total reward: -53.82357769370684
Total reward: -80.98208635387768
Total reward: -191.41037610373314
Total reward: -329.2542637683055
Total reward: -158.78980841539396
Total reward: -204.61298171309147
Total reward: -312.3517801164194
Total reward: -142.3654677531664
Total

In [13]:
import numpy as np

# Average of the per-episode total rewards gathered in all_rewards
# (original note: "1 model").
mean_reward = np.mean(all_rewards)
mean_reward

NameError: name 'all_rewards' is not defined

# High Level Application

In [17]:
from pi_optimal.planners.online_planner import OnlinePlanner
from pi_optimal.utils.gym_wrapper.model_based_env import ModelBasedEnv

# Simulated environment backed by the three trained networks.
sim_env = ModelBasedEnv(
    models=[nn_model1, nn_model2, nn_model3],
    dataset=dataset_train,
    max_episode_steps=200,
)
eval_env = gymnasium.make("LunarLander-v3")

# NOTE(review): eval_env is created above, yet None is passed as eval_env below
# even though eval_params are supplied - confirm whether eval_env=eval_env was
# intended.
planner_train_params = {"total_timesteps": 11000}
planner_eval_params = {"n_eval_episodes": 2, "eval_freq": 5000}
online_planner = OnlinePlanner(
    env=sim_env,
    eval_env=None,
    train_params=planner_train_params,
    eval_params=planner_eval_params,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.1     |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 165      |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 49.1        |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 2           |
|    time_elapsed         | 30          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005396694 |
|    clip_fraction        | 0.0209      |
|    clip_range           | 0.2         |
|    entropy_loss  

### Doing Inference from a dataset class

For details, look at the predict function of the planner. It takes the last observation and uses it as the current observation. Ensure that the dataset is created with `is_inference=True`.

In [18]:
# Collect a short trajectory (50 steps) to run inference on.
df_inf = data_collector.collect(n_steps=50, max_steps_per_episode=200, env_seed=None, action_seed=None)

# Build the inference dataset without re-training the processors
# (train_processors=False, is_inference=True).
# The previous chained assignment `dataset_inf = dataset_test = ...` also
# rebound dataset_test, silently replacing the test dataset with this one;
# only dataset_inf is assigned now.
dataset_inf = TimeseriesDataset(
    df=df_inf,
    dataset_config=dataset_config,
    train_processors=False,
    is_inference=True
)

Collecting steps: 100%|██████████| 50/50 [00:00<00:00, 6651.92it/s]


In [19]:
# Ask the planner for the action to take given the inference dataset.
planned_action = online_planner.plan(dataset_inf)
planned_action

array([3])