# 0. Setup

In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (9, 4)

import normet as nm

In [2]:
# Optional: enable concise INFO-level logging for this session
# (the scrolling output box is handy in notebooks).
from normet.utils.logging import enable_default_logging, get_logger
enable_default_logging(level="INFO")
# Named logger for this tutorial's own messages.
log = get_logger("tutorial.model_training")

# 1. Data: load and prepare

In [3]:
# Read the raw CSV; the 'date' column is parsed as datetimes and used as the index.
df = pd.read_csv(
    "data_MY1_data.csv",
    index_col="date",
    parse_dates=["date"],
)

In [4]:
# List the available column names to pick the target and the predictors.
df.columns

Index(['O3', 'NO', 'NO2', 'NOXasNO2', 'SO2', 'CO', 'PM10', 'NV10', 'V10',
       'PM2.5', 'NV2.5', 'V2.5', 'ETHANE', 'ETHENE', 'ETHYNE', 'PROPANE',
       'PROPENE', 'iBUTANE', 'nBUTANE', '1BUTENE', 't2BUTENE', 'c2BUTENE',
       'iPENTANE', 'nPENTANE', 't2PENTEN', '1PENTEN', '2MEPENT', 'ISOPRENE',
       'nHEXANE', 'nHEPTANE', 'iOCTANE', 'nOCTANE', 'BENZENE', 'TOLUENE',
       'ETHBENZ', 'mpXYLENE', 'oXYLENE', '124TMB', '135TMB', 'wd', 'ws',
       'temp', 'AT10', 'AP10', 'AT2.5', 'AP2.5', 'site', 'code', 'latitude',
       'longitude', 'location_type', 'Ox', 'NOx', 'u10', 'v10', 'd2m', 't2m',
       'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m', 'lat', 'lon'],
      dtype='object')

In [5]:
# Build the modelling table in one step: keep the target ('PM2.5') and the
# listed predictors, add the derived time variables, and label each row as
# training or testing. The split settings are left at the function defaults.
df_prep = nm.prepare_data(
    df,
    value="PM2.5",
    feature_names=[
        "u10", "v10", "d2m", "t2m", "blh",
        "sp", "ssrd", "tcc", "tp", "rh2m",
    ],
)

INFO | normet.normet.utils.prepare | Dropped 2411 rows with NA in target.
INFO | normet.normet.utils.prepare | Prepared data: 6373 rows, 17 columns


In [6]:
# IPython help syntax: show the signature and docstring of nm.prepare_data.
?nm.prepare_data

[31mSignature:[39m
nm.prepare_data(
    df: [33m'pd.DataFrame'[39m,
    value: [33m'str'[39m,
    feature_names: [33m'List[str]'[39m,
    na_rm: [33m'bool'[39m = [38;5;28;01mTrue[39;00m,
    split_method: [33m'str'[39m = [33m'random'[39m,
    fraction: [33m'float'[39m = [32m0.75[39m,
    seed: [33m'int'[39m = [32m7654321[39m,
) -> [33m'pd.DataFrame'[39m
[31mDocstring:[39m
Clean, validate, and split the input DataFrame in a single pipeline.

Steps:
  1) Ensure a datetime column named ``date`` is present.
  2) Validate target and features.
  3) Impute/drop missing values.
  4) Add derived date variables (unix, julian day, weekday, hour).
  5) Split into training/testing sets.

Parameters
----------
df : pandas.DataFrame
    Raw input dataset containing at least the target column and date/time info.
value : str
    Target column name in ``df``.
feature_names : list of str
    Predictor variable names to keep (must exist in ``df``).
na_rm : bool, default True
  

In [7]:
# Check the prepared table's columns: predictors, 'date', the target as 'value',
# the derived time variables, and the 'set' label.
df_prep.columns

Index(['blh', 'd2m', 'rh2m', 'sp', 'ssrd', 't2m', 'tcc', 'tp', 'u10', 'v10',
       'date', 'value', 'date_unix', 'day_julian', 'weekday', 'hour', 'set'],
      dtype='object')

In [8]:
# Persist the prepared table to CSV (row index not written).
df_prep.to_csv("data_df_prep.csv", index=False)

# 2. Training a model with the FLAML backend (lightweight, recommended default)

## 2.1 Quick start

In [9]:
# Target column name as it appears in df_prep.
target = 'value'

# Model inputs: the predictors passed to prepare_data, followed by the
# time variables that prepare_data added.
features = (
    ['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m']
    + ['date_unix', 'day_julian', 'weekday', 'hour']
)

In [10]:
# Quick baseline: restrict the search to a single estimator (LightGBM)
# and give it a small time budget.
flaml_quick_cfg = dict(
    time_budget=90,            # seconds for the search
    metric="r2",               # optimize R^2 (use "mae"/"mse" if preferred)
    estimator_list=["lgbm"],   # single estimator keeps things fast
    task="regression",
    eval_method="auto",
    save_model=True,
    folder_path='.',
    filename="automl",
)

# Train on the prepared table with the FLAML backend, using the quick config.
model_flaml = nm.train_model(
    df=df_prep,
    value=target,
    variables=features,
    backend="flaml",
    model_config=flaml_quick_cfg,
    seed=42,
    verbose=True,
)

INFO | normet.normet.backends.flaml_backend | Training FLAML AutoML: X shape=(4780, 14), target='value'
INFO | normet.normet.backends.flaml_backend | FLAML best_estimator=lgbm | best_config={'n_estimators': 1149, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': np.float64(0.2061003131290804), 'log_max_bin': 10, 'colsample_bytree': np.float64(0.9510000624084615), 'reg_alpha': np.float64(1.162262163142946), 'reg_lambda': np.float64(0.0034651839263365124)}
INFO | normet.normet.backends.flaml_backend | Saved FLAML model to automl.joblib


In [11]:
# Display the trained FLAML model object.
# NOTE(review): the repr printed below lists values such as time_budget=-1 and
# task='classification' that differ from flaml_quick_cfg — presumably these are
# constructor defaults rather than the settings used for fitting; confirm.
model_flaml

0,1,2
,time_budget,-1
,task,'classification'
,n_jobs,-1
,eval_method,'auto'
,split_ratio,0.1
,n_splits,5
,auto_augment,True
,metric,'auto'
,estimator_list,'auto'
,log_file_name,''


## 2.2 Save/load the FLAML model

In [12]:
# Even if "save_model" was not set to True in the training config, the model
# can still be saved afterwards with nm.save_model.
nm.save_model(model_flaml, folder_path=".", filename="automl.joblib")

INFO | normet.normet.backends.flaml_backend | Saved FLAML model to automl.joblib


'automl.joblib'

In [13]:
# IPython help syntax: show the signature and docstring of nm.save_model.
?nm.save_model

[31mSignature:[39m
nm.save_model(
    model: [33m'object'[39m,
    folder_path: [33m'Union[str, Path]'[39m = [33m'.'[39m,
    filename: [33m'str'[39m = [33m'automl.joblib'[39m,
) -> [33m'str'[39m
[31mDocstring:[39m
Save a trained model by delegating to the appropriate backend saver.

Parameters
----------
model : object
    Trained model with attribute ``backend`` in {"flaml", "h2o"}.
folder_path : str | Path, default="."
    Destination directory. Created if it does not exist.
filename : str, default="automl.joblib"
    Output name:
      - FLAML: exact filename, must end with ``.joblib`` or ``.pkl``.
      - H2O:   desired artifact name; handled by renaming after save.

Returns
-------
str
    Path to the saved artifact:
      - FLAML: full path to the ``.joblib``/``.pkl`` file.
      - H2O:   full path to the renamed model file.

Raises
------
AttributeError
    If model does not define a ``backend`` attribute.
TypeError
    If backend is unsupported.
[31mFile:[39m

In [14]:
# Reload the saved FLAML model from disk, replacing the in-memory object.
model_flaml = nm.load_model(folder_path=".", filename="automl.joblib")

INFO | normet.normet.backends.flaml_backend | Loaded FLAML model from automl.joblib


## 2.3 Model performance (FLAML)

In [15]:
# Compute evaluation statistics for the FLAML model on the prepared data.
modStat = nm.modStats(df_prep, model_flaml)

In [16]:
# Show the statistics table (one row per set: training, testing, all).
# In this run R2 is ~0.998 on training versus ~0.864 on testing, so judge the
# model by the testing row rather than by the training fit.
modStat

Unnamed: 0,n,FAC2,MB,MGE,RMSE,NMB,NMGE,COE,IOA,r,p_level,R2,set
0,4780,0.990153,-4e-05,0.253551,0.330469,-4e-06,0.027825,0.953687,0.976843,0.999207,***,0.998415,training
1,1593,0.88805,0.153261,2.052048,2.931742,0.016659,0.22305,0.626259,0.81313,0.929289,***,0.863578,testing
2,6373,0.964639,0.03828,0.703104,1.493437,0.004191,0.076975,0.87168,0.93584,0.983044,***,0.966376,all


## 2.4 Partial dependence (FLAML)

In [17]:
# Partial dependence for the FLAML model; no var_list is given here.
pdp_value = nm.pdp(df_prep, model_flaml)

In [18]:
# Preview the long-format result: variable, grid value, and the mean/std of the partial dependence.
pdp_value.head()

Unnamed: 0,variable,value,pdp_mean,pdp_std
0,blh,32.616693,19.345874,9.574783
1,blh,71.202291,18.257101,9.641589
2,blh,109.787888,17.079692,9.567257
3,blh,148.373486,14.047617,7.88942
4,blh,186.959084,12.244408,7.785849


In [19]:
# Re-list the prepared columns (same output as the earlier df_prep.columns cell)
# before choosing names for var_list in the next cell.
df_prep.columns

Index(['blh', 'd2m', 'rh2m', 'sp', 'ssrd', 't2m', 'tcc', 'tp', 'u10', 'v10',
       'date', 'value', 'date_unix', 'day_julian', 'weekday', 'hour', 'set'],
      dtype='object')

In [20]:
# Restrict the partial-dependence calculation to selected variables via var_list.
nm.pdp(
    df_prep,
    model_flaml,
    var_list=["blh", "rh2m"],
)

Unnamed: 0,variable,value,pdp_mean,pdp_std
0,blh,32.616693,19.345874,9.574783
1,blh,71.202291,18.257101,9.641589
2,blh,109.787888,17.079692,9.567257
3,blh,148.373486,14.047617,7.889420
4,blh,186.959084,12.244408,7.785849
...,...,...,...,...
95,rh2m,93.271444,8.961660,7.744341
96,rh2m,94.519172,9.170470,7.782532
97,rh2m,95.766900,9.505157,7.829908
98,rh2m,97.014628,9.367317,7.753671


# 3. Training a model with the H2O backend (heavier, Java required)

## 3.1 Quick config

In [21]:
# Same target and inputs as in the FLAML section, repeated so this section
# can be read on its own.
target = 'value'
features = (
    ['u10', 'v10', 'd2m', 't2m', 'blh', 'sp', 'ssrd', 'tcc', 'tp', 'rh2m']
    + ['date_unix', 'day_julian', 'weekday', 'hour']
)

# Quick H2O AutoML configuration: a small GBM-only search.
h2o_quick_cfg = dict(
    max_models=8,
    include_algos=["GBM"],   # keep small for speed; expand later
    sort_metric="RMSE",
    max_mem_size="8G",       # adjust to your machine/data
    save_model=True,
    folder_path='.',
    filename="automl",
)


# Train on the prepared table with the H2O backend, using the quick config.
model_h2o = nm.train_model(
    df=df_prep,
    value=target,
    variables=features,
    backend="h2o",
    model_config=h2o_quick_cfg,
    seed=42,
    verbose=False,
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,4 mins 46 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 21 days
H2O_cluster_name:,H2O_from_python_n94921cs_um8ihg
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.071 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,7


INFO | normet.normet.backends.h2o_backend | H2O cluster up | version=None | nthreads=? | mem=8G
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [22]:
# Display the trained H2O model: model summary, cross-validation metrics,
# scoring history and variable importances.
model_h2o

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,144.0,144.0,170038.0,10.0,10.0,10.0,25.0,237.0,89.3125

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,1.9981214,0.0528831,1.9989221,2.0229292,2.068196,1.9735426,1.9270169
mean_residual_deviance,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
mse,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
r2,0.8725205,0.0073285,0.8659208,0.8702673,0.8658085,0.8802241,0.8803817
residual_deviance,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
rmse,2.9291673,0.107479,3.0305183,2.9117684,3.0483973,2.8472602,2.8078923
rmsle,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2025-09-17 16:39:13,1.010 sec,0.0,8.2062346,5.4746640,67.3422868
,2025-09-17 16:39:13,1.028 sec,5.0,5.8383121,3.8835318,34.0858885
,2025-09-17 16:39:13,1.046 sec,10.0,4.3678206,2.9034754,19.0778568
,2025-09-17 16:39:13,1.061 sec,15.0,3.4818413,2.3349532,12.1232189
,2025-09-17 16:39:13,1.076 sec,20.0,2.9559543,2.0045670,8.7376659
,2025-09-17 16:39:13,1.091 sec,25.0,2.6157639,1.7935225,6.8422205
,2025-09-17 16:39:13,1.106 sec,30.0,2.3602398,1.6476786,5.5707320
,2025-09-17 16:39:13,1.119 sec,35.0,2.2005216,1.5521328,4.8422954
,2025-09-17 16:39:13,1.135 sec,40.0,2.0714786,1.4724228,4.2910236
,2025-09-17 16:39:13,1.150 sec,45.0,1.9631899,1.3987594,3.8541145

variable,relative_importance,scaled_importance,percentage
blh,378323.125,1.0,0.2597728
date_unix,185788.03125,0.4910829,0.12757
u10,148465.5,0.3924304,0.1019428
d2m,142141.15625,0.3757136,0.0976002
day_julian,111503.5390625,0.294731,0.0765631
sp,105021.5078125,0.2775974,0.0721123
v10,91910.7890625,0.2429426,0.0631099
t2m,65793.9921875,0.1739095,0.045177
weekday,61490.7851562,0.1625351,0.0422222
hour,47388.5429688,0.1252594,0.032539


## 3.2 Save/load the H2O model

In [23]:
# Even if "save_model" was not set to True in the training config, the model
# can still be saved afterwards with nm.save_model.
nm.save_model(model_h2o, folder_path=".", filename="automl")

'automl'

In [24]:
# Reload the saved H2O model from disk; the backend is named explicitly here.
model_h2o = nm.load_model(folder_path=".", filename="automl", backend="h2o")

INFO | normet.normet.backends.h2o_backend | Loaded H2O model from automl


In [25]:
# Display the reloaded model; the summary should match the one shown right after training.
model_h2o

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,144.0,144.0,170038.0,10.0,10.0,10.0,25.0,237.0,89.3125

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,1.9981214,0.0528831,1.9989221,2.0229292,2.068196,1.9735426,1.9270169
mean_residual_deviance,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
mse,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
r2,0.8725205,0.0073285,0.8659208,0.8702673,0.8658085,0.8802241,0.8803817
residual_deviance,8.589263,0.6306002,9.184041,8.478395,9.292726,8.106892,7.8842587
rmse,2.9291673,0.107479,3.0305183,2.9117684,3.0483973,2.8472602,2.8078923
rmsle,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2025-09-17 16:39:13,1.010 sec,0.0,8.2062346,5.4746640,67.3422868
,2025-09-17 16:39:13,1.028 sec,5.0,5.8383121,3.8835318,34.0858885
,2025-09-17 16:39:13,1.046 sec,10.0,4.3678206,2.9034754,19.0778568
,2025-09-17 16:39:13,1.061 sec,15.0,3.4818413,2.3349532,12.1232189
,2025-09-17 16:39:13,1.076 sec,20.0,2.9559543,2.0045670,8.7376659
,2025-09-17 16:39:13,1.091 sec,25.0,2.6157639,1.7935225,6.8422205
,2025-09-17 16:39:13,1.106 sec,30.0,2.3602398,1.6476786,5.5707320
,2025-09-17 16:39:13,1.119 sec,35.0,2.2005216,1.5521328,4.8422954
,2025-09-17 16:39:13,1.135 sec,40.0,2.0714786,1.4724228,4.2910236
,2025-09-17 16:39:13,1.150 sec,45.0,1.9631899,1.3987594,3.8541145

variable,relative_importance,scaled_importance,percentage
blh,378323.125,1.0,0.2597728
date_unix,185788.03125,0.4910829,0.12757
u10,148465.5,0.3924304,0.1019428
d2m,142141.15625,0.3757136,0.0976002
day_julian,111503.5390625,0.294731,0.0765631
sp,105021.5078125,0.2775974,0.0721123
v10,91910.7890625,0.2429426,0.0631099
t2m,65793.9921875,0.1739095,0.045177
weekday,61490.7851562,0.1625351,0.0422222
hour,47388.5429688,0.1252594,0.032539


## 3.3 Model performance (H2O)

In [26]:
# Compute evaluation statistics for the H2O model.
# Note: this reuses the name `modStat`, so the FLAML statistics from section 2.3 are overwritten.
modStat = nm.modStats(df_prep, model_h2o)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%


In [27]:
# Show the H2O statistics table (training, testing, all); testing R2 is ~0.862 in this run.
modStat

Unnamed: 0,n,FAC2,MB,MGE,RMSE,NMB,NMGE,COE,IOA,r,p_level,R2,set
0,4780,0.968992,-0.000336,0.865096,1.168712,-3.7e-05,0.094937,0.841982,0.920991,0.99021,***,0.980516,training
1,1593,0.904403,0.072883,2.072402,2.958598,0.007922,0.225263,0.622552,0.811276,0.928224,***,0.8616,testing
2,6373,0.952852,0.017966,1.166875,1.792331,0.001967,0.127747,0.787039,0.89352,0.975859,***,0.952301,all


## 3.4 Partial dependence (H2O)

In [28]:
# Partial dependence for the H2O model; no var_list is given here.
# This reuses the name `pdp_value`, replacing the FLAML result.
pdp_value = nm.pdp(df_prep, model_h2o)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |█████████████████

In [29]:
# Preview the long-format result: variable, grid value, and the mean/std of the partial dependence.
pdp_value.head()

Unnamed: 0,variable,value,pdp_mean,pdp_std
0,blh,11.407354,18.165007,8.524272
1,blh,156.247874,14.033806,7.425718
2,blh,301.088394,10.045159,5.802106
3,blh,445.928914,8.622272,5.030158
4,blh,590.769434,8.185836,4.807409


In [30]:
# List which variables the partial-dependence table covers.
pdp_value['variable'].unique()

array(['blh', 'date_unix', 'u10', 'd2m', 'day_julian', 'sp', 'v10', 't2m',
       'weekday', 'hour', 'rh2m', 'tcc', 'ssrd', 'tp'], dtype=object)

In [31]:
# Restrict the partial-dependence calculation to selected variables via var_list.
nm.pdp(
    df_prep,
    model_h2o,
    var_list=["blh", "rh2m"],
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%
PartialDependencePlot progress: |████████████████████████████████████████████████| (done) 100%


Unnamed: 0,variable,value,pdp_mean,pdp_std
0,blh,11.407354,18.165007,8.524272
1,blh,156.247874,14.033806,7.425718
2,blh,301.088394,10.045159,5.802106
3,blh,445.928914,8.622272,5.030158
4,blh,590.769434,8.185836,4.807409
5,blh,735.609954,7.702386,4.774138
6,blh,880.450474,7.3961,4.649164
7,blh,1025.290994,7.10744,4.423848
8,blh,1170.131514,7.093472,4.37922
9,blh,1314.972034,6.701338,4.373802
