# Experiment 1: Exogenous Variables
Nachat Jatusripitak

In this notebook, we experiment with adding/removing exogenous variables from the predictor. Exogenous features are defined as features that are not $\Delta \text{PM}_{2.5}$. 

In [None]:
# Import required packages
import xgboost as xgb
import pandas as pd
from sklearn.dummy import DummyRegressor
import src.train_utils as T
import xarray as xr
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', None)

In [None]:
ds = xr.open_dataset('../datasets/base.nc')

# hold out 2022
mask = ds.time.dt.year < 2022
ds_excl_2022 = ds.sel(time=mask)

# create relative humidity feature (better than dew temp alone)
temp_da = ds_excl_2022['temp_t']
dew_t_da = ds_excl_2022['dew_temp_t']

# compute saturation vapor pressure at T and at dew-point
e_s_temp = 6.112 * np.exp((17.67 * temp_da) / (temp_da + 243.5))
e_s_dew  = 6.112 * np.exp((17.67 * dew_t_da) / (dew_t_da + 243.5))

# compute RH (%)
rh = (e_s_dew / e_s_temp * 100).rename("r_humidity_t")

# clip to 0-100
rh = rh.clip(min=0, max=100)

ds_excl_2022["r_humidity_t"] = rh

train_df = ds_excl_2022.to_dataframe().reset_index()
print(train_df)

In [7]:
# Summary stats for each variable
print(ds.mean())
print(ds.median())
print(ds.min())
print(ds.max())

<xarray.Dataset> Size: 44B
Dimensions:          ()
Data variables:
    pm25_t           float32 4B 19.56
    u_wind_t         float32 4B 0.09071
    v_wind_t         float32 4B 0.3834
    dew_temp_t       float32 4B 291.7
    temp_t           float32 4B 296.9
    surf_pressure_t  float32 4B 9.382e+04
    precip_t         float32 4B 0.004529
    frp_t            float32 4B 1.671
    elevation_t      float32 4B 646.9
    delta_pm25_t+1   float32 4B 0.04252
    delta_pm25_t     float32 4B 0.04365


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


<xarray.Dataset> Size: 88B
Dimensions:          ()
Data variables:
    pm25_t           float64 8B 12.0
    u_wind_t         float64 8B 0.05863
    v_wind_t         float64 8B 0.3674
    dew_temp_t       float64 8B 293.0
    temp_t           float64 8B 297.0
    surf_pressure_t  float64 8B 9.385e+04
    precip_t         float64 8B 0.0008532
    frp_t            float64 8B 0.0
    elevation_t      float64 8B 638.0
    delta_pm25_t+1   float64 8B 0.01073
    delta_pm25_t     float64 8B 0.01169
<xarray.Dataset> Size: 44B
Dimensions:          ()
Data variables:
    pm25_t           float32 4B 4.268
    u_wind_t         float32 4B -4.001
    v_wind_t         float32 4B -3.18
    dew_temp_t       float32 4B 270.4
    temp_t           float32 4B 279.2
    surf_pressure_t  float32 4B 8.374e+04
    precip_t         float32 4B 0.0
    frp_t            float32 4B 0.0
    elevation_t      float32 4B 9.0
    delta_pm25_t+1   float32 4B -213.1
    delta_pm25_t     float32 4B -213.1
<xarray.Dataset> 

## Persistence baseline
Just to see what kind of performance we should expect

In [3]:
feature_experiments = [
    ('persistence', []),
]

model=DummyRegressor(strategy='constant', constant=0)

results = T.run_experiments(
    df=train_df, 
    model=model, 
    feature_experiments=feature_experiments, 
    train_days=365*2,
    gap_days=7*3,
    val_days=7*7
)

print(results)

  0%|          | 0/1 [00:00<?, ?it/s]

Running experiment: persistence


  0%|          | 0/10 [00:00<?, ?it/s]

    experiment  n_features  mean_rmse  mean_mae  rmse_fold_1  rmse_fold_2  rmse_fold_3  rmse_fold_4  rmse_fold_5  rmse_fold_6  rmse_fold_7  rmse_fold_8  rmse_fold_9  rmse_fold_10
0  persistence           0   4.770767   3.12517     1.783652     6.286166     5.223866     9.354229    14.504134     2.214257     1.003456     1.061782     1.985275      4.290848


### Run 1: add exogenous variables one at a time

In [None]:
base = ['delta_pm25_t']

# hyperparameters found via trial-and-error
params = {
    'max_depth': 3,            
    'learning_rate': 0.1,     
    'n_estimators': 150,     
    'subsample': 0.8,          
    'colsample_bytree': 0.8,   
    'objective': 'reg:pseudohubererror' 
}

feature_experiments = [
    ('base', base),
    ('base + pm25_t', base + ['pm25_t']),
    ('base + u-v_wind', base + ['u_wind_t', 'v_wind_t']),
    ('base + dew_temp_t', base + ['dew_temp_t']),
    ('base + temp_t', base + ['temp_t']),
    ('base + surf_pressure_t', base + ['surf_pressure_t']),
    ('base + precip_t', base + ['precip_t']),
    ('base + frp_t', base + ['frp_t']),
    ('base + elevation_t', base + ['elevation_t']),
    ('base + r_humidity_t', base + ['r_humidity_t'])
]

model=xgb.XGBRegressor(**params, random_state=191)

results = T.run_experiments(
    df=train_df, 
    model=model, 
    feature_experiments=feature_experiments, 
    train_days=365*2,
    gap_days=7*3,
    val_days=7*7
)

print(results.sort_values('mean_rmse'))

  0%|          | 0/10 [00:00<?, ?it/s]

Running experiment: base


  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Run 2: do row/col indices improve performance or not?

In [None]:
base = ['delta_pm25_t', 'pm25_t', 'u_wind_t', 'v_wind_t',
       'dew_temp_t', 'temp_t', 'surf_pressure_t', 'precip_t',
       'frp_t', 'elevation_t', 'r_humidity_t']

params = {
    'max_depth': 4,            
    'learning_rate': 0.1,     
    'n_estimators': 150,     
    'subsample': 0.8,          
    'colsample_bytree': 0.8,
    'objective': 'reg:pseudohubererror'    
}

feature_experiments = [
    ('without i/j', base),
    ('with i/j', base + ['i', 'j']),
]

model=xgb.XGBRegressor(**params, random_state=191)

results = T.run_experiments(
    df=train_df, 
    model=model, 
    feature_experiments=feature_experiments, 
    train_days=365*2,
    gap_days=7*3,
    val_days=7*7
)

print(results.sort_values('mean_rmse'))

  0%|          | 0/2 [00:00<?, ?it/s]

Running experiment: without i/j


  0%|          | 0/10 [00:00<?, ?it/s]

Running experiment: with i/j


  0%|          | 0/10 [00:00<?, ?it/s]

    experiment  n_features  mean_rmse  mean_mae  rmse_fold_1  rmse_fold_2  rmse_fold_3  rmse_fold_4  rmse_fold_5  rmse_fold_6  rmse_fold_7  rmse_fold_8  rmse_fold_9  rmse_fold_10
0  without i/j          11   4.361622  2.964671     1.675788     4.937968     5.101307     8.723355    13.218467     2.039777     0.916728     1.001423     1.863846      4.137558
1     with i/j          13   4.377839  2.971601     1.666631     4.869468     5.087079     8.681473    13.541274     2.041027     0.913193     0.998304     1.860031      4.119911


In [None]:
# Export dataset with new features

temp_da = ds['temp_t']
dew_t_da = ds['dew_temp_t']

e_s_temp = 6.112 * np.exp((17.67 * temp_da) / (temp_da + 243.5))
e_s_dew  = 6.112 * np.exp((17.67 * dew_t_da) / (dew_t_da + 243.5))

rh = (e_s_dew / e_s_temp * 100).rename("r_humidity_t")

rh = rh.clip(min=0, max=100)

ds["r_humidity_t"] = rh

ds.to_netcdf(
    "exp_1_ds.nc",
    format="NETCDF4",       
    engine="netcdf4",      
    encoding={
        var: {
            "zlib": True,
            "complevel": 4,
        }
        for var in ds.data_vars
    }
)