In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import os

In [2]:
# Open every file found in the CAMS reanalysis folder, in sorted filename order.
folder = './CAMS Reanalysis AOD Assimilated/'
paths = sorted(os.listdir(folder))
ads = list(map(xr.open_dataset, (os.path.join(folder, name) for name in paths)))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Open every file found in the AOD folder, in sorted filename order.
folder = './AOD nc/'
paths = sorted(os.listdir(folder))
aod = list(map(xr.open_dataset, (os.path.join(folder, name) for name in paths)))

In [4]:
# Open the dataset stored at ./LC nc/LC.nc (presumably land cover — TODO confirm).
lc = xr.open_dataset('./LC nc/LC.nc')

In [5]:
# Give every AOD dataset the same 'latitude' / 'longitude' index names and
# remove its 'spatial_ref' variable.
for i in range(len(aod)):
    # Take the first index whose lower-cased name contains a longitude-like
    # (resp. latitude-like) fragment; raises IndexError when none matches.
    # NOTE(review): the bare 'x' / 'y' fragments match any index name that
    # merely contains that letter — confirm no other index can be picked up.
    lon_dim = [name for name in aod[i].indexes
               if any(tag in name.lower() for tag in ('lon', 'eas', 'wes', 'x'))][0]
    lat_dim = [name for name in aod[i].indexes
               if any(tag in name.lower() for tag in ('lat', 'nor', 'sou', 'y'))][0]
    aod[i] = aod[i].rename({lat_dim: 'latitude', lon_dim: 'longitude'})
    # drop_vars is the non-deprecated replacement for Dataset.drop when removing variables.
    aod[i] = aod[i].drop_vars('spatial_ref')

In [6]:
# Remove the helper names left behind by the coordinate-renaming loop.
del lon_dim, lat_dim

In [7]:
# Merge the CAMS datasets with each other, concatenate the AOD datasets along
# 'time', then merge both results into one Dataset.
# NOTE(review): the name `all` shadows Python's builtin all() until it is deleted.
all = xr.merge([xr.merge(ads),xr.concat(aod, dim='time')])

In [8]:
# Replace the values of every data variable with a float32 copy.
# list(...) takes a snapshot of the variable names before the loop assigns to them.
for var in list(all.data_vars):
    all[var].values = all[var].values.astype(np.float32)

In [9]:
# Display the merged dataset.
all

In [10]:
# Remove the 'duaod550' variable from the merged dataset, then display the result.
# drop_vars is the non-deprecated replacement for Dataset.drop when removing variables.
all = all.drop_vars('duaod550')
all

In [11]:
# Merge the contents of `lc` into the combined dataset, then display the result.
all = xr.merge([all, lc])
all

In [12]:
# NaN-aware summary statistics of the AOD055 variable, one line per statistic.
print('\n'.join(
    f'MAIAC AOD {label}: {stat(all["AOD055"].values):.4f}'
    for label, stat in (
        ('Mean', np.nanmean),
        ('std', np.nanstd),
        ('Max', np.nanmax),
        ('Min', np.nanmin),
    )
))

MAIAC AOD Mean: 0.1733
MAIAC AOD std: 0.1046
MAIAC AOD Max: 4.7905
MAIAC AOD Min: 0.0000


In [13]:
# Remove the per-source dataset lists and the loader temporaries from the namespace.
del ads, aod, paths, folder, i

In [14]:
# Flatten the Dataset into a DataFrame and turn its index levels into ordinary columns.
all_df = all.to_dataframe().reset_index()

In [15]:
# Remove the Dataset name; this also stops shadowing the builtin all().
del all

In [16]:
# Order rows by time first, then latitude, then longitude.
all_df = all_df.sort_values(['time', 'latitude', 'longitude'])

In [17]:
def time_lag_features(in_df, col_name, lag_steps):
    """Add shifted copies of ``col_name`` to ``in_df`` and return the same frame.

    One step corresponds to as many rows as carry the first value returned by
    ``in_df['time'].unique()``.  For each ``k`` in ``1..lag_steps`` two columns
    are written, in this order:

    * ``{col_name}_lag{k}``  – ``col_name`` shifted down by ``k`` steps
    * ``{col_name}_lag-{k}`` – ``col_name`` shifted up by ``k`` steps

    The shift is purely positional.
    NOTE(review): this pairs a row with the same location at another time only
    if every time value has the same number of rows in the same order and the
    frame is sorted by time — confirm against the caller.

    ``in_df`` is modified in place.
    """
    first_time = in_df['time'].unique()[0]
    rows_per_step = int((in_df['time'] == first_time).sum())

    for step in range(1, lag_steps + 1):
        offset = rows_per_step * step
        in_df[col_name + f'_lag{step}'] = in_df[col_name].shift(offset)
        in_df[col_name + f'_lag-{step}'] = in_df[col_name].shift(-offset)

    return in_df

In [18]:
# Add 7 shifted copies of 'aod550' in each direction
# (columns aod550_lag1..aod550_lag7 and aod550_lag-1..aod550_lag-7).
all_df = time_lag_features(all_df, 'aod550', 7)

In [19]:
# Calendar features derived from the 'time' column via the .dt accessor.
all_df['day_of_year'] = all_df['time'].dt.day_of_year
all_df['day_of_week'] = all_df['time'].dt.day_of_week
all_df['month'] = all_df['time'].dt.month
all_df['year'] = all_df['time'].dt.year


In [20]:
# Training table: only rows without any NaN, minus the timestamp and the target.
x = all_df.dropna().drop(['time', 'AOD055'], axis=1)
# Target for exactly those rows. Selecting by x's index labels avoids running
# dropna() over the whole table a second time (labels are unique because they
# come from reset_index()).
y = all_df.loc[x.index, 'AOD055']

In [21]:
import optuna
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#from sklearn.decomposition import PCA
#from sklearn.preprocessing import MaxAbsScaler
import random
import cupy as cp

In [22]:
# Seed NumPy's global random generator.
np.random.seed(42)

# Build the 5 shuffled, seeded folds once so every later evaluation reuses the same partition.
# NOTE(review): rows are shuffled individually, so neighbouring pixels / days can end up on
# both sides of a split — confirm this is the intended validation design.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
splits = list(kf.split(x))  # store all (train_idx, val_idx) pairs

In [23]:
# One (x_train, x_val, y_train, y_val) tuple per split, sliced positionally from x and y.
# NOTE(review): all five folds are held in memory at once — presumably as copies; check the footprint.
folds = [(x.iloc[tr], x.iloc[val], y.iloc[tr], y.iloc[val]) for tr, val in splits]

In [24]:
# The splitter and the raw index pairs are no longer needed once `folds` exists.
del kf, splits

In [25]:
def objective(trial):
    """Score one Optuna trial: mean validation R² of an XGBoost regressor over ``folds``.

    Hyper-parameters are sampled from ``trial``, combined with the fixed GPU
    settings, and one ``XGBRegressor`` is trained per predefined fold.  The
    return value is the average of the per-fold R² scores on the validation
    parts.
    """
    # Searched hyper-parameters; the suggest_* calls keep a fixed order.
    tuned = {
        "lambda": trial.suggest_float("lambda", 1e-4, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-4, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "eta": trial.suggest_float("eta", 1e-4, 1.0, log=True),
        "gamma": trial.suggest_float("gamma", 1e-4, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }
    # Settings that are identical for every trial.
    fixed = {
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "n_jobs": -1,
        "gpu_id": 1,
        "eval_metric": "rmse",
    }
    params = {"booster": "gbtree", **tuned, **fixed}

    fold_r2 = []
    for train_x, val_x, train_y, val_y in folds:
        regressor = xgb.XGBRegressor(**params, random_state=42, verbosity=0)
        regressor.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=False)
        fold_r2.append(r2_score(val_y, regressor.predict(val_x)))

    # Average R² across folds.
    return np.mean(fold_r2)

In [26]:
# Maximise the objective (mean R²). The sampler is seeded so the search is
# repeatable, in line with the fixed seed 42 used for NumPy, KFold and XGBoost;
# TPESampler is the sampler Optuna uses by default for single-objective studies.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-11-08 16:50:54,275] A new study created in memory with name: no-name-79633b0b-2cc8-4253-b639-703c52f1ee5b


In [None]:
# Run 5 trials of the objective.
study.optimize(objective, n_trials=5)

In [None]:
# Parameters of the best trial found by the study.
best_params = study.best_params

In [None]:
# Add the fixed GPU / threading settings on top of the tuned values.
best_params.update(
    tree_method="gpu_hist",      # Enable GPU support
    predictor="gpu_predictor",   # Use GPU for prediction
    n_jobs=-1,                   # Adjust based on your system's CPU cores
    gpu_id=1,
)

In [None]:
# Display the final parameter dictionary.
best_params

{'lambda': 0.00015801646157906176,
 'alpha': 0.0014179715889163214,
 'max_depth': 14,
 'eta': 0.3873724419080007,
 'gamma': 0.00028000487098931885,
 'colsample_bytree': 0.9797000193265941,
 'subsample': 0.5779363058001556,
 'n_estimators': 455,
 'min_child_weight': 1,
 'tree_method': 'gpu_hist',
 'predictor': 'gpu_predictor',
 'n_jobs': -1,
 'gpu_id': 1}

In [27]:
# Hard-coded parameter set; the values equal the dictionary displayed by the search cells above
# (presumably frozen so the cells below can run without repeating the search — confirm).
best_params = {'lambda': 0.00015801646157906176,
 'alpha': 0.0014179715889163214,
 'max_depth': 14,
 'eta': 0.3873724419080007,
 'gamma': 0.00028000487098931885,
 'colsample_bytree': 0.9797000193265941,
 'subsample': 0.5779363058001556,
 'n_estimators': 455,
 'min_child_weight': 1,
 'tree_method': 'gpu_hist',
 'predictor': 'gpu_predictor',
 'n_jobs': -1,
 'gpu_id': 1}

In [28]:
# Re-score the chosen parameters on the predefined folds.
# Each entry of val_losses is [RMSE, MAE, R²] for one fold.
val_losses = []

for x_train, x_val, y_train, y_val in folds:

    # Train the XGBoost model with GPU support
    model = xgb.XGBRegressor(**best_params, verbosity=0, random_state=42)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_val, y_val)],  # Evaluation set
        verbose=False
    )

    # Evaluate the model on the validation set
    preds = model.predict(x_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, while this form works on every version.
    loss = np.sqrt(mean_squared_error(y_val, preds))
    mae = mean_absolute_error(y_val, preds)
    r2 = r2_score(y_val, preds)
    val_losses.append([loss, mae, r2])


In [29]:
def avg_list_elements(list, id):
    """Return the mean of element ``id`` taken from every row of ``list``.

    ``list`` is a sequence of indexable rows and ``id`` the position (or key)
    to read from each row.  An empty ``list`` raises ``ZeroDivisionError``.
    The parameter names shadow the builtins ``list`` and ``id`` inside this
    function only.
    """
    return sum(row[id] for row in list) / len(list)

In [47]:
# Fold-averaged metrics; columns of val_losses are ordered rmse, mae, r2.
print('\n'.join(
    f'avg {label}: {avg_list_elements(val_losses, col):.5f}'
    for col, label in enumerate(('rmse', 'mae', 'r2'))
))

avg rmse: 0.01622
avg mae: 0.01046
avg r2: 0.97594


In [31]:
# Remove the folds and the per-fold temporaries left behind by the cross-validation loop.
del folds, loss, mae, model, preds, r2, x_train, x_val, y_train, y_val

In [32]:
# Refit on the full training table (no validation set) with the chosen parameters.
model = xgb.XGBRegressor(random_state=42, verbosity=0, **best_params)
model.fit(x, y, verbose=False)

In [33]:
# Predict for every row that is not part of the training table x.
# The labels come from Index.drop, which keeps the original row order, so they
# line up with the rows passed to predict() without building a second full copy
# of all_df just to read its index.
pred_aod = pd.Series(
    name='pred_AOD_055',
    data=model.predict(all_df[x.columns].drop(x.index)),
    index=all_df.index.drop(x.index),
)

In [34]:
# Attach the predictions as a new column, aligned on the row index.
new = pd.concat([all_df, pred_aod], axis=1)

In [35]:
# Remove the inputs that have been combined into `new`.
del all_df, x, y, pred_aod

In [36]:
# Where AOD055 is missing, take the model prediction for that row instead.
new.loc[new['AOD055'].isna(), 'AOD055'] = new.loc[new['AOD055'].isna(), 'pred_AOD_055']

In [37]:
# Expose the gap-filled series under its output name.
new['filled_AOD'] = new['AOD055']

In [38]:
# Index by (time, latitude, longitude) so the frame can be converted back to a gridded dataset.
new = new.set_index(['time', 'latitude', 'longitude'])

In [39]:
# Preview the gap-filled column converted to an xarray Dataset.
new['filled_AOD'].to_frame().to_xarray()

In [40]:
# Gap-filled AOD as an xarray Dataset with the single variable 'filled_AOD'.
new_xr = new['filled_AOD'].to_frame().to_xarray()

In [41]:
# Display the gap-filled dataset.
new_xr

In [42]:
# Number of chunks
n = 4

# Get indices to split along the time dimension.
# Dataset.sizes is the supported name -> length mapping; indexing Dataset.dims
# like a mapping is deprecated in recent xarray releases.
time_len = new_xr.sizes["time"]
splits = np.array_split(np.arange(time_len), n)

# Make sure the target folder exists; to_netcdf does not create it.
out_dir = "./MAIAC AOD gap filled nc"
os.makedirs(out_dir, exist_ok=True)

# Save each chunk to a separate file (AOD_1.nc ... AOD_n.nc)
for i, idx in enumerate(splits):
    ds_subset = new_xr.isel(time=idx)
    ds_subset.to_netcdf(f"{out_dir}/AOD_{i+1}.nc")