In [1]:
import pandas as pd
import os
from glob import glob
from datetime import datetime, timedelta
from eruption_forecast import LabelBuilder
from eruption_forecast.utils import construct_windows

In [2]:
eruptions = [
	"2025-03-20",
	"2025-04-22",
	"2025-05-18",
	"2025-06-17",
	"2025-07-07",
	"2025-08-01",
	"2025-08-17",
]

In [3]:
tremor_csv = (
    r"D:\Projects\eruption-forecast\examples\output\VG.OJN.00.EHN\tremor\tremor.csv"
)
training_dir = r"D:\Projects\eruption-forecast\examples\output\VG.OJN.00.EHN\training"
os.makedirs(training_dir, exist_ok=True)

In [4]:
train_start_date = "2025-01-01"
train_end_date = "2025-07-31"
train_start_date_obj = datetime.strptime(train_start_date, "%Y-%m-%d").replace(
    hour=0, minute=0, second=0
)
train_end_date_obj = datetime.strptime(train_end_date, "%Y-%m-%d").replace(
    hour=23, minute=59, second=59
)

forecast_start_date = "2025-08-01"
forecast_end_date = "2025-08-24"
forecast_start_date_obj = datetime.strptime(forecast_start_date, "%Y-%m-%d").replace(
    hour=0, minute=0, second=0
)
forecast_end_date_obj = datetime.strptime(forecast_end_date, "%Y-%m-%d").replace(
    hour=23, minute=59, second=59
)

window_size = 2
window_step = 12
selected_tremor_columns = [
    "dsar_f0-f1",
    'dsar_f1-f2',
    'dsar_f2-f3',
    "dsar_f3-f4",
    "rsam_f0",
    "rsam_f1",
    'rsam_f2',
    'rsam_f3',
    'rsam_f4'
]
overwrite = True

#### Validate date for training and forecast

In [5]:
# Ensuring start date and end date in range
assert train_start_date_obj < train_end_date_obj, ValueError(
    f"Start date for training (train_start_date: {train_start_date}) should be less or equal (<=) than training end date (train_end_date: {train_end_date})"
)
assert forecast_start_date_obj < forecast_end_date_obj, ValueError(
    f"Start date for forecast (forecast_start_date: {forecast_start_date}) should be less or equal (<=) than forecast end date (forecast_end_date: {forecast_end_date})"
)

# Ensuring training date and forecast date in range
assert train_start_date_obj <= forecast_start_date_obj, ValueError(
    f"Start date for training (train_start_date: {train_start_date}) should be less or equal (<=) than start date to forecast (forecast_start_date: {forecast_start_date})"
)
assert train_end_date_obj < forecast_start_date_obj, ValueError(
    f"End date for training (train_end_date: {train_end_date}) should be less or equal (<=) than start date to forecast (forecast_start_date: {forecast_start_date})"
)

#### Build training label

In [6]:
training_label_builder: LabelBuilder = LabelBuilder(
    start_date=train_start_date,
    end_date=train_end_date,
    window_size=2,
    window_step=12,
    window_step_unit="hours",
    day_to_forecast=2,
    eruption_dates=eruptions,
    volcano_id="Lewotobi Laki-laki",
    output_dir=r"D:\Projects\eruption-forecast\examples\output\VG.OJN.00.EHN",
    verbose=True,
).build()

[32m2026-01-22 16:31:58.415[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m111[0m - [1mStart Date (YYYY-MM-DD): 2025-01-01[0m
[32m2026-01-22 16:31:58.416[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m112[0m - [1mEnd Date (YYYY-MM-DD): 2025-07-31[0m
[32m2026-01-22 16:31:58.418[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m113[0m - [1mWindow Size (days): 2[0m
[32m2026-01-22 16:31:58.418[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m114[0m - [1mWindow Step (hours): 12[0m
[32m2026-01-22 16:31:58.419[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m115[0m - [1mDay To Forecast (days): 2[0m
[32m2026-01-22 16:31:58.420[0m | [1mINFO    [0m | [36meruption_forecast.label.label_builder[0m:[36m__init__[0m:[36m116[0m - [1mVolcano ID

In [7]:
training_label_builder.df

Unnamed: 0_level_0,id,is_erupted
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-01-01 00:00:00,0,0
2025-01-01 12:00:00,1,0
2025-01-02 00:00:00,2,0
2025-01-02 12:00:00,3,0
2025-01-03 00:00:00,4,0
...,...,...
2025-07-29 12:00:00,419,0
2025-07-30 00:00:00,420,0
2025-07-30 12:00:00,421,0
2025-07-31 00:00:00,422,0


In [8]:
training_label_csv = training_label_builder.filepath

#### Create features label directory

In [9]:
basename_label = os.path.basename(training_label_csv).split(".csv")[0]

training_label_dir = os.path.join(training_dir, basename_label)
features_matrix_dir = os.path.join(training_label_dir, "features_matrix")

os.makedirs(training_label_dir, exist_ok=True)
os.makedirs(features_matrix_dir, exist_ok=True)

#### Load dataframe

In [10]:
df_tremor = pd.read_csv(tremor_csv, index_col=0, parse_dates=True)
df_tremor = df_tremor[selected_tremor_columns]
df_tremor.sort_index(ascending=True, inplace=True)

#### Validate selected_tremor_columns

In [11]:
for column in selected_tremor_columns:
    assert column in df_tremor.columns, ValueError(f"Column {column} not exists in tremor data from {tremor_csv}")

In [12]:
df_training_label = pd.read_csv(training_label_csv, index_col=0, parse_dates=True)
df_training_label.sort_index(ascending=True, inplace=True)

In [13]:
tremor_start_date_obj = df_tremor.index[0]
tremor_end_date_obj = df_tremor.index[-1]

training_label_start_date_obj = df_training_label.index[0]
training_label_end_date_obj = df_training_label.index[-1]

#### Validate tremor data available for training and forecasting

In [14]:
# Ensure tremor data is within label data range
assert tremor_start_date_obj <= training_label_start_date_obj, ValueError(
    f"Tremor data start date ({tremor_start_date_obj}) should be less than or equal to training start date ({training_label_start_date_obj})"
)
assert tremor_end_date_obj >= training_label_end_date_obj, ValueError(
    f"Tremor data end date ({tremor_end_date_obj}) should be greater than or equal to training end date ({training_label_end_date_obj})"
)

# Ensure tremor data is within forecast data range
assert tremor_start_date_obj <= forecast_start_date_obj, ValueError(
    f"Tremor data start date ({tremor_start_date_obj}) should be less than or equal to forecast start date ({forecast_start_date_obj})"
)
assert tremor_end_date_obj <= forecast_end_date_obj, ValueError(
    f"Tremor data end date ({tremor_end_date_obj}) should be greater than or equal to forecast end date ({forecast_end_date_obj})"
)

#### Omitting first data based on window step

In [16]:
_training_label_start_date_obj = training_label_start_date_obj
for start_date_label, label in df_training_label.iterrows():
    if tremor_start_date_obj == (start_date_label - timedelta(days=window_size)):
        # Update training_label_start_date_obj to the start of the window
        _training_label_start_date_obj = start_date_label
        break

df_training_label_sliced = df_training_label.loc[
    _training_label_start_date_obj:training_label_end_date_obj
].copy()

In [17]:
df_training_label_sliced

Unnamed: 0_level_0,id,is_erupted
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-01-03 00:00:00,4,0
2025-01-03 12:00:00,5,0
2025-01-04 00:00:00,6,0
2025-01-04 12:00:00,7,0
2025-01-05 00:00:00,8,0
...,...,...
2025-07-29 12:00:00,419,0
2025-07-30 00:00:00,420,0
2025-07-30 12:00:00,421,0
2025-07-31 00:00:00,422,0


In [18]:
df_training_label_concat = pd.concat([df_training_label, df_training_label_sliced])
df_training_label_removed = df_training_label_concat[
    ~df_training_label_concat.index.duplicated(keep=False)
].sort_index()

In [19]:
df_training_label_removed

Unnamed: 0_level_0,id,is_erupted
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-01-01 00:00:00,0,0
2025-01-01 12:00:00,1,0
2025-01-02 00:00:00,2,0
2025-01-02 12:00:00,3,0


#### Create training data dir

In [20]:
training_data_dir = os.path.join(training_label_dir, "tremor_data")
os.makedirs(os.path.join(training_data_dir), exist_ok=True)

#### Build windows features

In [21]:
def training_data(column) -> str:
    # Get parameters
    datetime_index = column.name
    column_id = int(column["id"])
    column_eruption = int(column["is_erupted"])

    # Get window start and end dates
    start_date = datetime_index - timedelta(days=window_size)
    end_date = datetime_index - timedelta(milliseconds=1)

    # Set filepath to save tmp features
    # Example filename: 00001_2025-01-01--00-00-00_2025-01-01_00--00-00_eruption-0.csv
    start_date_str = start_date.strftime("%Y-%m-%d--%H-%M-%S")
    end_date_str = end_date.strftime("%Y-%m-%d_%H--%H-%M-%S")
    feature_tmp_filename = (
        f"{column_id:05}_{start_date_str}_{end_date_str}_eruption-{column_eruption}.csv"
    )
    feature_tmp_filepath = os.path.join(training_data_dir, feature_tmp_filename)

    # Skip if file already exists and overwrite is False
    if os.path.isfile(feature_tmp_filepath) and not overwrite:
        return feature_tmp_filepath

    df_tremor_sliced = df_tremor.loc[start_date:end_date, selected_tremor_columns]
    df_tremor_sliced = df_tremor_sliced.sort_index(ascending=True)
    df_tremor_sliced.reset_index(inplace=True)
    df_tremor_sliced["id"] = column_id
    df_tremor_sliced = df_tremor_sliced[["id", "datetime", *selected_tremor_columns]]
    df_tremor_sliced.to_csv(feature_tmp_filepath, index=False)

    return feature_tmp_filepath

In [22]:
# Add training_data csv to label
df_training_label_sliced["tremor_data"] = df_training_label_sliced.apply(
    training_data, axis=1
)

# Update label with tremor data location
training_label_filename = f"{basename_label.replace('label_','training_label_')}.csv"

# This label would be used for feature extraction
df_training_label_sliced.to_csv(
    os.path.join(training_label_dir, training_label_filename), index=True
)

#### Concat csv from featrues_data directory

In [30]:
files = glob(os.path.join(training_data_dir, "*.csv"))
if len(files) == 0:
	raise ValueError(
		f"No tremor data found. Tremor data location: {training_data_dir}. Run build_windows_features() first."
	)

# Concatenate all tremor data
df_matrix = pd.concat([pd.read_csv(file) for file in files])

# Save non interpolated data
training_data_non_interpolated_csv = (
	f"{basename_label.replace('label_','training_data_non_interpolated_')}.csv"
)
training_data_non_interpolated_csv = os.path.join(
	training_label_dir, training_data_non_interpolated_csv
)
df_matrix.to_csv(training_data_non_interpolated_csv, index=False)

# Interpolate nan values
# See: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html
for column in df_matrix.columns:
    if column not in ["id", "datetime"]:
        df_matrix[column] = df_matrix[column].interpolate()

# Save concatenated tremor data. This file would be used for feature extraction
training_data_csv = f"{basename_label.replace('label_','training_data_')}.csv"
training_data_csv = os.path.join(training_label_dir, training_data_csv)
df_matrix.to_csv(training_data_csv, index=False)

#### Build features matrix

In [31]:
df_matrix.head()

Unnamed: 0,id,datetime,dsar_f0-f1,dsar_f1-f2,dsar_f2-f3,dsar_f3-f4,rsam_f0,rsam_f1,rsam_f2,rsam_f3,rsam_f4
0,4,2025-01-01 00:00:00,42.295169,11.567851,2.475762,0.454574,0.578896,0.393133,0.23402,0.20109,0.802178
1,4,2025-01-01 00:10:00,8.339877,10.230095,2.39632,0.585748,0.193408,0.473813,0.318773,0.280802,0.873074
2,4,2025-01-01 00:20:00,9.83761,11.795762,2.379836,0.586542,0.255615,0.487551,0.318261,0.280866,0.874947
3,4,2025-01-01 00:30:00,8.639998,12.336922,2.229046,0.612159,0.248093,0.483001,0.321814,0.317194,0.903412
4,4,2025-01-01 00:40:00,7.958638,11.133125,2.34242,0.569708,0.198425,0.474674,0.312228,0.280739,0.900274


#### Save feature matrix for each method

In [32]:
for column_name in selected_tremor_columns:
    feature_matrix_filename = (
        f"{basename_label.replace('label',column_name)}.csv"
    )
    feature_matrix_filepath = os.path.join(features_matrix_dir, feature_matrix_filename)
    df_feature_matrix = df_matrix[["id", "datetime", column_name]]
    df_feature_matrix.to_csv(feature_matrix_filepath, index=False)

In [33]:
df_feature_matrix

Unnamed: 0,id,datetime,rsam_f4
0,4,2025-01-01 00:00:00,0.802178
1,4,2025-01-01 00:10:00,0.873074
2,4,2025-01-01 00:20:00,0.874947
3,4,2025-01-01 00:30:00,0.903412
4,4,2025-01-01 00:40:00,0.900274
...,...,...,...
283,423,2025-07-31 11:10:00,1.301511
284,423,2025-07-31 11:20:00,1.284949
285,423,2025-07-31 11:30:00,1.297796
286,423,2025-07-31 11:40:00,1.304122
