In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from pypots.utils.random import set_random_seed
from pypots.optim import Adam
from pypots.classification import Raindrop, BRITS, GRUD
from pypots.nn.functional import calc_binary_classification_metrics

# Utility functions

In [None]:
def prepare_df(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    cols = [
        "Temperature_C",
        "Dew_Point_C",
        "Humidity_%",
        "Wind_Speed_kmh",
        "Wind_Gust_kmh",
        "Pressure_hPa",
        "Precip_Rate_mm"
    ]

    # Convert to float
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

    # Convert Wind and Condition to categorical
    df["Wind"] = df["Wind"].astype("category")
    df["Condition"] = df["Condition"].astype("category")

    # join date and time
    df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])
    df = df.set_index("datetime")

    # dtop Date and Time
    df = df.drop(columns=["Date", "Time"])


    # Count duplicates on datetime
    df.index.duplicated().sum()
    # Drop duplicates
    df = df[~df.index.duplicated()]

    return df

In [25]:
def _generate_sequences_from_df(
    df: pd.DataFrame,
    sequence_length: int,
    target_column: str,
    target_keyword: str | None,
    numerical_cols: list[str],
    categorical_cols_encoders: dict[str, OneHotEncoder],
    master_feature_columns: list[str],
    nan_placeholder: str = 'missing_value',
    step_size: int = 1
):
    if df.empty:
        return None

    df_copy = df.copy()

    # 1. Target variable processing
    if target_column not in df_copy.columns:
        print(f"Warning: Target column '{target_column}' not found in DataFrame. Skipping sequence generation for this DF.")
        return None
    
    if target_keyword is not None:
        y_series = df_copy[target_column].astype(str).str.contains(target_keyword, case=False, na=False).astype(int)
    else: # Handle cases where target_keyword is None (e.g., target is already binary)
        if pd.api.types.is_bool_dtype(df_copy[target_column]):
            y_series = df_copy[target_column].astype(int)
        elif pd.api.types.is_numeric_dtype(df_copy[target_column]) and df_copy[target_column].dropna().isin([0, 1]).all():
            y_series = df_copy[target_column].fillna(-1).astype(int) # FillNa if necessary, then convert. -1 for missing y? Or drop?
            y_series = y_series[y_series != -1] # Drop rows where target was NaN
            if y_series.empty and not df_copy[target_column].dropna().empty : # if all targets were NaN
                 print(f"Warning: All values in target column '{target_column}' were NaN after attempting to treat as binary. Skipping.")
                 return None
        else:
            print(f"Error: target_keyword is None, but target_column '{target_column}' is not boolean or binary numeric. Cannot process target.")
            return None
    
    # Adjust df_copy if rows were dropped from y_series due to NaNs in target
    if len(y_series) < len(df_copy):
        df_copy = df_copy.loc[y_series.index]
        if df_copy.empty:
            return None


    # 2. Feature Processing
    processed_features_list = []

    # Numerical features
    present_numerical_cols = [col for col in numerical_cols if col in df_copy.columns]
    if present_numerical_cols:
        processed_features_list.append(df_copy[present_numerical_cols].copy())
    else:
        # If no numerical columns are present but some were expected, create an empty DF with original index
        # This helps pd.concat and reindex later if only categorical features exist.
        if numerical_cols: # only if numerical_cols list was not empty
             processed_features_list.append(pd.DataFrame(index=df_copy.index))


    # Categorical features
    for cat_col_name, encoder in categorical_cols_encoders.items():
        if cat_col_name in df_copy.columns:
            cat_column_data = df_copy[[cat_col_name]].copy()
            cat_column_data[cat_col_name] = cat_column_data[cat_col_name].astype(object).fillna(nan_placeholder)
            
            try:
                transformed_data = encoder.transform(cat_column_data)
                feature_names = encoder.get_feature_names_out([cat_col_name])
                transformed_df = pd.DataFrame(transformed_data, columns=feature_names, index=df_copy.index)
                processed_features_list.append(transformed_df)
            except Exception as e:
                print(f"Error transforming categorical column {cat_col_name} with pre-fitted encoder: {e}")
        else:
            print(f"Warning: Categorical column '{cat_col_name}' (for pre-fitted encoder) not found in current DataFrame.")
            # We still need to account for its feature columns in the master list.
            # Create empty columns for this encoder if it's missing, using its expected feature names.
            try:
                missing_encoder_feature_names = encoder.get_feature_names_out([cat_col_name])
                missing_df = pd.DataFrame(0, index=df_copy.index, columns=missing_encoder_feature_names)
                processed_features_list.append(missing_df)
            except Exception as e:
                print(f"Could not get feature names for missing {cat_col_name} to create placeholders: {e}")


    if not processed_features_list:
        # This case means no numerical cols were selected AND no categorical cols processed.
        # Reindex will handle creating an all-zero feature matrix if master_feature_columns is not empty.
        combined_features_df = pd.DataFrame(index=df_copy.index) 
    else:
        combined_features_df = pd.concat(processed_features_list, axis=1)
    
    aligned_features_df = combined_features_df.reindex(columns=master_feature_columns, fill_value=0.0)
    X_values = aligned_features_df.astype(np.float32).values

    # 3. Sequence Generation
    X_sequences = []
    y_labels_for_sequences = []
    num_rows_X = len(X_values)
    num_rows_y = len(y_series)

    # Ensure X and y have aligned lengths after any potential filtering
    if num_rows_X == 0 or num_rows_y == 0: # No data to form sequences
        return None
    if num_rows_X != num_rows_y: # Should not happen if df_copy.loc[y_series.index] was effective
        print(f"Warning: Mismatch in lengths of X ({num_rows_X}) and y ({num_rows_y}) after processing. Skipping sequences for this DF.")
        return None
    
    if num_rows_X < sequence_length:
        return None

    for i in range(0, num_rows_X - sequence_length + 1, step_size):
        feature_seq = X_values[i : i + sequence_length]
        X_sequences.append(feature_seq)
        # y_series was already aligned with df_copy, which X_values is based on
        label = y_series.iloc[i + sequence_length - 1]
        y_labels_for_sequences.append(label)

    if not X_sequences:
        return None

    return {
        "X": np.array(X_sequences, dtype=np.float32),
        "y": np.array(y_labels_for_sequences, dtype=np.int32)
    }

In [26]:
def prepare_datasets(
    train_paths: list[str],
    test_paths: list[str],
    prepare_df_function: callable,
    sequence_length: int,
    target_column: str,
    target_keyword: str | None,
    numerical_cols: list[str],
    categorical_cols_encoders: dict[str, OneHotEncoder],
    nan_placeholder: str = 'missing_value',
    step_size: int = 1
):
    master_feature_columns = list(numerical_cols)
    
    for cat_col_name in categorical_cols_encoders:
        encoder = categorical_cols_encoders[cat_col_name]
        try:
            master_feature_columns.extend(encoder.get_feature_names_out([cat_col_name]))
        except Exception as e:
            print(f"Error getting feature names from pre-fitted encoder for '{cat_col_name}': {e}. "
                  "Ensure encoders are fitted and can produce feature names. Feature set might be incomplete.")

    if not master_feature_columns:
        print("Warning: Master feature column list is empty. No features defined by numerical_cols or encoders.")
        empty_X_shape = (0, sequence_length, 0) if sequence_length > 0 else (0,0,0)
        empty_result = {"X": np.array([]).reshape(empty_X_shape), "y": np.array([])}
        return empty_result, empty_result
    
    print(f"Master feature columns determined ({len(master_feature_columns)} total). Example: {master_feature_columns[:5]}...")

    def _bulk_process_paths_with_prefitted_encoders(
        file_paths: list[str],
        is_training_data: bool
    ):
        all_X_sequences_list = []
        all_y_labels_list = []
        
        dataset_type = "training" if is_training_data else "testing"
        print(f"\nProcessing {dataset_type} data...")

        for i, path in enumerate(file_paths):
            print(f"  Loading and preparing {dataset_type} file {i+1}/{len(file_paths)}: {path}")
            try:
                df = prepare_df_function(path)
            except Exception as e:
                print(f"    Error calling prepare_df_function for {path}: {e}. Skipping this file.")
                continue

            if df is None or df.empty:
                print(f"    prepare_df_function returned None or empty DataFrame for {path}. Skipping.")
                continue
            
            pypots_dict_single_df = _generate_sequences_from_df(
                df=df,
                sequence_length=sequence_length,
                target_column=target_column,
                target_keyword=target_keyword,
                numerical_cols=numerical_cols,
                categorical_cols_encoders=categorical_cols_encoders,
                master_feature_columns=master_feature_columns,
                nan_placeholder=nan_placeholder,
                step_size=step_size
            )

            if pypots_dict_single_df and pypots_dict_single_df['X'].size > 0:
                all_X_sequences_list.append(pypots_dict_single_df['X'])
                all_y_labels_list.append(pypots_dict_single_df['y'])
            else:
                print(f"    No sequences generated for {path}.")

        if not all_X_sequences_list:
            print(f"No sequences generated for any {dataset_type} files.")
            n_master_features = len(master_feature_columns)
            empty_X_shape = (0, sequence_length, n_master_features) if sequence_length > 0 else (0,0,0)
            return {"X": np.array([]).reshape(empty_X_shape), "y": np.array([])}

        final_X = np.concatenate(all_X_sequences_list, axis=0)
        final_y = np.concatenate(all_y_labels_list, axis=0)
        return {"X": final_X, "y": final_y}

    # Process training data
    train_pypots_data = _bulk_process_paths_with_prefitted_encoders(train_paths, is_training_data=True)
    if train_pypots_data["X"].size > 0:
        print(f"Total training data: X shape {train_pypots_data['X'].shape}, y shape {train_pypots_data['y'].shape}")

    # Process testing data
    test_pypots_data = _bulk_process_paths_with_prefitted_encoders(test_paths, is_training_data=False)
    if test_pypots_data["X"].size > 0:
        print(f"Total testing data: X shape {test_pypots_data['X'].shape}, y shape {test_pypots_data['y'].shape}")
        
    return train_pypots_data, test_pypots_data

In [27]:
SEQUENCE_LENGTH = 12
STEP_SIZE = 1
TARGET_COLUMN = 'Condition'
FOG_KEYWORD = 'fog'
CATEGORICAL_FEATURES = ['Wind'] # List of raw categorical column names
NAN_PLACEHOLDER = 'missing_value' # For categoricals

TRAIN_PATHS = [
    "./datasets/weather/wrocław-EPWR.csv",
    "./datasets/weather/utrecht-IUTRECHT299.csv",
    "./datasets/weather/utrecht-IUTRECHT315.csv",
    "./datasets/weather/utrecht-IDEBIL13.csv"
]

TEST_PATHS = [
    "./datasets/weather/utrecht-EHAM.csv"
]

In [28]:
train_data, test_data = prepare_datasets(
    train_paths=TRAIN_PATHS,
    test_paths=TEST_PATHS,
    prepare_df_function=prepare_df,
    sequence_length=SEQUENCE_LENGTH,
    target_column=TARGET_COLUMN,
    target_keyword=FOG_KEYWORD,
    numerical_cols=["Temperature_C","Dew_Point_C","Humidity_%","Wind_Speed_kmh","Wind_Gust_kmh","Pressure_hPa","Precip_Rate_mm"],
    categorical_cols_encoders={},
    nan_placeholder=NAN_PLACEHOLDER,
    step_size=STEP_SIZE
)

Master feature columns determined (7 total). Example: ['Temperature_C', 'Dew_Point_C', 'Humidity_%', 'Wind_Speed_kmh', 'Wind_Gust_kmh']...

Processing training data...
  Loading and preparing training file 1/4: ./datasets/weather/wrocław-EPWR.csv


  df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])


  Loading and preparing training file 2/4: ./datasets/weather/utrecht-IUTRECHT299.csv


  df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])


  Loading and preparing training file 3/4: ./datasets/weather/utrecht-IUTRECHT315.csv


  df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])


  Loading and preparing training file 4/4: ./datasets/weather/utrecht-IDEBIL13.csv


  df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])


Total training data: X shape (46719, 12, 7), y shape (46719,)

Processing testing data...
  Loading and preparing testing file 1/1: ./datasets/weather/utrecht-EHAM.csv


  df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])


Total testing data: X shape (11687, 12, 7), y shape (11687,)


In [29]:
if train_data and train_data["X"].size > 0:
    print(f"Final Training X shape: {train_data['X'].shape}")
    print(f"Final Training y shape: {train_data['y'].shape}")
    print(f"Training y distribution: {np.bincount(train_data['y'])}")
else:
    print("Training data is empty or could not be generated.")

if test_data and test_data["X"].size > 0:
    print(f"Final Test X shape: {test_data['X'].shape}")
    print(f"Final Test y shape: {test_data['y'].shape}")
    print(f"Test y distribution (if not empty): {np.bincount(test_data['y']) if test_data['y'].size > 0 else 'empty'}")
else:
    print("Test data is empty or could not be generated.")

Final Training X shape: (46719, 12, 7)
Final Training y shape: (46719,)
Training y distribution: [45442  1277]
Final Test X shape: (11687, 12, 7)
Final Test y shape: (11687,)
Test y distribution (if not empty): [11400   287]


# Model training

## Raindrop

In [33]:
raindrop = Raindrop(
    n_steps=train_data['X'].shape[1],
    n_features=train_data['X'].shape[2],
    n_classes=2,
    n_layers=2,
    d_model=train_data['X'].shape[2] * 4,
    d_ffn=256,
    n_heads=2,
    dropout=0.3,
    batch_size=32,
    epochs=10,
    patience=3,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER/raindrop',
    model_saving_strategy='best'
)

2025-05-12 15:07:11 [INFO]: No given device, using default device: cuda
2025-05-12 15:07:11 [INFO]: Model files will be saved to ./runs/classify/WEATHER/raindrop/20250512_T150711
2025-05-12 15:07:11 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER/raindrop/20250512_T150711/tensorboard
2025-05-12 15:07:11 [INFO]: Using customized CrossEntropy as the training loss function.
2025-05-12 15:07:11 [INFO]: Using customized CrossEntropy as the validation metric function.


ImportError: ❌ name 'MessagePassing' is not defined. Note that torch_geometric is missing, please install it with 'pip install torch_geometric torch_scatter torch_sparse' or 'conda install -c pyg pyg pytorch-scatter pytorch-sparse'

## BRITS

In [34]:
brits = BRITS(
    n_steps=train_data['X'].shape[1],
    n_features=train_data['X'].shape[2],
    n_classes=2,
    rnn_hidden_size=256,
    batch_size=32,
    epochs=10,
    patience=3,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER/brits',
    model_saving_strategy='best'
)

2025-05-12 15:12:58 [INFO]: No given device, using default device: cuda
2025-05-12 15:12:58 [INFO]: Model files will be saved to ./runs/classify/WEATHER/brits/20250512_T151258
2025-05-12 15:12:58 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER/brits/20250512_T151258/tensorboard
2025-05-12 15:12:58 [INFO]: Using customized CrossEntropy as the training loss function.
2025-05-12 15:12:58 [INFO]: Using customized CrossEntropy as the validation metric function.
2025-05-12 15:13:00 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 566,212


In [35]:
brits.fit(train_set=train_data, val_set=test_data)
results = brits.predict(test_data)
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, test_data['y'])
print("Testing classification metrics: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
)

2025-05-12 15:22:59 [INFO]: Epoch 001 - training loss (CrossEntropy): 286.7109, validation CrossEntropy: 0.1258
2025-05-12 15:29:50 [INFO]: Epoch 002 - training loss (CrossEntropy): 229.2128, validation CrossEntropy: 0.1219
2025-05-12 15:36:04 [INFO]: Epoch 003 - training loss (CrossEntropy): 183.3828, validation CrossEntropy: 0.1189
2025-05-12 15:42:13 [INFO]: Epoch 004 - training loss (CrossEntropy): 138.8626, validation CrossEntropy: 0.1197
2025-05-12 15:48:23 [INFO]: Epoch 005 - training loss (CrossEntropy): 97.0802, validation CrossEntropy: 0.1215
2025-05-12 15:54:29 [INFO]: Epoch 006 - training loss (CrossEntropy): 68.2444, validation CrossEntropy: 0.1156
2025-05-12 16:00:41 [INFO]: Epoch 007 - training loss (CrossEntropy): 53.2646, validation CrossEntropy: 0.1157
2025-05-12 16:06:59 [INFO]: Epoch 008 - training loss (CrossEntropy): 39.8244, validation CrossEntropy: 0.1166
2025-05-12 16:13:10 [INFO]: Epoch 009 - training loss (CrossEntropy): 34.1167, validation CrossEntropy: 0.11

Testing classification metrics: 
ROC_AUC: 0.5, 
PR_AUC: 0.5122786001540173,
F1: 0.0,
Precision: 0.0,
Recall: 0.0,



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
