In [1]:
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder 
from pypots.utils.random import set_random_seed
from pypots.optim import Adam
from pypots.classification import Raindrop, BRITS, GRUD
from pypots.nn.functional import calc_binary_classification_metrics
import torch
import torch.nn as nn
import torch.nn.functional as F


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



# Prepare df

In [4]:
# stations = '249','323', '377'
stations = '323',
test_station = '215'

In [3]:
def _convert_vv_to_meters(vv_code):
    if pd.isna(vv_code):
        return np.nan
    
    vv_code = int(vv_code)

    if 0 <= vv_code <= 49:
        return vv_code * 100 + 50
    elif vv_code == 50:
        return 5500
    elif 51 <= vv_code <= 55:
        return np.nan
    elif 56 <= vv_code <= 79:
        return int((vv_code - 56 + 6.5) * 1000)
    elif vv_code == 80:
        return 32500
    elif 81 <= vv_code <= 88:
        return int(32500 + (vv_code - 81) * 5000)
    elif vv_code == 89:
        return 70000
    else:
        return np.nan
    
def _get_valid_vv_codes() -> list[int]:
    valid_codes = list(range(0, 51))
    valid_codes += list(range(56, 90))
    return valid_codes

def get_vv_one_hot_encoder() -> OneHotEncoder:
    valid_codes = _get_valid_vv_codes()
    categories = [np.array(valid_codes, dtype=np.int32)]
    encoder = OneHotEncoder(categories=categories, handle_unknown='ignore', dtype=np.float32, sparse_output=False)
    encoder.fit(categories[0].reshape(-1, 1))
    return encoder

def prepare_df(path: str) -> pd.DataFrame:
    try:
        header_line_index = -1
        column_names = []
        data_lines_start_index = -1

        # Find the header and its index more efficiently
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                if line.strip().startswith('# STN,YYYYMMDD,'):
                    header_line_index = i
                    column_names = [col.strip() for col in line.strip().lstrip('#').split(',')]
                    data_lines_start_index = header_line_index + 1
                    break
        
        if header_line_index == -1:
            raise ValueError("Header line not found.")

        # Use pandas.read_csv directly with skiprows and comment character
        # This avoids reading the whole file into a list first for data lines
        # and then joining them back.
        df = pd.read_csv(
            path,
            names=column_names,
            skiprows=data_lines_start_index,
            comment='#',  # Lines starting with '#' will be ignored as comments
            skipinitialspace=True,
            na_values=['       ', '     '] # Add other common missing value representations if needed
        )

        if df.empty:
            raise ValueError("No data found after the header or all data was commented out.")

        # Convert 'HH' to string and zfill, then create 'Timestamp'
        # It's crucial to handle potential NaN values in 'YYYYMMDD' or 'HH'
        # if they are not guaranteed to be present or valid in all rows.
        df['HH'] = df['HH'].astype(int) - 1
        df['HH'] = df['HH'].astype(str).str.zfill(2)
        df['Timestamp'] = pd.to_datetime(df['YYYYMMDD'].astype(str) + df['HH'].astype(str), format="%Y%m%d%H", errors='coerce')
        
        df.set_index('Timestamp', inplace=True)
        
        # Columns to drop
        cols_to_drop = ['YYYYMMDD', 'HH']
        df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

        # Convert remaining columns to numeric, efficiently
        # Identify numeric columns once and convert
        # Exclude already processed or known non-numeric columns if necessary
        for col in df.columns:
            # This check is slightly redundant if YYYYMMDD and HH are already dropped,
            # but good for safety if they weren't or if other non-numeric columns exist.
            if df[col].dtype == 'object': # Only attempt conversion if the column is of object type
                try:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                except ValueError:
                    # Handle or log cases where a column expected to be numeric isn't
                    # For now, we'll coerce, which turns unparseable into NaT/NaN
                    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='signed')
        df['VV_m'] = df['VV'].apply(_convert_vv_to_meters)
        return df

    except FileNotFoundError:
        print(f"Error: The file '{path}' was not found.")
        raise
    except ValueError as ve:
        print(f"ValueError: {ve}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise

In [5]:
dfs = []
for station in stations:
    df = prepare_df(f"./datasets/knmi_station_data/{station}.txt")
    df = df.set_index('STN', append=True)
    # Check whether VV column has any noy nulls
    nulls = df['VV'].isna().sum()
    dfs.append(df)

train_df = pd.concat(dfs)
train_df = df.reset_index()
train_df.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,N,U,WW,IX,M,R,S,O,Y,VV_m
0,2011-01-01 00:00:00,323,250.0,30.0,40.0,50.0,40.0,,39.0,0.0,...,7.0,99.0,32.0,7,1.0,0.0,0.0,0.0,0.0,150.0
1,2011-01-01 01:00:00,323,270.0,40.0,40.0,50.0,41.0,,40.0,0.0,...,8.0,99.0,20.0,7,1.0,0.0,0.0,0.0,0.0,2550.0
2,2011-01-01 02:00:00,323,260.0,30.0,30.0,40.0,40.0,,39.0,0.0,...,8.0,99.0,10.0,7,0.0,0.0,0.0,0.0,0.0,6500.0
3,2011-01-01 03:00:00,323,250.0,30.0,30.0,50.0,40.0,,39.0,0.0,...,8.0,99.0,10.0,7,0.0,0.0,0.0,0.0,0.0,5500.0
4,2011-01-01 04:00:00,323,250.0,30.0,40.0,50.0,40.0,,39.0,0.0,...,8.0,99.0,22.0,7,0.0,1.0,0.0,0.0,0.0,4350.0


In [6]:
dfs = []
for station in [test_station]:
    df = prepare_df(f"./datasets/knmi_station_data/{station}.txt")
    df = df.set_index('STN', append=True)
    # Check whether VV column has any noy nulls
    nulls = df['VV'].isna().sum()
    dfs.append(df)

test_df = pd.concat(dfs)
test_df = df.reset_index()
test_df.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,N,U,WW,IX,M,R,S,O,Y,VV_m
0,2015-01-01 00:00:00,215,210.0,50.0,50.0,70.0,27,,8,0,...,0.0,87,10.0,7,0.0,0.0,0.0,0.0,0.0,4250.0
1,2015-01-01 01:00:00,215,220.0,50.0,50.0,70.0,26,,4,0,...,0.0,85,10.0,7,0.0,0.0,0.0,0.0,0.0,7500.0
2,2015-01-01 02:00:00,215,200.0,50.0,40.0,80.0,23,,2,0,...,0.0,86,,5,0.0,0.0,0.0,0.0,0.0,10500.0
3,2015-01-01 03:00:00,215,210.0,40.0,40.0,70.0,21,,1,0,...,0.0,87,,5,0.0,0.0,0.0,0.0,0.0,10500.0
4,2015-01-01 04:00:00,215,190.0,50.0,50.0,80.0,19,,2,0,...,1.0,88,,5,0.0,0.0,0.0,0.0,0.0,10500.0


In [13]:
# Helper function for core processing and sequence creation
def _process_and_create_sequences_internal(
        df_segment: pd.DataFrame,
        target_column_name: str,
        numerical_features_cols: list[str],
        categorical_features_cols: dict[str, OneHotEncoder],
        prev_time_steps: int,
        timestamp_column: str = 'Timestamp',
        expected_time_interval: pd.Timedelta = pd.Timedelta(hours=1)
) -> dict[str, np.ndarray]:

    # This df_segment is assumed to have target_column_name NaNs already handled if desired.
    
    processed_feature_dfs_list = []

    # 1. Process Numerical Features
    if numerical_features_cols:
        valid_numerical_cols = [col for col in numerical_features_cols if col in df_segment.columns]
        if valid_numerical_cols: # Only proceed if there are valid numerical columns to select
            numerical_df = df_segment[valid_numerical_cols].copy()
            processed_feature_dfs_list.append(numerical_df)
        elif not valid_numerical_cols and numerical_features_cols: # Specified but none found
            print(f"Warning (internal): None of the specified numerical features found in the current data segment. Numerical features count: {len(numerical_features_cols)}")


    # 2. Process Categorical Features
    for col_name, encoder in categorical_features_cols.items():
        if col_name not in df_segment.columns:
            # This happens if df_segment is empty or the column was dropped.
            # The impact on num_actual_features will be handled later.
            continue

        column_to_encode = df_segment[[col_name]]
        encoded_data_sparse = np.array([]) # Default empty
        
        if column_to_encode.empty: # Encoder might not handle empty DataFrame input well for transform if it expects rows
             try: # Get feature names to create an empty DataFrame with correct OHE columns
                ohe_feature_names = encoder.get_feature_names_out([col_name])
             except AttributeError:
                ohe_feature_names = encoder.get_feature_names([col_name])
             except Exception:
                ohe_feature_names = [f"{col_name}_cat{i}" for i in range(len(encoder.categories_[0]))] if hasattr(encoder, 'categories_') else [f"{col_name}_unknown_cat"]
             
             # Create an empty DataFrame (0 rows) with the OHE column names
             ohe_df = pd.DataFrame(columns=ohe_feature_names, index=df_segment.index, dtype=float)
             processed_feature_dfs_list.append(ohe_df)
             continue # Go to next categorical column

        # If not empty, proceed with transform
        encoded_data_sparse = encoder.transform(column_to_encode)
        
        try:
            ohe_feature_names = encoder.get_feature_names_out([col_name])
        except AttributeError:
            try:
                ohe_feature_names = encoder.get_feature_names([col_name])
            except TypeError:
                ohe_feature_names = [f"{col_name}_{category}" for category in encoder.categories_[0]]
            except Exception as e_fn: # Broad exception for get_feature_names issues
                print(f"Warning (internal): Could not reliably get OHE feature names for '{col_name}'. Using generic names. Error: {e_fn}")
                num_output_features = encoded_data_sparse.shape[1]
                ohe_feature_names = [f"{col_name}_ohe_{i}" for i in range(num_output_features)]

        if hasattr(encoded_data_sparse, "toarray"):
            encoded_data_dense = encoded_data_sparse.toarray()
        else:
            encoded_data_dense = encoded_data_sparse
        
        if encoded_data_dense.shape[0] > 0 : # Ensure data was actually produced
            ohe_df = pd.DataFrame(encoded_data_dense, columns=ohe_feature_names, index=df_segment.index)
            processed_feature_dfs_list.append(ohe_df)
        elif df_segment.shape[0] > 0 : # Input segment had rows, but OHE produced no rows (should not happen with sklearn)
             print(f"Warning (internal): OHE for '{col_name}' produced 0 rows from a non-empty segment. Check encoder.")


    # 3. Combine all processed feature DataFrames
    num_actual_features = 0
    if not processed_feature_dfs_list:
        final_features_df = pd.DataFrame(index=df_segment.index) # 0 columns
        if numerical_features_cols or categorical_features_cols:
             print("Warning (internal): No features were processed into final_features_df despite specification. X will have 0 features for this segment.")
    else:
        final_features_df = pd.concat(processed_feature_dfs_list, axis=1)
    num_actual_features = final_features_df.shape[1]

    # 4. Fill missing values
    final_features_df[timestamp_column] = df_segment[timestamp_column]
    final_features_df["target"] = df_segment[target_column_name]
    final_features_df = final_features_df.sort_values(timestamp_column).reset_index(drop=True)

    features_only = final_features_df.drop(columns=["target"])
    feature_cols = features_only.columns.difference([timestamp_column])
    feature_lookup = features_only.set_index(timestamp_column)

    x_list = []
    y_list = []

    for i in range(prev_time_steps, len(final_features_df)):
        current_time = final_features_df.loc[i, timestamp_column]
        expected_times = [current_time - j * expected_time_interval for j in range(prev_time_steps, 0, -1)]

        sequence_rows = []
        for ts in expected_times:
            if ts in feature_lookup.index:
                row = feature_lookup.loc[ts][feature_cols].values
            else:
                row = np.full(len(feature_cols), np.nan)
            sequence_rows.append(row)
        
        feature_sequence = np.vstack(sequence_rows)
        x_list.append(feature_sequence)
        target_value = final_features_df.loc[i, "target"]
        y_list.append(target_value)

    num_actual_features = len(feature_cols)
    if not x_list:
        return {
            'X': np.array([]).reshape(0, prev_time_steps, num_actual_features),
            'y': np.array([])
        }
    
    return {
        'X': np.array(x_list),
        'y': np.array(y_list)
    }

    # --- Sequence Creation ---
    # X_list = []
    # y_list = []
    # num_rows_in_features = len(final_features_df)

    # if prev_time_steps >= num_rows_in_features :
    #     return {'X': np.array([]).reshape(0, prev_time_steps, num_actual_features), 'y': np.array([])}

    # y_series_segment = df_segment[target_column_name]



    # for i in range(prev_time_steps, num_rows_in_features):
    #     feature_sequence = final_features_df.iloc[i - prev_time_steps : i].values
    #     X_list.append(feature_sequence)
    #     target_value = y_series_segment.iloc[i]
    #     y_list.append(target_value)

    # if not X_list:
    #     return {'X': np.array([]).reshape(0, prev_time_steps, num_actual_features), 'y': np.array([])}
        
    # X_np = np.array(X_list)
    # y_np = np.array(y_list)
    
    # return {'X': X_np, 'y': y_np}

# Helper for balancing data (undersampling)
def _balance_data_helper(X_in: np.ndarray, y_in: np.ndarray, random_seed: int | None = None, max_other_ratio: float = 2) -> tuple[np.ndarray, np.ndarray]:
    if X_in.shape[0] == 0: # No data to balance
        return X_in, y_in

    unique_classes, counts = np.unique(y_in, return_counts=True)
    
    if len(unique_classes) <= 1: # Already balanced or only one class
        return X_in, y_in

    min_count = np.min(counts)
    
    balanced_indices_list = []
    rng = np.random.RandomState(random_seed) # For reproducible undersampling if seed is provided

    for cls_val in unique_classes:
        cls_indices = np.where(y_in == cls_val)[0]
        cls_size = int(min_count * max_other_ratio)
        cls_size = min(len(cls_indices), cls_size)
        if len(cls_indices) > min_count:
            chosen_indices = rng.choice(cls_indices, size=cls_size, replace=False)
        else:
            chosen_indices = cls_indices # Take all if it's already the min count or less
        balanced_indices_list.extend(chosen_indices)
    
    # Shuffle the combined indices from different classes
    shuffled_balanced_indices = rng.permutation(balanced_indices_list)
    
    return X_in[shuffled_balanced_indices], y_in[shuffled_balanced_indices]


# Main Function
def create_sequences_for_classification(
        df: pd.DataFrame,
        target_column_name: str,
        numerical_features_cols: list[str],
        categorical_features_cols: dict[str, OneHotEncoder], # {col_name: fitted_encoder}
        prev_time_steps: int = 8,
        split_date: str | pd.Timestamp | None = None,
        balance_data: bool = False,
        balance_random_seed: int | None = None, # Seed for balancing reproducibility
        timestamp_column: str = 'Timestamp',
        expected_time_interval: pd.Timedelta = pd.Timedelta(hours=1),
        max_other_ratio: float = 2
) -> dict[str, np.ndarray] | tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:

    if target_column_name not in df.columns: # Check on original df
        raise ValueError(f"Target column '{target_column_name}' not found in DataFrame.")
    
    df_features = df.copy() # Use the name from your stub
    # --- User requested modification: remove rows where target is NaN ---
    df_features = df_features[~df_features[target_column_name].isna()]
    # --- End of modification ---

    if df_features.empty:
        print(f"DataFrame is empty after removing NaNs in target column '{target_column_name}'.")
        # The _process_and_create_sequences_internal can handle an empty df_features
        # and return correctly shaped empty arrays.
        empty_sequences = _process_and_create_sequences_internal(
            df_features, target_column_name, numerical_features_cols,
            categorical_features_cols, prev_time_steps
        )
        if split_date:
            return empty_sequences, empty_sequences.copy() # Return two empty dicts
        else:
            return empty_sequences


    # Basic Validations for other parameters
    if not isinstance(prev_time_steps, int) or prev_time_steps <= 0:
        raise ValueError("'prev_time_steps' must be a positive integer.")
    for col in numerical_features_cols:
        if col not in df_features.columns: # Check on df_features as it's the one being processed
            print(f"Warning: Numerical feature column '{col}' not found in (cleaned) DataFrame. It will be ignored if missing.")
    for col in categorical_features_cols.keys():
        if col not in df_features.columns:
            print(f"Warning: Categorical feature column '{col}' not found in (cleaned) DataFrame. It will be ignored if missing.")
    
    if split_date:
        split_date_ts = pd.to_datetime(split_date) # Converts str or pd.Timestamp

        train_df = df_features[df_features[timestamp_column] <= split_date_ts].copy() # Use .copy() to avoid SettingWithCopyWarning later
        test_df = df_features[df_features[timestamp_column] > split_date_ts].copy()

        print(f"Original df_features shape: {df_features.shape}")
        print(f"Train data shape (after target NaN drop, before sequence creation): {train_df.shape}")
        print(f"Test data shape (after target NaN drop, before sequence creation): {test_df.shape}")

        if train_df.empty:
            print("Warning: Train DataFrame is empty after split.")
        if test_df.empty:
            print("Warning: Test DataFrame is empty after split.")

        train_sequences = _process_and_create_sequences_internal(
            train_df, target_column_name, numerical_features_cols,
            categorical_features_cols, prev_time_steps, timestamp_column, expected_time_interval
        )
        test_sequences = _process_and_create_sequences_internal(
            test_df, target_column_name, numerical_features_cols,
            categorical_features_cols, prev_time_steps, timestamp_column, expected_time_interval
        )

        if balance_data and train_sequences['X'].shape[0] > 0:
            print(f"Balancing training data... Original counts: {dict(zip(*np.unique(train_sequences['y'], return_counts=True)))}")
            train_sequences['X'], train_sequences['y'] = _balance_data_helper(
                train_sequences['X'], train_sequences['y'], random_seed=balance_random_seed
            )
            print(f"Balanced training data shapes: X={train_sequences['X'].shape}, y={train_sequences['y'].shape}. New counts: {dict(zip(*np.unique(train_sequences['y'], return_counts=True)))}")
        if balance_data and test_sequences['X'].shape[0] > 0:
            test_sequences['X'], test_sequences['y'] = _balance_data_helper(
                test_sequences['X'], test_sequences['y'], random_seed=balance_random_seed
            )
        return train_sequences, test_sequences
    else:
        # No split, process the whole df_features
        all_sequences = _process_and_create_sequences_internal(
            df_features, target_column_name, numerical_features_cols,
            categorical_features_cols, prev_time_steps
        )
        
        if balance_data and all_sequences['X'].shape[0] > 0:
            print(f"Balancing all data... Original counts: {dict(zip(*np.unique(all_sequences['y'], return_counts=True)))}")
            all_sequences['X'], all_sequences['y'] = _balance_data_helper(
                all_sequences['X'], all_sequences['y'], random_seed=balance_random_seed
            )
            print(f"Balanced data shapes: X={all_sequences['X'].shape}, y={all_sequences['y'].shape}. New counts: {dict(zip(*np.unique(all_sequences['y'], return_counts=True)))}")

        return all_sequences

In [8]:
SEQUENCE_LENGTH = 12
STEP_SIZE = 1
TARGET_COLUMN = 'VV'
NUMERICAL_COLS = [
    "FH", "FF", "FX", "T", "T10N", "TD", "SQ", "Q", "DR", "RH", "P", "U", 
    # "DD"
]
CATEGORICAL_COLS = {
    # "WW", "IX", "VV"
}

vv_encoder = get_vv_one_hot_encoder()


In [14]:
train_data, test_data = create_sequences_for_classification(
    df=train_df, 
    target_column_name=TARGET_COLUMN, 
    numerical_features_cols=NUMERICAL_COLS, 
    categorical_features_cols=CATEGORICAL_COLS, 
    prev_time_steps=SEQUENCE_LENGTH,
    balance_data=True,
    balance_random_seed=42,
    # split_date=pd.Timestamp(year=2022, month=1, day=1)
    split_date=pd.Timestamp(year=2013, month=1, day=1),
    max_other_ratio=2
)

Original df_features shape: (10261, 25)
Train data shape (after target NaN drop, before sequence creation): (6748, 25)
Test data shape (after target NaN drop, before sequence creation): (3513, 25)
Balancing training data... Original counts: {np.float64(0.0): np.int64(5), np.float64(1.0): np.int64(14), np.float64(2.0): np.int64(25), np.float64(3.0): np.int64(18), np.float64(4.0): np.int64(10), np.float64(5.0): np.int64(9), np.float64(6.0): np.int64(4), np.float64(7.0): np.int64(9), np.float64(8.0): np.int64(9), np.float64(9.0): np.int64(10), np.float64(10.0): np.int64(10), np.float64(11.0): np.int64(12), np.float64(12.0): np.int64(15), np.float64(13.0): np.int64(15), np.float64(14.0): np.int64(12), np.float64(15.0): np.int64(15), np.float64(16.0): np.int64(12), np.float64(17.0): np.int64(19), np.float64(18.0): np.int64(26), np.float64(19.0): np.int64(24), np.float64(20.0): np.int64(22), np.float64(21.0): np.int64(32), np.float64(22.0): np.int64(30), np.float64(23.0): np.int64(21), np.fl

In [15]:
if train_data and train_data["X"].size > 0:
    print(f"Final Training X shape: {train_data['X'].shape}")
    print(f"Final Training y shape: {train_data['y'].shape}")
    y_int = train_data['y']
    print(f"Training y distribution: {np.unique(y_int, equal_nan=True, return_counts=True)}")
else:
    print("Training data is empty or could not be generated.")

if test_data and test_data["X"].size > 0:
    print(f"Final Test X shape: {test_data['X'].shape}")
    print(f"Final Test y shape: {test_data['y'].shape}")
    y_int = test_data['y']
    print(f"Test y distribution: {np.unique(y_int, equal_nan=True, return_counts=True)}")
else:
    print("Test data is empty or could not be generated.")

Final Training X shape: (625, 12, 12)
Final Training y shape: (625,)
Training y distribution: (array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 56.,
       57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
       70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80., 81., 82.,
       83.]), array([5, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]))
Final Test X shape: (157, 12, 12)
Final Test y shape: (157,)
Test y distribution: (array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 

In [70]:
val_data = create_sequences_for_classification(
    df=test_df, 
    target_column_name=TARGET_COLUMN, 
    numerical_features_cols=NUMERICAL_COLS, 
    categorical_features_cols=CATEGORICAL_COLS, 
    prev_time_steps=SEQUENCE_LENGTH,
    balance_data=True,
    balance_random_seed=42,
    # split_date=pd.Timestamp(year=2022, month=1, day=1)
)

Balancing all data... Original counts: {np.float64(0.0): np.int64(109), np.float64(1.0): np.int64(411), np.float64(2.0): np.int64(264), np.float64(3.0): np.int64(163), np.float64(4.0): np.int64(88), np.float64(5.0): np.int64(65), np.float64(6.0): np.int64(54), np.float64(7.0): np.int64(61), np.float64(8.0): np.int64(49), np.float64(9.0): np.int64(42), np.float64(10.0): np.int64(40), np.float64(11.0): np.int64(60), np.float64(12.0): np.int64(55), np.float64(13.0): np.int64(54), np.float64(14.0): np.int64(48), np.float64(15.0): np.int64(51), np.float64(16.0): np.int64(60), np.float64(17.0): np.int64(57), np.float64(18.0): np.int64(59), np.float64(19.0): np.int64(66), np.float64(20.0): np.int64(64), np.float64(21.0): np.int64(76), np.float64(22.0): np.int64(73), np.float64(23.0): np.int64(72), np.float64(24.0): np.int64(96), np.float64(25.0): np.int64(73), np.float64(26.0): np.int64(64), np.float64(27.0): np.int64(72), np.float64(28.0): np.int64(92), np.float64(29.0): np.int64(88), np.flo

In [78]:
vv_encoder.transform([[0]]).shape

(1, 85)

# Model training

## Raindrop

In [22]:
raindrop = Raindrop(
    n_steps=train_data['X'].shape[1],
    n_features=train_data['X'].shape[2],
    # n_classes=vv_encoder.transform([[0]]).shape[1],
    n_classes=2,
    n_layers=2,
    d_model=train_data['X'].shape[2] * 4,
    d_ffn=256,
    n_heads=2,
    dropout=0.3,
    batch_size=32,
    epochs=20,
    patience=3,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best',
)

2025-06-03 20:53:34 [INFO]: No given device, using default device: cuda
2025-06-03 20:53:34 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250603_T205334
2025-06-03 20:53:34 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250603_T205334/tensorboard
2025-06-03 20:53:34 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-03 20:53:34 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [82]:
torch.nn.functional.cross_entropy(
    torch.tensor([[1.0, 0, 0], [0, 1, 0]], dtype=torch.float64), 
    torch.tensor([1,2])
)

tensor(1.5514, dtype=torch.float64)

In [18]:
raindrop.fit(train_set=train_data, val_set=test_data)
results = raindrop.predict(test_data)
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, test_data['y'])
print("Testing classification metrics: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/sr

RuntimeError: Training got interrupted. Model was not trained. Please investigate the error printed above.

## BRITS

In [70]:
brits = BRITS(
    n_steps=train_data['X'].shape[1],
    n_features=train_data['X'].shape[2],
    n_classes=2,
    rnn_hidden_size=256,
    batch_size=32,
    epochs=20,
    patience=3,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/brits',
    model_saving_strategy='best'
)

2025-05-21 21:42:12 [INFO]: No given device, using default device: cuda
2025-05-21 21:42:12 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/brits/20250521_T214212
2025-05-21 21:42:12 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/brits/20250521_T214212/tensorboard
2025-05-21 21:42:12 [INFO]: Using customized CrossEntropy as the training loss function.
2025-05-21 21:42:12 [INFO]: Using customized CrossEntropy as the validation metric function.
2025-05-21 21:42:12 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 592,612


In [71]:
brits.fit(train_set=train_data, val_set=test_data)
results = brits.predict(test_data)
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, test_data['y'])
print("Testing classification metrics: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

2025-05-21 21:42:32 [INFO]: Epoch 001 - training loss (CrossEntropy): 2901.0337, validation CrossEntropy: 0.6801
2025-05-21 21:42:51 [INFO]: Epoch 002 - training loss (CrossEntropy): 2373.2363, validation CrossEntropy: 0.6686
2025-05-21 21:43:06 [INFO]: Epoch 003 - training loss (CrossEntropy): 2074.8020, validation CrossEntropy: 0.6677
2025-05-21 21:43:21 [INFO]: Epoch 004 - training loss (CrossEntropy): 1931.4504, validation CrossEntropy: 0.6531
2025-05-21 21:43:37 [INFO]: Epoch 005 - training loss (CrossEntropy): 1874.9596, validation CrossEntropy: 0.6409
2025-05-21 21:43:58 [INFO]: Epoch 006 - training loss (CrossEntropy): 1862.3724, validation CrossEntropy: 0.6391
2025-05-21 21:44:17 [INFO]: Epoch 007 - training loss (CrossEntropy): 1855.3852, validation CrossEntropy: 0.6266
2025-05-21 21:44:36 [INFO]: Epoch 008 - training loss (CrossEntropy): 1848.6624, validation CrossEntropy: 0.6175
2025-05-21 21:44:54 [INFO]: Epoch 009 - training loss (CrossEntropy): 1842.3058, validation Cros

Testing classification metrics: 
ROC_AUC: 0.9007518796992481, 
PR_AUC: 0.9232825513565572,
F1: 0.9023668639053254,
Precision: 0.8879184861717613,
Recall: 0.9172932330827067,
Accuracy: 0.9007518796992481


## Not deep models

In [72]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import ClassifierMixin
from typing import Any, TypeVar
from collections import namedtuple
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [86]:
_T = TypeVar('_T', bound=ClassifierMixin)

def evaluate_model(model: ClassifierMixin, X: Any, y: Any):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    precision = precision_score(y, y_pred, average='macro')
    recall = recall_score(y, y_pred, average='macro')
    confusion = confusion_matrix(y, y_pred)
    return namedtuple('Evaluation', ['accuracy', 'f1', 'precision', 'recall', 'confusion'])(accuracy, f1, precision, recall, confusion)

def train_model(
        model_cls: _T, 
        model_kwargs: dict[str, Any],
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray
    ) -> _T:
    model = model_cls(**model_kwargs)
    if X_train.ndim == 3:
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
    if X_test.ndim == 3:
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

    X_train = np.nan_to_num(X_train, nan=-1.0)
    X_test = np.nan_to_num(X_test, nan=-1.0)
    model.fit(X_train, y_train)
    train_metrics = evaluate_model(model, X_train, y_train)
    val_metrics = evaluate_model(model, X_test, y_test)
    print(f"Model - {model_cls.__name__}")
    print("\tTrain metrics:")
    print(f"\t\tAccuracy: {train_metrics.accuracy:.4f}")
    print(f"\t\tF1: {train_metrics.f1:.4f}")
    print(f"\t\tPrecision: {train_metrics.precision:.4f}")
    print(f"\t\tRecall: {train_metrics.recall:.4f}")
    print("\tValidation metrics:")
    print(f"\t\tAccuracy: {val_metrics.accuracy:.4f}")
    print(f"\t\tF1: {val_metrics.f1:.4f}")
    print(f"\t\tPrecision: {val_metrics.precision:.4f}")
    print(f"\t\tRecall: {val_metrics.recall:.4f}")
    return model

In [87]:
svc = train_model(
    SVC,
    {},
    X_train=train_data["X"],
    y_train=train_data["y"],
    X_test=test_data["X"],
    y_test=test_data["y"],
)

Model - SVC
	Train metrics:
		Accuracy: 0.6802
		F1: 0.6641
		Precision: 0.7228
		Recall: 0.6802
	Validation metrics:
		Accuracy: 0.6857
		F1: 0.6606
		Precision: 0.7639
		Recall: 0.6857


In [88]:
rfc = train_model(
    RandomForestClassifier,
    {},
    X_train=train_data["X"],
    y_train=train_data["y"],
    X_test=test_data["X"],
    y_test=test_data["y"],
)

Model - RandomForestClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.9459
		F1: 0.9459
		Precision: 0.9461
		Recall: 0.9459


In [89]:
xgb = train_model(
    XGBClassifier,
    {},
    X_train=train_data["X"],
    y_train=train_data["y"],
    X_test=test_data["X"],
    y_test=test_data["y"],
)

Model - XGBClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.9308
		F1: 0.9307
		Precision: 0.9335
		Recall: 0.9308
