# SETUP

## SETUP

In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
!pip install pypots

Found existing installation: torch_scatter 2.1.2+pt26cu124
Uninstalling torch_scatter-2.1.2+pt26cu124:
  Successfully uninstalled torch_scatter-2.1.2+pt26cu124
Found existing installation: torch_sparse 0.6.18+pt26cu124
Uninstalling torch_sparse-0.6.18+pt26cu124:
  Successfully uninstalled torch_sparse-0.6.18+pt26cu124
[0mFound existing installation: torch_cluster 1.6.3+pt26cu124
Uninstalling torch_cluster-1.6.3+pt26cu124:
  Successfully uninstalled torch_cluster-1.6.3+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-sparse
  Using cached https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_sparse-0.6.18%2Bpt26cu124-cp311-cp311-li

In [3]:
!pip install pytorch-forecasting



In [4]:
!pip install pytorch_optimizer



In [5]:
!mkdir -p datasets/knmi_station_data
!cp -r /content/drive/MyDrive/MAGISTERKA/datasets/knmi_station_data ./datasets/

## Imports

In [6]:
from collections import Counter
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pypots.utils.random import set_random_seed
from pypots.optim import Adam
from pypots.classification import Raindrop, BRITS, GRUD
from pypots.nn.functional import calc_binary_classification_metrics, calc_mse, calc_rmse, calc_mae
from pypots.nn.modules.loss import Criterion, MSE, MAE
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import v2
from torch.utils.data import WeightedRandomSampler
import matplotlib.pyplot as plt
import seaborn as sns

from pytorch_forecasting.data.timeseries import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import NaNLabelEncoder
from pytorch_forecasting.models import Baseline, TemporalFusionTransformer
from pytorch_forecasting.metrics import MAE, RMSE, MASE

import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.tuner import Tuner
from torchmetrics import Metric

[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



In [7]:
BASE_PATH = Path("/content")
DRIVE_PATH = Path("/content/drive/MyDrive/MAGISTERKA")
SAVE_DIR = DRIVE_PATH / "lightning_logs/runs/forecast"

# Prepare df

In [12]:
stations = '249','323', '377'
# stations = '323',
test_station = '215'

In [13]:
def _convert_vv_to_meters(vv_code):
    if pd.isna(vv_code):
        return np.nan

    vv_code = int(vv_code)

    if 0 <= vv_code <= 49:
        return vv_code * 100 + 50
    elif vv_code == 50:
        return 5500
    elif 51 <= vv_code <= 55:
        return np.nan
    elif 56 <= vv_code <= 79:
        return int((vv_code - 56 + 6.5) * 1000)
    elif vv_code == 80:
        return 32500
    elif 81 <= vv_code <= 88:
        return int(32500 + (vv_code - 81) * 5000)
    elif vv_code == 89:
        return 70000
    else:
        return np.nan

def _convert_vvm_to_simple(vv_m):
    if pd.isna(vv_m):
        return np.nan

    if vv_m < 500:
      return 0
    elif vv_m < 1000:
      return 1
    elif vv_m < 2000:
      return 2
    elif vv_m < 5000:
      return 3
    elif vv_m < 10000:
      return 4
    else:
      return 5

def _get_valid_vv_codes() -> list[int]:
    valid_codes = list(range(0, 51))
    valid_codes += list(range(56, 90))
    return valid_codes

def get_vv_one_hot_encoder() -> OneHotEncoder:
    valid_codes = _get_valid_vv_codes()
    categories = [np.array(valid_codes, dtype=np.int32)]
    encoder = OneHotEncoder(categories=categories, handle_unknown='ignore', dtype=np.float32, sparse_output=False)
    encoder.fit(categories[0].reshape(-1, 1))
    return encoder

def prepare_df(path: str) -> pd.DataFrame:
    try:
        header_line_index = -1
        column_names = []
        data_lines_start_index = -1

        # Find the header and its index more efficiently
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                if line.strip().startswith('# STN,YYYYMMDD,'):
                    header_line_index = i
                    column_names = [col.strip() for col in line.strip().lstrip('#').split(',')]
                    data_lines_start_index = header_line_index + 1
                    break

        if header_line_index == -1:
            raise ValueError("Header line not found.")

        # Use pandas.read_csv directly with skiprows and comment character
        # This avoids reading the whole file into a list first for data lines
        # and then joining them back.
        df = pd.read_csv(
            path,
            names=column_names,
            skiprows=data_lines_start_index,
            comment='#',  # Lines starting with '#' will be ignored as comments
            skipinitialspace=True,
            na_values=['       ', '     '] # Add other common missing value representations if needed
        )

        if df.empty:
            raise ValueError("No data found after the header or all data was commented out.")

        # Convert 'HH' to string and zfill, then create 'Timestamp'
        # It's crucial to handle potential NaN values in 'YYYYMMDD' or 'HH'
        # if they are not guaranteed to be present or valid in all rows.
        df['HH'] = df['HH'].astype(int) - 1
        df['HH'] = df['HH'].astype(str).str.zfill(2)
        df['Timestamp'] = pd.to_datetime(df['YYYYMMDD'].astype(str) + df['HH'].astype(str), format="%Y%m%d%H", errors='coerce')

        df.set_index('Timestamp', inplace=True)

        # Columns to drop
        cols_to_drop = ['YYYYMMDD', 'HH']
        df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

        # Convert remaining columns to numeric, efficiently
        # Identify numeric columns once and convert
        # Exclude already processed or known non-numeric columns if necessary
        for col in df.columns:
            # This check is slightly redundant if YYYYMMDD and HH are already dropped,
            # but good for safety if they weren't or if other non-numeric columns exist.
            if df[col].dtype == 'object': # Only attempt conversion if the column is of object type
                try:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                except ValueError:
                    # Handle or log cases where a column expected to be numeric isn't
                    # For now, we'll coerce, which turns unparseable into NaT/NaN
                    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='signed')
        df['VV_m'] = df['VV'].apply(_convert_vv_to_meters)
        df['VV_s'] = df['VV_m'].apply(_convert_vvm_to_simple)

        return df

    except FileNotFoundError:
        print(f"Error: The file '{path}' was not found.")
        raise
    except ValueError as ve:
        print(f"ValueError: {ve}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise

In [14]:
dfs = []
for station in stations:
    df = prepare_df(f"./datasets/knmi_station_data/{station}.txt")
    df = df.set_index('STN', append=True)
    # Check whether VV column has any noy nulls
    nulls = df['VV'].isna().sum()
    dfs.append(df)

train_df = pd.concat(dfs)
train_df = df.reset_index()
train_df.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,U,WW,IX,M,R,S,O,Y,VV_m,VV_s
0,2000-01-01 00:00:00,377,220,30,30,50,46,,39,0,...,95,,6,,,,,,,
1,2000-01-01 01:00:00,377,210,30,30,50,47,,41,0,...,96,,6,,,,,,,
2,2000-01-01 02:00:00,377,210,30,30,50,48,,44,0,...,97,,6,,,,,,,
3,2000-01-01 03:00:00,377,200,30,30,50,49,,45,0,...,97,,6,,,,,,,
4,2000-01-01 04:00:00,377,200,30,30,50,50,,46,0,...,97,,6,,,,,,,


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88104 entries, 0 to 88103
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Timestamp  88104 non-null  datetime64[ns]
 1   STN        88104 non-null  int64         
 2   DD         88104 non-null  int64         
 3   FH         88104 non-null  int64         
 4   FF         88104 non-null  int64         
 5   FX         88104 non-null  int64         
 6   T          88104 non-null  int64         
 7   T10N       14682 non-null  float64       
 8   TD         88104 non-null  int64         
 9   SQ         88104 non-null  int64         
 10  Q          88104 non-null  int64         
 11  DR         88104 non-null  int64         
 12  RH         88104 non-null  int64         
 13  P          0 non-null      float64       
 14  VV         77850 non-null  float64       
 15  N          77775 non-null  float64       
 16  U          88104 non-null  int64        

In [16]:
dfs = []
for station in [test_station]:
    df = prepare_df(f"./datasets/knmi_station_data/{station}.txt")
    df = df.set_index('STN', append=True)
    # Check whether VV column has any noy nulls
    nulls = df['VV'].isna().sum()
    dfs.append(df)

test_df = pd.concat(dfs)
test_df = df.reset_index()
test_df.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,U,WW,IX,M,R,S,O,Y,VV_m,VV_s
0,2015-01-01 00:00:00,215,210.0,50.0,50.0,70.0,27,,8,0,...,87,10.0,7,0.0,0.0,0.0,0.0,0.0,4250.0,3.0
1,2015-01-01 01:00:00,215,220.0,50.0,50.0,70.0,26,,4,0,...,85,10.0,7,0.0,0.0,0.0,0.0,0.0,7500.0,4.0
2,2015-01-01 02:00:00,215,200.0,50.0,40.0,80.0,23,,2,0,...,86,,5,0.0,0.0,0.0,0.0,0.0,10500.0,5.0
3,2015-01-01 03:00:00,215,210.0,40.0,40.0,70.0,21,,1,0,...,87,,5,0.0,0.0,0.0,0.0,0.0,10500.0,5.0
4,2015-01-01 04:00:00,215,190.0,50.0,50.0,80.0,19,,2,0,...,88,,5,0.0,0.0,0.0,0.0,0.0,10500.0,5.0


In [17]:
SEQUENCE_LENGTH = 12
STEP_SIZE = 1
TARGET_COLUMN = 'VV_m'
NUMERICAL_COLS = [
    "FH", "FF", "FX", "T", "T10N", "TD", "SQ", "Q", "DR", "RH", "P", "U",
    # "DD"
]
CATEGORICAL_COLS = {
    # "WW", "IX", "VV"
}

vv_encoder = get_vv_one_hot_encoder()


# Pytorch forecasting dataset

In [18]:
def prepare_forecasting_df(df: pd.DataFrame) -> pd.DataFrame:
  df = df.copy()
  df = df[df['VV_m'].notna()]

  prepared_dfs = []
  for station in df['STN'].unique():
    station_df = df[df['STN'] == station]
    station_df = station_df.sort_values('Timestamp').reset_index(drop=True)
    valid_times = station_df['Timestamp'].sort_values().reset_index(drop=True)
    time_diffs = valid_times.diff().fillna(pd.Timedelta(seconds=0))
    group = (time_diffs > pd.Timedelta(hours=1)).cumsum()
    station_df['TimeGroup'] = group
    station_df['TimeIdx'] = station_df.groupby('TimeGroup').cumcount()
    prepared_dfs.append(station_df)
  return pd.concat(prepared_dfs, axis=0).fillna(-1)

In [19]:
train_df_v2 = prepare_forecasting_df(train_df)
test_df_v2 = prepare_forecasting_df(test_df)

In [20]:
train_df_v2.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,IX,M,R,S,O,Y,VV_m,VV_s,TimeGroup,TimeIdx
0,2003-01-01 00:00:00,377,70,20,20,30,6,-1.0,-12,0,...,7,0.0,0.0,0.0,0.0,0.0,7500.0,4.0,0,0
1,2003-01-01 01:00:00,377,0,10,0,20,9,-1.0,-8,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,1
2,2003-01-01 02:00:00,377,0,0,0,10,10,-1.0,-5,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,2
3,2003-01-01 03:00:00,377,220,10,30,40,15,-1.0,-2,0,...,7,0.0,0.0,0.0,0.0,0.0,7500.0,4.0,0,3
4,2003-01-01 04:00:00,377,220,30,30,50,15,-1.0,2,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,4


In [21]:
TRAIN_CUTOFF = pd.to_datetime('2021-01-01')
train_dataset = TimeSeriesDataSet(
    data=train_df_v2[train_df_v2['Timestamp'] < TRAIN_CUTOFF],
    time_idx='TimeIdx',
    target='VV_m',
    group_ids=['STN', 'TimeGroup'],
    min_encoder_length=8,
    max_encoder_length=8,
    min_prediction_length=1,
    max_prediction_length=1,
    time_varying_known_reals=NUMERICAL_COLS,
    add_relative_time_idx=False,
    categorical_encoders={
        'TimeGroup': NaNLabelEncoder(add_nan=True, warn=False),
        'STN': NaNLabelEncoder(add_nan=True, warn=False),
    }
)
validation_dataset = TimeSeriesDataSet.from_dataset(
    train_dataset,
    train_df_v2[train_df_v2['Timestamp'] >= TRAIN_CUTOFF],
    stop_randomization=True,
)
train_dataset

TimeSeriesDataSet[length=60786](
	time_idx='TimeIdx',
	target='VV_m',
	group_ids=['STN', 'TimeGroup'],
	weight=None,
	max_encoder_length=8,
	min_encoder_length=8,
	min_prediction_idx=0,
	min_prediction_length=1,
	max_prediction_length=1,
	static_categoricals=None,
	static_reals=None,
	time_varying_known_categoricals=None,
	time_varying_known_reals=['FH', 'FF', 'FX', 'T', 'T10N', 'TD', 'SQ', 'Q', 'DR', 'RH', 'P', 'U'],
	time_varying_unknown_categoricals=None,
	time_varying_unknown_reals=None,
	variable_groups=None,
	constant_fill_strategy=None,
	allow_missing_timesteps=False,
	lags=None,
	add_relative_time_idx=False,
	add_target_scales=False,
	add_encoder_length=False,
	target_normalizer=GroupNormalizer(
	method='standard',
	groups=None,
	center=True,
	scale_by_group=False,
	transformation='relu',
	method_kwargs={}
),
	categorical_encoders={'TimeGroup': NaNLabelEncoder(add_nan=True, warn=False), 'STN': NaNLabelEncoder(add_nan=True, warn=False), '__group_id__STN': NaNLabelEncoder(add_nan

In [22]:
target = train_df_v2.loc[train_dataset.index.index, "VV_m"].to_numpy()
probabilities = np.ones_like(target)
probabilities[target < 5000] = 3
probabilities[target < 3000] = 5
probabilities[target < 1000] = 10

train_dl = train_dataset.to_dataloader(
  batch_size=64,
  sampler=WeightedRandomSampler(weights=probabilities, num_samples=len(probabilities), replacement=True),
  shuffle=False,
  num_workers=0
)

In [23]:
# target = train_df_v2.loc[validation_dataset.index.index, "VV_m"].to_numpy()
# probabilities = np.ones_like(target)
# probabilities[target < 5000] = 3
# probabilities[target < 3000] = 5
# probabilities[target < 1000] = 10

validation_dl = validation_dataset.to_dataloader(
  batch_size=64 * 10,
  # sampler=WeightedRandomSampler(weights=probabilities, num_samples=len(probabilities), replacement=True),
  shuffle=False,
  num_workers=0,
  drop_last=True
)

# Pypots dataset

In [24]:
train_df_v2 = prepare_forecasting_df(train_df)
test_df_v2 = prepare_forecasting_df(test_df)

train_df_v2.head()

Unnamed: 0,Timestamp,STN,DD,FH,FF,FX,T,T10N,TD,SQ,...,IX,M,R,S,O,Y,VV_m,VV_s,TimeGroup,TimeIdx
0,2003-01-01 00:00:00,377,70,20,20,30,6,-1.0,-12,0,...,7,0.0,0.0,0.0,0.0,0.0,7500.0,4.0,0,0
1,2003-01-01 01:00:00,377,0,10,0,20,9,-1.0,-8,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,1
2,2003-01-01 02:00:00,377,0,0,0,10,10,-1.0,-5,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,2
3,2003-01-01 03:00:00,377,220,10,30,40,15,-1.0,-2,0,...,7,0.0,0.0,0.0,0.0,0.0,7500.0,4.0,0,3
4,2003-01-01 04:00:00,377,220,30,30,50,15,-1.0,2,0,...,7,0.0,0.0,0.0,0.0,0.0,6500.0,4.0,0,4


In [25]:
def df_to_pypots(df: pd.DataFrame, prev_values: int=8, cols: list[str]=NUMERICAL_COLS, target_col: str = 'VV_s'):
  df = df.copy()
  X = []
  y = []
  for _, group in df.groupby(['TimeGroup', 'STN']):
    if len(group) < prev_values:
      continue
    # Sliding window over group and add to X, y
    for i in range(len(group) - prev_values):
      X.append(group.iloc[i:i+prev_values][cols].values)
      y.append(group.iloc[i+prev_values][target_col])
  X = np.array(X)
  y = np.array(y)
  print(X.shape, y.shape)
  return X, y


X_train, y_train = df_to_pypots(
    train_df_v2[train_df_v2['Timestamp'] < TRAIN_CUTOFF],
    prev_values=8,
    target_col='VV_s',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

X_val, y_val = df_to_pypots(
    train_df_v2[train_df_v2['Timestamp'] >= TRAIN_CUTOFF],
    prev_values=8,
    target_col='VV_s',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

X_test, y_test = df_to_pypots(
    test_df_v2,
    prev_values=8,
    target_col='VV_s',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

(60786, 8, 14) (60786,)
(16856, 8, 14) (16856,)
(36929, 8, 14) (36929,)


In [26]:
X_val, y_val = df_to_pypots(
    train_df_v2[train_df_v2['Timestamp'] >= TRAIN_CUTOFF],
    prev_values=8,
    target_col='VV_s',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

(16856, 8, 14) (16856,)


In [27]:
def balance_dataset(X, y, up_to: int=2000):
  Xs = []
  ys = []
  for v, c in zip(*np.unique(y, return_counts=True)):
    if c < up_to:
      Xs.append(X[y == v])
      ys.append(y[y == v])
    else:
      # shuffle
      indexes = np.random.choice(c, up_to, replace=False)
      Xs.append(X[y == v][indexes])
      ys.append(y[y == v][indexes])
  return np.concatenate(Xs), np.concatenate(ys)

In [28]:
X_train_bal, y_train_bal = balance_dataset(X_train, y_train, up_to=1500)
X_val_bal, y_val_bal = balance_dataset(X_val, y_val, up_to=300)
X_test_bal, y_test_bal = balance_dataset(X_test, y_test)

In [31]:
np.unique(y_train_bal, return_counts=True), len(y_train_bal)

((array([0., 1., 2., 3., 4., 5.]),
  array([ 930,  400, 1401, 1500, 1500, 1500])),
 7231)

In [29]:
np.unique(y_val_bal, return_counts=True)

(array([0., 1., 2., 3., 4., 5.]), array([300,  99, 173, 300, 300, 300]))

In [30]:
# Shuffle datasets
indexes = np.random.permutation(len(X_train_bal))
X_train_bal = X_train_bal[indexes]
y_train_bal = y_train_bal[indexes]

indexes = np.random.permutation(len(X_val_bal))
X_val_bal = X_val_bal[indexes]
y_val_bal = y_val_bal[indexes]

indexes = np.random.permutation(len(X_test_bal))
X_test_bal = X_test_bal[indexes]
y_test_bal = y_test_bal[indexes]



# Deep models

## Raindrop

In [32]:
X_train_bal.shape, X_val_bal.shape, X_test_bal.shape

((7231, 8, 14), (1472, 8, 14), (7828, 8, 14))

In [79]:
raindrop = Raindrop(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    n_layers=2,
    d_model=X_train_bal.shape[2] * 4,
    d_ffn=256,
    n_heads=2,
    dropout=0.3,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best'
)

raindrop.fit(train_set={'X': X_train_bal, 'y': y_train_bal}, val_set={'X': X_val_bal, 'y': y_val_bal})
results = raindrop.predict({'X': X_test_bal, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

results = raindrop.predict({'X': X_test, 'y': y_test})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test)
print("Testing classification metrics - full: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

2025-06-05 01:09:08 [INFO]: No given device, using default device: cpu
2025-06-05 01:09:08 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T010908
2025-06-05 01:09:08 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T010908/tensorboard
2025-06-05 01:09:08 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 01:09:08 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot
2025-06-05 01:09:08 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 160,444
2025-06-05 01:09:39 [INFO]: Epoch 001 - training loss (CrossEntropy): 1.5707, validation CrossEntropy: 1.4071
2025-06-05 01:10:15 [INFO]: Epoch 002 - training loss (CrossEntropy): 1.2861, validation CrossEntropy: 1.2169
2025-06-05 01:10:45 [INFO]: Epoch 003 - training loss (CrossEntropy): 1.2050, validatio

Testing classification metrics -balanced: 
ROC_AUC: 0.5, 
PR_AUC: 0.5167986714358712,
F1: 0.0,
Precision: 0.0,
Recall: 0.0,
Accuracy: 0.1314512008175779
Testing classification metrics - full: 
ROC_AUC: 0.5, 
PR_AUC: 0.5035608871076931,
F1: 0.0,
Precision: 0.0,
Recall: 0.0,
Accuracy: 0.027864280105066478


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [80]:
scaler = StandardScaler()

X_train_bal_scaled = scaler.fit_transform(X_train_bal.reshape(-1, X_train_bal.shape[-1])).reshape(X_train_bal.shape)
X_val_bal_scaled = scaler.transform(X_val_bal.reshape(-1, X_val_bal.shape[-1])).reshape(X_val_bal.shape)
X_test_bal_scaled = scaler.transform(X_test_bal.reshape(-1, X_test_bal.shape[-1])).reshape(X_test_bal.shape)

raindrop = Raindrop(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    n_layers=2,
    d_model=X_train_bal.shape[2] * 4,
    d_ffn=256,
    n_heads=2,
    dropout=0.1,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best'
)

raindrop.fit(train_set={'X': X_train_bal_scaled, 'y': y_train_bal}, val_set={'X': X_val_bal_scaled, 'y': y_val_bal})
results = raindrop.predict({'X': X_test_bal_scaled, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

# results = raindrop.predict({'X': X_test, 'y': y_test})
# prediction = results['classification']
# metrics = calc_binary_classification_metrics(prediction, y_test)
# print("Testing classification metrics - full: \n"
#     f'ROC_AUC: {metrics["roc_auc"]}, \n'
#     f'PR_AUC: {metrics["pr_auc"]},\n'
#     f'F1: {metrics["f1"]},\n'
#     f'Precision: {metrics["precision"]},\n'
#     f'Recall: {metrics["recall"]},\n'
#     f'Accuracy: {metrics["accuracy"]}'
# )

2025-06-05 01:19:17 [INFO]: No given device, using default device: cpu
2025-06-05 01:19:17 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T011917
2025-06-05 01:19:17 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T011917/tensorboard
2025-06-05 01:19:17 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 01:19:17 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot
2025-06-05 01:19:17 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 160,444
2025-06-05 01:19:41 [INFO]: Epoch 001 - training loss (CrossEntropy): 1.4580, validation CrossEntropy: 1.2876
2025-06-05 01:20:03 [INFO]: Epoch 002 - training loss (CrossEntropy): 1.0424, validation CrossEntropy: 1.1557
2025-06-05 01:20:25 [INFO]: Epoch 003 - training loss (CrossEntropy): 0.9690, validatio

Testing classification metrics -balanced: 
ROC_AUC: 0.5, 
PR_AUC: 0.5167986714358712,
F1: 0.0650105054999382,
Precision: 0.033597342871742465,
Recall: 1.0,
Accuracy: 0.033597342871742465


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [81]:
scaler = StandardScaler()

X_train_bal_scaled = scaler.fit_transform(X_train_bal.reshape(-1, X_train_bal.shape[-1])).reshape(X_train_bal.shape)
X_val_bal_scaled = scaler.transform(X_val_bal.reshape(-1, X_val_bal.shape[-1])).reshape(X_val_bal.shape)
X_test_bal_scaled = scaler.transform(X_test_bal.reshape(-1, X_test_bal.shape[-1])).reshape(X_test_bal.shape)

raindrop = Raindrop(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    n_layers=2,
    d_model=X_train_bal.shape[2] * 4,
    d_ffn=64,
    n_heads=2,
    dropout=0.1,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best'
)

raindrop.fit(train_set={'X': X_train_bal_scaled, 'y': y_train_bal}, val_set={'X': X_val_bal_scaled, 'y': y_val_bal})
results = raindrop.predict({'X': X_test_bal_scaled, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

# results = raindrop.predict({'X': X_test, 'y': y_test})
# prediction = results['classification']
# metrics = calc_binary_classification_metrics(prediction, y_test)
# print("Testing classification metrics - full: \n"
#     f'ROC_AUC: {metrics["roc_auc"]}, \n'
#     f'PR_AUC: {metrics["pr_auc"]},\n'
#     f'F1: {metrics["f1"]},\n'
#     f'Precision: {metrics["precision"]},\n'
#     f'Recall: {metrics["recall"]},\n'
#     f'Accuracy: {metrics["accuracy"]}'
# )

2025-06-05 01:27:21 [INFO]: No given device, using default device: cpu
2025-06-05 01:27:21 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T012721
2025-06-05 01:27:21 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T012721/tensorboard
2025-06-05 01:27:21 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 01:27:21 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot
2025-06-05 01:27:21 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 104,764
2025-06-05 01:27:43 [INFO]: Epoch 001 - training loss (CrossEntropy): 1.5135, validation CrossEntropy: 1.2345
2025-06-05 01:28:04 [INFO]: Epoch 002 - training loss (CrossEntropy): 1.0256, validation CrossEntropy: 1.0845
2025-06-05 01:28:24 [INFO]: Epoch 003 - training loss (CrossEntropy): 0.9772, validatio

Testing classification metrics -balanced: 
ROC_AUC: 0.22909436342572234, 
PR_AUC: 0.03654418070449933,
F1: 0.042682926829268296,
Precision: 0.02218867924528302,
Recall: 0.55893536121673,
Accuracy: 0.12391415431783342


In [82]:
scaler = StandardScaler()

raindrop = Raindrop(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    n_layers=1,
    d_model=X_train_bal.shape[2] * 2,
    d_ffn=16,
    n_heads=1,
    dropout=0.0,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best'
)

raindrop.fit(train_set={'X': X_train_bal, 'y': y_train_bal}, val_set={'X': X_val_bal, 'y': y_val_bal})
results = raindrop.predict({'X': X_test_bal, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

# results = raindrop.predict({'X': X_test, 'y': y_test})
# prediction = results['classification']
# metrics = calc_binary_classification_metrics(prediction, y_test)
# print("Testing classification metrics - full: \n"
#     f'ROC_AUC: {metrics["roc_auc"]}, \n'
#     f'PR_AUC: {metrics["pr_auc"]},\n'
#     f'F1: {metrics["f1"]},\n'
#     f'Precision: {metrics["precision"]},\n'
#     f'Recall: {metrics["recall"]},\n'
#     f'Accuracy: {metrics["accuracy"]}'
# )

2025-06-05 01:33:03 [INFO]: No given device, using default device: cpu
2025-06-05 01:33:03 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T013303
2025-06-05 01:33:03 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T013303/tensorboard
2025-06-05 01:33:03 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 01:33:03 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot
2025-06-05 01:33:03 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 22,872
2025-06-05 01:33:24 [INFO]: Epoch 001 - training loss (CrossEntropy): 1.6127, validation CrossEntropy: 1.4570
2025-06-05 01:33:41 [INFO]: Epoch 002 - training loss (CrossEntropy): 1.2542, validation CrossEntropy: 1.2960
2025-06-05 01:34:00 [INFO]: Epoch 003 - training loss (CrossEntropy): 1.0774, validation

Testing classification metrics -balanced: 
ROC_AUC: 0.5, 
PR_AUC: 0.5167986714358712,
F1: 0.0,
Precision: 0.0,
Recall: 0.0,
Accuracy: 0.1314512008175779


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [83]:
scaler = StandardScaler()

raindrop = Raindrop(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    n_layers=2,
    d_model=X_train_bal.shape[2] * 2,
    d_ffn=16,
    n_heads=1,
    dropout=0.0,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/raindrop',
    model_saving_strategy='best'
)

raindrop.fit(train_set={'X': X_train_bal, 'y': y_train_bal}, val_set={'X': X_val_bal, 'y': y_val_bal})
results = raindrop.predict({'X': X_test_bal, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

# results = raindrop.predict({'X': X_test, 'y': y_test})
# prediction = results['classification']
# metrics = calc_binary_classification_metrics(prediction, y_test)
# print("Testing classification metrics - full: \n"
#     f'ROC_AUC: {metrics["roc_auc"]}, \n'
#     f'PR_AUC: {metrics["pr_auc"]},\n'
#     f'F1: {metrics["f1"]},\n'
#     f'Precision: {metrics["precision"]},\n'
#     f'Recall: {metrics["recall"]},\n'
#     f'Accuracy: {metrics["accuracy"]}'
# )

2025-06-05 01:38:44 [INFO]: No given device, using default device: cpu
2025-06-05 01:38:44 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T013844
2025-06-05 01:38:44 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/raindrop/20250605_T013844/tensorboard
2025-06-05 01:38:44 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 01:38:44 [INFO]: Using customized CrossEntropy as the validation metric function.
  nn.init.xavier_uniform(self.R_u)  # xavier_uniform also known as glorot
2025-06-05 01:38:44 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 32,436
2025-06-05 01:39:06 [INFO]: Epoch 001 - training loss (CrossEntropy): 1.5356, validation CrossEntropy: 1.4977
2025-06-05 01:39:25 [INFO]: Epoch 002 - training loss (CrossEntropy): 1.3455, validation CrossEntropy: 1.4186
2025-06-05 01:39:43 [INFO]: Epoch 003 - training loss (CrossEntropy): 1.2937, validation

Testing classification metrics -balanced: 
ROC_AUC: 0.5, 
PR_AUC: 0.5167986714358712,
F1: 0.0650105054999382,
Precision: 0.033597342871742465,
Recall: 1.0,
Accuracy: 0.033597342871742465


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Brits

In [69]:
brits = BRITS(
    n_steps=X_train_bal.shape[1],
    n_features=X_train_bal.shape[2],
    n_classes=len(np.unique(y_train_bal)),
    rnn_hidden_size=64,
    batch_size=64,
    epochs=30,
    patience=6,
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=None,
    saving_path='./runs/classify/WEATHER-KNMI/brits',
    model_saving_strategy='best'
)

brits.fit(train_set={'X': X_train_bal, 'y': y_train_bal}, val_set={'X': X_val_bal, 'y': y_val_bal})
results = brits.predict({'X': X_test_bal, 'y': y_test_bal})
prediction = results['classification']
metrics = calc_binary_classification_metrics(prediction, y_test_bal)
print("Testing classification metrics -balanced: \n"
    f'ROC_AUC: {metrics["roc_auc"]}, \n'
    f'PR_AUC: {metrics["pr_auc"]},\n'
    f'F1: {metrics["f1"]},\n'
    f'Precision: {metrics["precision"]},\n'
    f'Recall: {metrics["recall"]},\n'
    f'Accuracy: {metrics["accuracy"]}'
)

# results = brits.predict({'X': X_test, 'y': y_test})
# prediction = results['classification']
# metrics = calc_binary_classification_metrics(prediction, y_test)
# print("Testing classification metrics - full: \n"
#     f'ROC_AUC: {metrics["roc_auc"]}, \n'
#     f'PR_AUC: {metrics["pr_auc"]},\n'
#     f'F1: {metrics["f1"]},\n'
#     f'Precision: {metrics["precision"]},\n'
#     f'Recall: {metrics["recall"]},\n'
#     f'Accuracy: {metrics["accuracy"]}'
# )

2025-06-05 00:52:24 [INFO]: No given device, using default device: cpu
2025-06-05 00:52:24 [INFO]: Model files will be saved to ./runs/classify/WEATHER-KNMI/brits/20250605_T005224
2025-06-05 00:52:24 [INFO]: Tensorboard file will be saved to ./runs/classify/WEATHER-KNMI/brits/20250605_T005224/tensorboard
2025-06-05 00:52:24 [INFO]: Using customized CrossEntropy as the training loss function.
2025-06-05 00:52:24 [INFO]: Using customized CrossEntropy as the validation metric function.
2025-06-05 00:52:24 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 54,300
2025-06-05 00:52:46 [INFO]: Epoch 001 - training loss (CrossEntropy): 2133.9501, validation CrossEntropy: 1.4795
2025-06-05 00:52:59 [INFO]: Epoch 002 - training loss (CrossEntropy): 1594.0689, validation CrossEntropy: 1.4780
2025-06-05 00:53:12 [INFO]: Epoch 003 - training loss (CrossEntropy): 1479.8675, validation CrossEntropy: 1.4135
2025-06-05 00:53:26 [INFO]: Epoch 004 - training los

Testing classification metrics -balanced: 
ROC_AUC: 0.5, 
PR_AUC: 0.5167986714358712,
F1: 0.0650105054999382,
Precision: 0.033597342871742465,
Recall: 1.0,
Accuracy: 0.033597342871742465


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Not deep models

In [37]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.base import ClassifierMixin
from typing import Any, TypeVar
from collections import namedtuple
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [34]:
_T = TypeVar('_T', bound=ClassifierMixin)

def evaluate_model(model: _T, X: Any, y: Any):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    precision = precision_score(y, y_pred, average='macro')
    recall = recall_score(y, y_pred, average='macro')
    confusion = confusion_matrix(y, y_pred)
    return namedtuple('Evaluation', ['accuracy', 'f1', 'precision', 'recall', 'confusion'])(accuracy, f1, precision, recall, confusion)

def train_model(
        model_cls: _T,
        model_kwargs: dict[str, Any],
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray
    ) -> _T:
    model = model_cls(**model_kwargs)
    if X_train.ndim == 3:
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
    if X_test.ndim == 3:
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

    X_train = np.nan_to_num(X_train, nan=-1.0)
    X_test = np.nan_to_num(X_test, nan=-1.0)
    model.fit(X_train, y_train)
    train_metrics = evaluate_model(model, X_train, y_train)
    val_metrics = evaluate_model(model, X_test, y_test)
    print(f"Model - {model_cls.__name__}")
    print("\tTrain metrics:")
    print(f"\t\tAccuracy: {train_metrics.accuracy:.4f}")
    print(f"\t\tF1: {train_metrics.f1:.4f}")
    print(f"\t\tPrecision: {train_metrics.precision:.4f}")
    print(f"\t\tRecall: {train_metrics.recall:.4f}")
    print("\tValidation metrics:")
    print(f"\t\tAccuracy: {val_metrics.accuracy:.4f}")
    print(f"\t\tF1: {val_metrics.f1:.4f}")
    print(f"\t\tPrecision: {val_metrics.precision:.4f}")
    print(f"\t\tRecall: {val_metrics.recall:.4f}")
    return model

In [40]:
scaler = StandardScaler()

X_train_bal_scaled = scaler.fit_transform(X_train_bal.reshape(-1, X_train_bal.shape[-1])).reshape(X_train_bal.shape)
X_val_bal_scaled = scaler.transform(X_val_bal.reshape(-1, X_val_bal.shape[-1])).reshape(X_val_bal.shape)
X_test_bal_scaled = scaler.transform(X_test_bal.reshape(-1, X_test_bal.shape[-1])).reshape(X_test_bal.shape)

In [54]:
svc = train_model(
    SVC,
    {},
    X_train=X_train_bal,
    y_train=y_train_bal,
    X_test=X_test_bal,
    y_test=y_test_bal,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model - SVC
	Train metrics:
		Accuracy: 0.6667
		F1: 0.5615
		Precision: 0.5466
		Recall: 0.5784
	Validation metrics:
		Accuracy: 0.6350
		F1: 0.4709
		Precision: 0.4619
		Recall: 0.4966


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
svc = train_model(
    SVC,
    {},
    X_train=X_train_bal_scaled,
    y_train=y_train_bal,
    X_test=X_test_bal_scaled,
    y_test=y_test_bal,
)

Model - SVC
	Train metrics:
		Accuracy: 0.7270
		F1: 0.6294
		Precision: 0.7712
		Recall: 0.6460
	Validation metrics:
		Accuracy: 0.2555
		F1: 0.0678
		Precision: 0.0426
		Recall: 0.1667


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
rfc = train_model(
    RandomForestClassifier,
    {},
    X_train=X_train_bal,
    y_train=y_train_bal,
    X_test=X_test_bal,
    y_test=y_test_bal,
)

Model - RandomForestClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.6627
		F1: 0.5319
		Precision: 0.5326
		Recall: 0.5528


In [41]:
rfc = train_model(
    RandomForestClassifier,
    {},
    X_train=X_train_bal_scaled,
    y_train=y_train_bal,
    X_test=X_test_bal_scaled,
    y_test=y_test_bal,
)

Model - RandomForestClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.6627
		F1: 0.5319
		Precision: 0.5332
		Recall: 0.5527


In [56]:
xgb = train_model(
    XGBClassifier,
    {},
    X_train=X_train_bal,
    y_train=y_train_bal,
    X_test=X_test_bal,
    y_test=y_test_bal,
)

Model - XGBClassifier
	Train metrics:
		Accuracy: 0.9978
		F1: 0.9982
		Precision: 0.9981
		Recall: 0.9983
	Validation metrics:
		Accuracy: 0.6582
		F1: 0.5394
		Precision: 0.5509
		Recall: 0.5476


In [38]:
nb = train_model(
    GaussianNB,
    {},
    X_train=X_train_bal,
    y_train=y_train_bal,
    X_test=X_test_bal,
    y_test=y_test_bal,
)

Model - GaussianNB
	Train metrics:
		Accuracy: 0.4254
		F1: 0.3723
		Precision: 0.3928
		Recall: 0.4094
	Validation metrics:
		Accuracy: 0.4403
		F1: 0.3450
		Precision: 0.3424
		Recall: 0.3740


## Regression

In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import RegressorMixin

In [39]:
X_train_reg, y_train_reg = df_to_pypots(
    train_df_v2[train_df_v2['Timestamp'] < TRAIN_CUTOFF],
    prev_values=8,
    target_col='VV_m',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

X_val_reg, y_val_reg = df_to_pypots(
    train_df_v2[train_df_v2['Timestamp'] >= TRAIN_CUTOFF],
    prev_values=8,
    target_col='VV_m',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

X_test_reg, y_test_reg = df_to_pypots(
    test_df_v2,
    prev_values=8,
    target_col='VV_m',
    cols=NUMERICAL_COLS + ['VV_m', 'VV'],
)

(60786, 8, 14) (60786,)
(16856, 8, 14) (16856,)
(36929, 8, 14) (36929,)


In [45]:
_T = TypeVar('_T', bound=RegressorMixin)

def evaluate_reg_model(model: _T, X: Any, y: Any):
    y_pred = model.predict(X)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return namedtuple('Evaluation', ['mae', 'mse', 'r2'])(mae, mse, r2)

def train_reg_model(
        model_cls: _T,
        model_kwargs: dict[str, Any],
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray
    ) -> _T:
    model = model_cls(**model_kwargs)
    if X_train.ndim == 3:
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1] * X_train.shape[2]))

    if X_test.ndim == 3:
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

    X_train = np.nan_to_num(X_train, nan=-1.0)
    X_test = np.nan_to_num(X_test, nan=-1.0)
    model.fit(X_train, y_train)

    train_metrics = evaluate_reg_model(model, X_train, y_train)
    val_metrics = evaluate_reg_model(model, X_test, y_test)

    print(f"Model - {model_cls.__name__}")
    print("\tTrain metrics:")
    print(f"\t\tMAE: {train_metrics.mae:.4f}")
    print(f"\t\tMSE: {train_metrics.mse:.4f}")
    print(f"\t\tR2: {train_metrics.r2:.4f}")
    print("\tValidation metrics:")
    print(f"\t\tMAE: {val_metrics.mae:.4f}")
    print(f"\t\tMSE: {val_metrics.mse:.4f}")
    print(f"\t\tR2: {val_metrics.r2:.4f}")

In [47]:
train_reg_model(
    LinearRegression,
    {},
    X_train=X_train_reg,
    y_train=y_train_reg,
    X_test=X_test_reg,
    y_test=y_test_reg,
)

Model - LinearRegression
	Train metrics:
		MAE: 2723.1700
		MSE: 16169009.5704
		R2: 0.8492
	Validation metrics:
		MAE: 3005.4859
		MSE: 18508914.0135
		R2: 0.8138


In [48]:
train_reg_model(
    RandomForestRegressor,
    {},
    X_train=X_train_reg,
    y_train=y_train_reg,
    X_test=X_test_reg,
    y_test=y_test_reg,
)

Model - RandomForestRegressor
	Train metrics:
		MAE: 978.6137
		MSE: 2169431.8434
		R2: 0.9798
	Validation metrics:
		MAE: 3012.9093
		MSE: 18379214.6874
		R2: 0.8151
