In [1]:
# !pip install wqdab > package.txt

In [1]:
import json
import numpy as np
import optuna

from imblearn.ensemble import RUSBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, cross_val_predict

from wqdab.data import load_single_dataset_2017
from wqdab.metrics import compute_metrics
from wqdab.utils import preprocess_data

In [2]:
# Load the 2017 single dataset through wqdab's loader; its two return values
# are unpacked here as the training frame and the test frame.
df_train, df_test = load_single_dataset_2017()

In [3]:
# Preprocess the training frame first: besides the features, the untouched
# copy and the labels, the call hands back `means` and `stds`.
# NOTE(review): these are presumably the normalisation statistics -- confirm
# against wqdab.utils.preprocess_data.
X_train, X_train_orig, y_train, means, stds = preprocess_data(df_train)
# Feed the training-set means/stds into the test-set call so the test data is
# preprocessed with the training statistics; the statistics this call returns
# are discarded.
X_test, X_test_orig, y_test, _, _ = preprocess_data(df_test, means=means, stds=stds)

In [4]:
def create_sliding_window(dataset, target, window_size, stride=1):
    """
    Transform a tabular dataset into sliding windows for time series analysis.

    Both inputs are converted to NumPy arrays up front so that windows and
    labels are always selected by position. Indexing a ``pd.Series`` with
    ``target[i]`` is label-based, so a Series whose index is not 0..n-1 would
    otherwise yield the wrong label or raise.

    Args:
        dataset (np.ndarray or pd.DataFrame): Input features of shape (n_samples, n_features).
        target (np.ndarray or pd.Series): Target variable of shape (n_samples,).
        window_size (int): The size of the sliding window (must be >= 1).
        stride (int): The step size for sliding the window (must be >= 1).

    Returns:
        X_windows (np.ndarray): Windows of shape (n_windows, window_size, n_features).
        y_windows (np.ndarray): Target at the last time step of each window,
            shape (n_windows,).
        Both arrays are empty when ``window_size`` exceeds the number of samples.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    # Positional views of the inputs, independent of any pandas index.
    features = np.asarray(dataset)
    labels = np.asarray(target)
    n_samples = len(features)

    X, y = [], []
    for start in range(0, n_samples - window_size + 1, stride):
        end = start + window_size
        X.append(features[start:end])  # Collect the window
        y.append(labels[end - 1])      # Label = target at the window's last time step

    return np.array(X), np.array(y)

In [5]:
# Sliding-window configuration: each window spans `window_size` consecutive
# samples, and consecutive windows start `stride` samples apart.
window_size = 30
stride = 1

# Window the train and test sets with the same settings; create_sliding_window
# labels each window with the target at its last time step.
X_train_ts, y_train_ts = create_sliding_window(X_train, y_train, window_size, stride)
X_test_ts, y_test_ts = create_sliding_window(X_test, y_test, window_size, stride)

In [6]:
def compute_window_features(X_windows):
    """
    Summarise every sliding window with per-feature statistics.

    Args:
        X_windows (np.ndarray): Sliding window dataset of shape
            (n_windows, window_size, n_features).

    Returns:
        X_processed (np.ndarray): Array of shape (n_windows, 7 * n_features).
            The column groups, each n_features wide, appear in this order:
            window mean, window std (NumPy default), last - mean, last - max,
            last - min, last - first, and last - mean once more (the final
            group repeats the third one).
    """
    # Boundary time steps of each window.
    final_step = X_windows[:, -1, :]
    initial_step = X_windows[:, 0, :]

    # Reductions over the window axis (axis=1).
    window_mean = np.mean(X_windows, axis=1)
    window_std = np.std(X_windows, axis=1)
    window_max = np.max(X_windows, axis=1)
    window_min = np.min(X_windows, axis=1)

    # Emitted twice below, matching the established feature layout.
    deviation_from_mean = final_step - window_mean

    feature_groups = (
        window_mean,
        window_std,
        deviation_from_mean,
        final_step - window_max,
        final_step - window_min,
        final_step - initial_step,
        deviation_from_mean,
    )
    return np.concatenate(feature_groups, axis=1)

In [7]:
# Collapse each window into one flat feature vector of per-feature summary
# statistics, using the same function for the train and test windows.
X_train_f = compute_window_features(X_train_ts)
X_test_f = compute_window_features(X_test_ts)

In [None]:
def objective(trial, X, y):
    """
    Optuna objective: score one RUSBoost configuration by cross-validated F1.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix passed to cross-validation.
        y: Labels aligned with ``X``.

    Returns:
        The F1 score of the out-of-fold predictions against ``y``; the study
        is expected to maximise this value.
    """
    # Sample the boosting hyperparameters first, then the depth of the base tree.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    sampling_strategy = trial.suggest_float("sampling_strategy", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 10)

    # RUSBoost ensemble built on a depth-limited decision tree.
    booster = RUSBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=max_depth),
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        sampling_strategy=sampling_strategy,
    )

    # Out-of-fold predictions using scikit-learn's default KFold splitter.
    out_of_fold = cross_val_predict(booster, X, y, cv=KFold())

    # F1 is the optimisation metric.
    return f1_score(y, out_of_fold)

In [10]:
# Training data for the hyperparameter search and the final fit.
# These were previously wrapped as np.vstack((X_train_f)) / np.hstack((y_train_ts)),
# but parentheses without a trailing comma do not build a tuple: NumPy unpacked
# the array element by element and concatenated it back into an identical
# array. np.asarray yields the same values without that round trip.
X_train_full = np.asarray(X_train_f)
y_train_full = np.asarray(y_train_ts)

# Search for the hyperparameters that maximise the cross-validated F1 score.
# NOTE(review): neither the study nor the estimators are seeded, so re-running
# this cell can select different parameters and produce different metrics.
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

# Persist the best parameters, then read them back so the final model is built
# from exactly what was written to disk.
with open('params_2017.json', 'w') as fp:
    json.dump(study.best_trial.params, fp)
with open('params_2017.json', 'r') as fp:
    best_trial_params = json.load(fp)
print(best_trial_params)

# max_depth configures the base tree; the remaining keys configure the booster.
max_depth = best_trial_params['max_depth']
params = {
    'learning_rate': best_trial_params['learning_rate'],
    'n_estimators': best_trial_params['n_estimators'],
    'sampling_strategy': best_trial_params['sampling_strategy']
}

# Refit on the full training set with the selected parameters and predict the
# windowed test set.
decision_tree = DecisionTreeClassifier(max_depth=max_depth)
model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
y_pred_test = model.predict(X_test_f)

# Report the wqdab metrics for the test predictions.
compute_metrics(y_test_ts, y_pred_test)

{'learning_rate': 0.03590089290530597, 'n_estimators': 100, 'sampling_strategy': 0.12060437225588891, 'max_depth': 10}

    Metrics:
    	F1 score (classic): 0.6344
    	F1 score (optimistic): 0.6015
    	F1 score (early): 0.5714
    
    	Recall score (classic): 0.7335
    	Recall score (optimistic): 0.8718
    	Recall score (early): 0.7562
    
    	Precision score (classic): 0.5589
    	Precision score (range): 0.4592
    
