In [11]:
# !pip install wqdab > package.txt

In [40]:
import json
import numpy as np
import optuna

from imblearn.ensemble import RUSBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, cross_val_predict

from wqdab.data import load_single_dataset_2017, load_single_dataset_2018
from wqdab.metrics import compute_metrics
from wqdab.utils import preprocess_data

In [None]:
df_train, df_test1 = load_single_dataset_2017()
df_test2, df_test3 = load_single_dataset_2018()

In [6]:
X_train, X_train_orig, y_train, means, stds = preprocess_data(df_train)
X_test1, X_test1_orig, y_test1, _, _ = preprocess_data(df_test1, means=means, stds=stds)
X_test2, X_test2_orig, y_test2, _, _ = preprocess_data(df_test2, means=means, stds=stds)
X_test3, X_test3_orig, y_test3, _, _ = preprocess_data(df_test3, means=means, stds=stds)

In [7]:
def create_sliding_window(dataset, target, window_size, stride=1):
    """
    Transform a tabular dataset into sliding windows for time series analysis.

    Args:
        dataset (np.ndarray or pd.DataFrame): Input features of shape (n_samples, n_features).
        target (np.ndarray or pd.Series): Target variable of shape (n_samples,).
        window_size (int): The size of the sliding window.
        stride (int): The step size for sliding the window.

    Returns:
        X_windows (np.ndarray): Features reshaped with sliding windows.
        y_windows (np.ndarray): Targets corresponding to each window.
    """
    X, y = [], []
    n_samples = len(dataset)

    for start in range(0, n_samples - window_size + 1, stride):
        end = start + window_size
        X.append(dataset[start:end])  # Collect the window
        y.append(target[end - 1])    # Use the target at the last time step of the window

    return np.array(X), np.array(y)

In [8]:
window_size = 30
stride = 1

X_train_ts, y_train_ts = create_sliding_window(X_train, y_train, window_size, stride)
X_test1_ts, y_test1_ts = create_sliding_window(X_test1, y_test1, window_size, stride)
X_test2_ts, y_test2_ts = create_sliding_window(X_test2, y_test2, window_size, stride)
X_test3_ts, y_test3_ts = create_sliding_window(X_test3, y_test3, window_size, stride)

In [32]:
def compute_window_features(X_windows):
    """
    Compute features for each sliding window.

    Args:
        X_windows (np.ndarray): Sliding window dataset of shape (n_windows, window_size, n_features).

    Returns:
        X_processed (np.ndarray): Processed dataset with shape (n_windows, 2 * n_features),
                                  where each row contains the mean and std dev of each feature.
    """
    # Compute statistics along the window dimension (axis=1)
    mean = np.mean(X_windows, axis=1)
    std = np.std(X_windows, axis=1)
    xmax = np.max(X_windows, axis=1)
    xmin = np.min(X_windows, axis=1)
    last = X_windows[:,-1,:]
    first = X_windows[:,0,:]

    # Compute features and concatenate them to form a single feature vector per window
    X_processed = np.concatenate([
        mean,
        std,
        last - mean,
        last - xmax,
        last - xmin,
        last - first,
        last - mean
    ], axis=1)
    return X_processed

In [10]:
X_train_f = compute_window_features(X_train_ts)
X_test1_f = compute_window_features(X_test1_ts)
X_test2_f = compute_window_features(X_test2_ts)
X_test3_f = compute_window_features(X_test3_ts)

In [42]:
def objective(trial, X, y):
    # Define hyperparameter search space
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "sampling_strategy": trial.suggest_float("sampling_strategy", 0.1, 1.0),
    }

    # Cross-validate RUSBoost model
    decision_tree = DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 1, 10))
    model = RUSBoostClassifier(estimator=decision_tree, **param)
    y_pred = cross_val_predict(model, X, y, cv=KFold())

    # Use F1-score or AUC as the optimization metric
    return f1_score(y, y_pred)

In [None]:
# X_train_full = np.vstack((X_train_f))
# y_train_full = np.hstack((y_train_ts))

# study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

# with open('params_temporal_1.json', 'w') as fp:
#     json.dump(study.best_trial.params, fp)
# with open('params_temporal_1.json', 'r') as fp:
#     best_trial_params = json.load(fp)
# print(best_trial_params)

# max_depth = best_trial_params['max_depth']
# params = {
#     'learning_rate': best_trial_params['learning_rate'],
#     'n_estimators': best_trial_params['n_estimators'],
#     'sampling_strategy': best_trial_params['sampling_strategy']
# }

# decision_tree = DecisionTreeClassifier(max_depth=max_depth)
# model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
# y_pred_test1 = model.predict(X_test1_f)
# y_pred_test2 = model.predict(X_test2_f)
# y_pred_test3 = model.predict(X_test3_f)

# compute_metrics(y_test1_ts, y_pred_test1)
# compute_metrics(y_test2_ts, y_pred_test2)
# compute_metrics(y_test3_ts, y_pred_test3)


    Metrics:
    	F1 score (classic): 0.6345
    	F1 score (optimistic): 0.3963
    	F1 score (early): 0.3914
    
    	Recall score (classic): 0.7996
    	Recall score (optimistic): 0.8718
    	Recall score (early): 0.8259
    
    	Precision score (classic): 0.5259
    	Precision score (range): 0.2564
    

    Metrics:
    	F1 score (classic): 0.7139
    	F1 score (optimistic): 0.6481
    	F1 score (early): 0.5839
    
    	Recall score (classic): 0.6367
    	Recall score (optimistic): 0.8039
    	Recall score (early): 0.6315
    
    	Precision score (classic): 0.8123
    	Precision score (range): 0.5429
    

    Metrics:
    	F1 score (classic): 0.4773
    	F1 score (optimistic): 0.4553
    	F1 score (early): 0.3239
    
    	Recall score (classic): 0.3727
    	Recall score (optimistic): 0.4783
    	Recall score (early): 0.2582
    
    	Precision score (classic): 0.6636
    	Precision score (range): 0.4344
    


In [37]:
# X_train_full = np.vstack((X_train_f, X_test1_f))
# y_train_full = np.hstack((y_train_ts, y_test1_ts))

# study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

# with open('params_temporal_2.json', 'w') as fp:
#     json.dump(study.best_trial.params, fp)
# with open('params_temporal_2.json', 'r') as fp:
#     best_trial_params = json.load(fp)
# print(best_trial_params)

# max_depth = best_trial_params['max_depth']
# params = {
#     'learning_rate': best_trial_params['learning_rate'],
#     'n_estimators': best_trial_params['n_estimators'],
#     'sampling_strategy': best_trial_params['sampling_strategy']
# }

# decision_tree = DecisionTreeClassifier(max_depth=max_depth)
# model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
# y_pred_test2 = model.predict(X_test2_f)
# y_pred_test3 = model.predict(X_test3_f)

# compute_metrics(y_test2_ts, y_pred_test2)
# compute_metrics(y_test3_ts, y_pred_test3)


    Metrics:
    	F1 score (classic): 0.4938
    	F1 score (optimistic): 0.5481
    	F1 score (early): 0.4534
    
    	Recall score (classic): 0.5290
    	Recall score (optimistic): 0.8039
    	Recall score (early): 0.4985
    
    	Precision score (classic): 0.4630
    	Precision score (range): 0.4157
    

    Metrics:
    	F1 score (classic): 0.4949
    	F1 score (optimistic): 0.5041
    	F1 score (early): 0.4032
    
    	Recall score (classic): 0.3877
    	Recall score (optimistic): 0.4783
    	Recall score (early): 0.3242
    
    	Precision score (classic): 0.6841
    	Precision score (range): 0.5330
    


In [None]:
# X_train_full = np.vstack((X_train_f, X_test1_f, X_test2_f))
# y_train_full = np.hstack((y_train_ts, y_test1_ts, y_test2_ts))

# study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

# with open('params_temporal_3.json', 'w') as fp:
#     json.dump(study.best_trial.params, fp)
# with open('params_temporal_3.json', 'r') as fp:
#     best_trial_params = json.load(fp)
# print(best_trial_params)

# max_depth = best_trial_params['max_depth']
# params = {
#     'learning_rate': best_trial_params['learning_rate'],
#     'n_estimators': best_trial_params['n_estimators'],
#     'sampling_strategy': best_trial_params['sampling_strategy']
# }

# decision_tree = DecisionTreeClassifier(max_depth=max_depth)
# model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
# y_pred_test3 = model.predict(X_test3_f)

# compute_metrics(y_test3_ts, y_pred_test3)

[I 2024-12-16 13:11:33,608] A new study created in memory with name: no-name-4f4892c0-ebec-4c4b-a7b9-6cd7e5dc552e


In [39]:
# X_train_full = np.vstack((X_test1_f, X_test2_f))
# y_train_full = np.hstack((y_test1_ts, y_test2_ts))

# study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

# with open('params_temporal_4.json', 'w') as fp:
#     json.dump(study.best_trial.params, fp)
# with open('params_temporal_4.json', 'r') as fp:
#     best_trial_params = json.load(fp)
# print(best_trial_params)

# max_depth = best_trial_params['max_depth']
# params = {
#     'learning_rate': best_trial_params['learning_rate'],
#     'n_estimators': best_trial_params['n_estimators'],
#     'sampling_strategy': best_trial_params['sampling_strategy']
# }

# decision_tree = DecisionTreeClassifier(max_depth=max_depth)
# model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
# y_pred_test3 = model.predict(X_test3_f)

# compute_metrics(y_test3_ts, y_pred_test3)


    Metrics:
    	F1 score (classic): 0.5994
    	F1 score (optimistic): 0.4390
    	F1 score (early): 0.4091
    
    	Recall score (classic): 0.7278
    	Recall score (optimistic): 0.8696
    	Recall score (early): 0.6744
    
    	Precision score (classic): 0.5095
    	Precision score (range): 0.2936
    


In [42]:
X_train_full = np.vstack((X_test2_f))
y_train_full = np.hstack((y_test2_ts))

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train_full, y_train_full), n_trials=30)

with open('params_temporal_5.json', 'w') as fp:
    json.dump(study.best_trial.params, fp)
with open('params_temporal_5.json', 'r') as fp:
    best_trial_params = json.load(fp)
print(best_trial_params)

max_depth = best_trial_params['max_depth']
params = {
    'learning_rate': best_trial_params['learning_rate'],
    'n_estimators': best_trial_params['n_estimators'],
    'sampling_strategy': best_trial_params['sampling_strategy']
}

decision_tree = DecisionTreeClassifier(max_depth=max_depth)
model = RUSBoostClassifier(estimator=decision_tree, **params).fit(X_train_full, y_train_full)
y_pred_test3 = model.predict(X_test3_f)

compute_metrics(y_test3_ts, y_pred_test3)


    Metrics:
    	F1 score (classic): 0.5526
    	F1 score (optimistic): 0.5539
    	F1 score (early): 0.4534
    
    	Recall score (classic): 0.6608
    	Recall score (optimistic): 0.8478
    	Recall score (early): 0.5051
    
    	Precision score (classic): 0.4749
    	Precision score (range): 0.4113
    
