In [None]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import os
import sys
import pathlib
from pathlib import Path

# Working directory of the notebook, and the TSER data folder that lives
# two directory levels above it under Data/TSER.
current_dir = Path.cwd()
TSER_data_dir = current_dir.parent.parent.joinpath("Data", "TSER")
print("current_dir", current_dir)
print("TSER_data_dir", TSER_data_dir)

import numpy as np
import aeon
import torch
from torch import Tensor
import torch.nn as nn
import torch.functional as F
import pandas as pd
from aeon.datasets.tser_datasets import tser_soton; tser_soton = sorted(list(tser_soton))
from aeon.datasets import load_regression
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import RidgeCV
from tqdm import tqdm
np.set_printoptions(precision=3, threshold=5) # Print options: 3 digits of float precision; arrays with more than 5 elements are summarized

current_dir c:\Users\nz423\Code\exploring-hydra-boosting
TSER_data_dir c:\Users\nz423\Data\TSER


# Regressor

In [None]:
import numpy as np
import pandas as pd
from aeon.benchmarking.results_loaders import get_estimator_results
from aeon.datasets import load_regression
from aeon.regression import DummyRegressor
from aeon.visualisation import plot_critical_difference
from sklearn.metrics import mean_squared_error
from tsml.datasets import load_minimal_gas_prices
from aeon.datasets.tser_datasets import tser_soton; tser_soton = sorted(list(tser_soton))

from tsml_eval.evaluation.storage import load_regressor_results
from tsml_eval.experiments import (
    experiments,
    get_regressor_by_name,
    run_regression_experiment,
)

# print("TSER", "len", len(tser_soton), tser_soton)



In [None]:
# available regressors


In [None]:
from load_datasets import get_aeon_dataset
dataset_name = "AppliancesEnergy"
# TODO bug otherwise with other built in methods: cast every split to float64
X_train, y_train, X_test, y_test = (
    split.astype(np.float64)
    for split in get_aeon_dataset(dataset_name, TSER_data_dir, "regression")
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Mean of the training targets, shown as the cell output.
y_train.mean()

In [None]:
# get_regressor_by_name looks a regressor up by its string name; any aeon,
# tsml or sklearn regressor can be passed to the experiment function instead.
regressor = get_regressor_by_name("DummyRegressor")

# Record memory usage every 0.1 seconds. This is only set for notebook speed
# and does not need to be changed for normal usage.
experiments.MEMRECORD_INTERVAL = 0.1

# Run the experiment; its output is read back from the results folder below.
run_regression_experiment(
    X_train, y_train, X_test, y_test,
    regressor,
    "results/",
    dataset_name=dataset_name,
    resample_id=0,
)

# Load the test predictions for resample 0 and print the summary metrics.
predictions_csv = current_dir.joinpath(
    "results", "DummyRegressor", "Predictions", dataset_name, "testResample0.csv"
)
rr = load_regressor_results(predictions_csv)
print(rr.predictions)
for metric_name, short_label in (
    ("mean_squared_error", "mse"),
    ("root_mean_squared_error", "rmse"),
    ("mean_absolute_percentage_error", "mape"),
    ("r2_score", "r2"),
):
    print(getattr(rr, metric_name), short_label)

In [None]:
# get_regressor_by_name looks a regressor up by its string name; any aeon,
# tsml or sklearn regressor can be passed to the experiment function instead.
regressor = get_regressor_by_name("multirockethydra")

# Record memory usage every 0.1 seconds. This is only set for notebook speed
# and does not need to be changed for normal usage.
experiments.MEMRECORD_INTERVAL = 0.1

# Run the experiment; its output is read back from the results folder below.
run_regression_experiment(
    X_train, y_train, X_test, y_test,
    regressor,
    "results/",
    dataset_name=dataset_name,
    resample_id=0,
)

# Load the test predictions for resample 0 and print the summary metrics.
predictions_csv = current_dir.joinpath(
    "results", "MultiRocketHydraRegressor", "Predictions", dataset_name, "testResample0.csv"
)
rr = load_regressor_results(predictions_csv)
print(rr.predictions)
metric_labels = {
    "mean_squared_error": "mse",
    "root_mean_squared_error": "rmse",
    "mean_absolute_percentage_error": "mape",
    "r2_score": "r2",
}
for metric_name, short_label in metric_labels.items():
    print(getattr(rr, metric_name), short_label)

## Benchmark against other regressors

In [None]:
# Datasets used in the comparison; commented-out entries are currently excluded.
datasets = [
    "CardanoSentiment",
    "Covid3Month",
    "FloodModeling1",
    "FloodModeling2",
    "NaturalGasPricesSentiment",
    # "MethaneMonitoringHomeActivity",
    "HouseholdPowerConsumption1",
    # "AustraliaRainfall",
]

# Reference estimators whose stored RMSE results are loaded from the local
# bench_regression_TSER folder ("MultiRocket" is currently left out).
estimators = [
    "InceptionT",
    "FreshPRINCE",
    "DrCIF",
    "Rocket",
    "RandF",
    "RotF",
    "XGBoost",
    "Ridge",
    # "MultiRocket",
]
benchmarks = get_estimator_results(
    datasets=datasets,
    estimators=estimators,
    task="regression",
    measure="rmse",
    path=current_dir / "data" / "bench_regression_TSER",
)

def add_regressor_to_benchmarks(
    model_name: str,
    regressor,
    benchmarks: Dict,
    dataset_names: Optional[List[str]] = None,
    ) -> Dict[str, float]:
    """Fit ``regressor`` on each dataset and store its test RMSE in ``benchmarks``.

    For every dataset name the train and test splits are loaded with
    ``load_regression``, the regressor is (re)fitted on the train split and
    the RMSE of its predictions on the test split is recorded.

    Args:
        model_name: Key under which the results are stored in ``benchmarks``.
        regressor: Object exposing ``fit(X, y)`` and ``predict(X)``. The same
            instance is refitted for every dataset, so it is mutated.
        benchmarks: Mapping of model name -> {dataset name -> score}; updated
            in place with the new model's results.
        dataset_names: Datasets to evaluate on. When ``None`` (the default)
            the notebook-level ``datasets`` list is used, which keeps the
            original three-argument calls working unchanged.

    Returns:
        The ``{dataset name: rmse}`` dict that was stored under ``model_name``.
    """
    # Fall back to the notebook-global list only when no explicit selection is
    # given, so the function no longer depends on hidden state by necessity.
    names = datasets if dataset_names is None else dataset_names

    results: Dict[str, float] = {}
    for name in names:
        train_X, train_y = load_regression(name, split="train")
        test_X, test_y = load_regression(name, split="test")
        regressor.fit(train_X, train_y)
        y_pred = regressor.predict(test_X)
        results[name] = root_mean_squared_error(test_y, y_pred)
    benchmarks[model_name] = results
    return results

# Display the reference results loaded by get_estimator_results above.
benchmarks

In [None]:
# Evaluate a DummyRegressor baseline and store its per-dataset RMSE under "Dummy".
add_regressor_to_benchmarks("Dummy", DummyRegressor(), benchmarks) 

In [None]:
# Evaluate the regressor looked up as "multirockethydra" and store its per-dataset RMSE.
add_regressor_to_benchmarks("multirockethydra", get_regressor_by_name("multirockethydra"), benchmarks) 

In [None]:

# Tabulate the benchmark dict: outer keys (model names) become the columns.
table = pd.DataFrame.from_dict(benchmarks)
table

In [None]:
# Critical-difference diagram over all models in the table (lower RMSE is better).
# NOTE: `plt` here is the first object returned by plot_critical_difference,
# not the matplotlib.pyplot module.
score_matrix = np.array(table)
estimator_labels = table.columns.tolist()
plt, _ = plot_critical_difference(score_matrix, estimator_labels, lower_better=True)
plt.show()

# Make my own wrapper around the tsml run-experiment function

In [None]:
# get_regressor_by_name can be used to find various regressors by string, but
# any aeon, tsml or sklearn regressor can be used in the experiments function
regressor = get_regressor_by_name("DummyRegressor")

# The rest of this cell is a disabled copy of the experiment cell above,
# presumably kept as a template for the planned wrapper.
# NOTE(review): the results path below points at "MultiRocketHydraRegressor"
# while `regressor` above is the DummyRegressor; align them before re-enabling.

# # record memory usage every 0.1 seconds, just here for notebook speed
# # does not need to be changed for usage
# experiments.MEMRECORD_INTERVAL = 0.1

# run_regression_experiment(
#     X_train,
#     y_train,
#     X_test,
#     y_test,
#     regressor,
#     "results/",
#     dataset_name=dataset_name,
#     resample_id=0,
# )

# rr = load_regressor_results(
#     current_dir / "results" / "MultiRocketHydraRegressor" / "Predictions" / dataset_name / "testResample0.csv"
# )
# print(rr.predictions)
# print(rr.mean_squared_error, "mse")
# print(rr.root_mean_squared_error, "rmse")
# print(rr.mean_absolute_percentage_error, "mape")
# print(rr.r2_score, "r2")

# Classifier

In [None]:
# TODO: what's the proper way to load the TSC results? Is it the same library?

# Make my own class



Features:
* For now, only do wide (not deep) representation boosting
* Percentage of the dataset to be used at each boosting iteration
* Batching to calculate random features
* Do I need batching for least squares?
* What solver should be used for least squares?
* How to initialize Phi_0? Should I use a different n_features for the initial guess? Use all data for the initial fit?



It would be interesting to see the difference between boosting in label space and representation boosting.