1. Download files from here: https://drive.google.com/drive/folders/1LR-ftaIeV6_KJvVz8q-xbodA-oXtJuvV?usp=sharing
2. Place features.csv and metrics.csv in the following directory (relative to the project root): resources/tabzilla/raw
3. Run this notebook

In [1]:
from ms.metadataset.data_filter import TabzillaFilter
from ms.metadataset.data_formatter import TabzillaFormatter
from ms.metadataset.data_source import TabzillaSource
from ms.metadataset.model_type import ModelType
from ms.metadataset.target_builder import TargetDiffBuilder, TargetPerfBuilder, TargetRawBuilder

In [2]:
# Shared configuration used by the cells below.
# (The ModelType import lives in the import cell at the top of the notebook.)

# Metadataset source; passed as `md_source` to every target builder.
md_source = TabzillaSource()

# Metric column the targets are built from; passed as `metric_name` to every
# target builder.
metric_name = "F1__test"

# Forwarded as `to_rewrite` to each handle_features / handle_metrics call.
# NOTE(review): presumably forces regeneration of already existing output
# files — confirm against the `ms` package.
to_rewrite = True

The formatter processes the raw TabZilla files: it aggregates values across folds and formats the metrics.

Formatted files will be saved here: resources/tabzilla/formatted

In [3]:
# Configure the formatter: features and metrics both come from the "raw"
# folder and both use "mean" as their aggregation function.
formatter = TabzillaFormatter(
    features_folder="raw",
    metrics_folder="raw",
    test_mode=False,
    agg_func_features="mean",
    agg_func_metrics="mean",
    # pymfe "general" attributes handed to the formatter as `round_attrs`.
    round_attrs=[
        "f__pymfe.general.nr_inst",
        "f__pymfe.general.nr_attr",
        "f__pymfe.general.nr_bin",
        "f__pymfe.general.nr_cat",
        "f__pymfe.general.nr_num",
        "f__pymfe.general.nr_class",
    ],
    filter_families=None,
)

# Run the formatting step for features first, then for metrics.
formatted_features = formatter.handle_features(to_rewrite=to_rewrite)
formatted_metrics = formatter.handle_metrics(to_rewrite=to_rewrite)

# Report both shapes, one per line.
print(formatted_features.shape, formatted_metrics.shape, sep="\n")

File D:\python_projects\meta-select\resources\tabzilla\formatted\features.csv already exists. Skipping...
File D:\python_projects\meta-select\resources\tabzilla\formatted\metrics.csv already exists. Skipping...
(176, 1605)
(3246, 19)


In [4]:
# Candidate models mapped to their family (a ModelType member).
# Insertion order is preserved: baseline models, then GBDT, then neural nets.
# The filter cell derives its model list from list(models.keys()).
models = {
    # Baselines. Disabled: "SVM".
    **dict.fromkeys(
        ("DecisionTree", "KNN", "LinearModel", "RandomForest"),
        ModelType.baseline,
    ),
    # Gradient-boosted decision trees.
    **dict.fromkeys(
        ("CatBoost", "LightGBM", "XGBoost"),
        ModelType.gbdt,
    ),
    # Neural networks.
    # Disabled: "NODE", "SAINT", "TabTransformer", "TabPFNModel".
    **dict.fromkeys(
        (
            "DANet",
            "rtdl_FTTransformer",
            "rtdl_MLP",
            "rtdl_ResNet",
            "STG",
            "TabNet",
            "VIME",
        ),
        ModelType.nn,
    ),
}

The filter removes unsuitable features.

Filtered files will be saved here: resources/tabzilla/filtered

In [5]:
# Configure the filter: it reads features and metrics from the "formatted"
# folder and is restricted to the models declared in `models`.
md_filter = TabzillaFilter(
    features_folder="formatted",
    metrics_folder="formatted",
    nan_threshold=0.5,
    fill_func="mean",
    # Function names handed to the filter as `funcs_to_exclude`.
    funcs_to_exclude=[
        "count",
        "histogram",
        "iq_range",
        "median",
        "quantiles",
        "range",
    ],
    models_list=list(models.keys()),
    test_mode=False,
    value_threshold=1e6,
)

# Run the filtering step for features first, then for metrics.
filtered_features = md_filter.handle_features(to_rewrite=to_rewrite)
filtered_metrics = md_filter.handle_metrics(to_rewrite=to_rewrite)

# Report both shapes, one per line.
print(filtered_features.shape, filtered_metrics.shape, sep="\n")

File D:\python_projects\meta-select\resources\tabzilla\filtered\features.csv already exists. Skipping...
File D:\python_projects\meta-select\resources\tabzilla\filtered\metrics.csv already exists. Skipping...
(165, 220)
(1778, 19)


The target builder creates the target using a specific strategy (rank of absolute or relative performance, or the difference between the best-performing models).

Targets will be saved here: resources/tabzilla/target

In [6]:
# Keyword arguments common to every target builder: same source, same
# "filtered" input folders, same metric.
shared_builder_kwargs = dict(
    md_source=md_source,
    features_folder="filtered",
    metrics_folder="filtered",
    metric_name=metric_name,
)

# Settings shared by the two TargetPerfBuilder variants; they differ only in
# `perf_type`.
perf_builder_kwargs = dict(
    n_bins=2,
    strategy="quantile",
    test_mode=False,
)

raw_builder = TargetRawBuilder(**shared_builder_kwargs)

abs_perf_builder = TargetPerfBuilder(
    perf_type="abs",
    **shared_builder_kwargs,
    **perf_builder_kwargs,
)

rel_perf_builder = TargetPerfBuilder(
    perf_type="rel",
    **shared_builder_kwargs,
    **perf_builder_kwargs,
)

diff_builder = TargetDiffBuilder(
    classes=[member.value for member in ModelType],
    model_classes=models,
    test_mode=False,
    **shared_builder_kwargs,
)

# Only the raw target is built in this run; the other builders are configured
# above but their runs are currently disabled.
raw_target = raw_builder.handle_metrics(to_rewrite=to_rewrite)
# abs_target = abs_perf_builder.handle_metrics(to_rewrite=to_rewrite)
# rel_target = rel_perf_builder.handle_metrics(to_rewrite=to_rewrite)
# diff_target = diff_builder.handle_metrics(to_rewrite=to_rewrite)
print(raw_target.shape)
# print(abs_target.shape)
# print(rel_target.shape)
# print(diff_target.shape)

File D:\python_projects\meta-select\resources\tabzilla\target\metrics__raw.csv already exists. Skipping...
(127, 15)
