# Filtering suitable datasets from OpenML

In [1]:
import openml
from sklearn.datasets import fetch_openml

import warnings
# NOTE(review): from this point on every Python warning (all categories, all modules) is
# suppressed, presumably to keep the cell outputs free of library warnings. This also hides
# deprecation notices, so consider narrowing the filter if a warning ever needs to be seen.
warnings.filterwarnings('ignore')

import sys
import os
# Put the parent of the current working directory first on the import path.
# The `src` package imported below is presumably located there (i.e. the notebook is
# expected to be run from a sub-directory of the project) - TODO confirm.
sys.path.insert(0, os.path.abspath(os.pardir))

from src.logger import log
from src.visualize import table
from src.constants import *

# Fetch the OpenML dataset listing as a DataFrame (one row per dataset); every
# later cell narrows this `datasets` frame down step by step.
datasets = openml.datasets.list_datasets(
    output_format='dataframe',
)
log(f"Found {datasets.shape[0]} datasets on OpenML")

[32;20m[2025-06-19 10:15:53][37;20m | [0mFound 6270 datasets on OpenML


## Automatically Filter Datasets
The following filtering steps were applied to OpenML datasets:

1. **Initial Constraints**  
    - Status marked as **active**
    - Contain **no missing values**
    - Contain **no symbolic (categorical) features**

2. **Feature and Sample Bounds**  
    - Number of **numeric features** within a predefined range: `MIN_NUM_FEATURES` to `MAX_NUM_FEATURES`
    - Number of **instances (samples)** within a predefined range: `MIN_SAMPLES` to `MAX_SAMPLES`

3. **Deduplication**  
    - Sorting by dataset name (ascending) and version (descending)
    - Removing duplicate names while keeping the latest version

4. **Exclusion of Sparse Datasets**  
    - Removing datasets that `fetch_openml` could not load as a DataFrame; these are treated as sparse datasets that could not be parsed

In [2]:
# Step 1 - initial constraints: active status, no missing values, no symbolic features.
datasets = datasets[
    (datasets['NumberOfMissingValues'] == 0) &
    (datasets['NumberOfSymbolicFeatures'] == 0) &
    (datasets['status'] == 'active')
]
log(f"{len(datasets)} active datasets with numeric features, no missing values, and no symbolic features")

# Step 2 - size bounds (inclusive on both ends) on numeric feature count and instance count.
datasets = datasets[
    (datasets['NumberOfNumericFeatures'] >= MIN_NUM_FEATURES) &
    (datasets['NumberOfNumericFeatures'] <= MAX_NUM_FEATURES) &
    (datasets['NumberOfInstances'] >= MIN_SAMPLES) &
    (datasets['NumberOfInstances'] <= MAX_SAMPLES)
]
log(f"{len(datasets)} datasets with {MIN_NUM_FEATURES}-{MAX_NUM_FEATURES} numerical features and {MIN_SAMPLES}-{MAX_SAMPLES} instances")

# Step 3 - deduplication: within each name the highest version sorts first, so
# keep='first' retains the latest version of every dataset name.
datasets = datasets.sort_values(by=['name', 'version'], ascending=[True, False]).drop_duplicates(subset='name', keep='first')
log(f"{len(datasets)} datasets after removing duplicates")

# Step 4 - drop datasets that cannot be loaded as a DataFrame. Every dataset is
# fetched once purely as a load check; the loaded data itself is discarded.
sparse = []
for did in datasets['did']:
    try:
        fetch_openml(data_id=did, as_frame=True)
    except Exception as e:
        # Deliberately best-effort: any failure excludes the dataset. The reason is logged
        # so that a transient error (e.g. a network problem) is not silently mistaken for
        # an unparseable sparse dataset.
        log(f"Excluding dataset {did}: {type(e).__name__}: {e}")
        sparse.append(did)
datasets = datasets[~datasets['did'].isin(sparse)]
log(f"{len(datasets)} datasets after removing sparse datasets")

# Show the remaining candidates, restricted to the DATASET_KEYS columns.
table([[row[k] for k in DATASET_KEYS] for _, row in datasets.iterrows()], DATASET_KEYS)

[32;20m[2025-06-19 10:15:53][37;20m | [0m1489 active datasets with numeric features, no missing values, and no symbolic features
[32;20m[2025-06-19 10:15:53][37;20m | [0m89 datasets with 8-25 numerical features and 500-1500 instances
[32;20m[2025-06-19 10:15:53][37;20m | [0m72 datasets after removing duplicates
[32;20m[2025-06-19 10:15:54][37;20m | [0m70 datasets after removing sparse datasets


|   did | name                                                 |   NumberOfInstances |   NumberOfFeatures |
|-------|------------------------------------------------------|---------------------|--------------------|
| 43465 | Boston-house-price-data                              |                 506 |                 14 |
| 43448 | Calculate-Concrete-Strength                          |                1030 |                  9 |
|  4353 | Concrete_Data                                        |                1030 |                  9 |
| 43701 | Contraceptive-Method-Choice                          |                1472 |                 10 |
| 46416 | Creditability-German-Credit-Data                     |                1000 |                 21 |
| 43384 | Diabetes-Data-Set                                    |                 768 |                  9 |
| 46254 | Diabetes_Dataset                                     |                 768 |                  9 |
| 43563 | Digital-currency---Time-series                       |                1000 |                 10 |
| 43540 | Emissions-by-Cars                                    |                 679 |                 13 |
| 43338 | Energy-Efficiency-Dataset                            |                 768 |                 10 |
| 43440 | Forest-Fire-Area                                     |                 517 |                 13 |
| 43807 | Forest-Fires-Data-Set-Portugal                       |                 517 |                 13 |
| 46815 | G20                                                  |                 627 |                 10 |
| 43808 | German-Credit-Data                                   |                1000 |                 21 |
| 46356 | GermanCreditData                                     |                1000 |                 22 |
| 43682 | Heart-Disease-Dataset-(Comprehensive)                |                1190 |                 12 |
| 43588 | Household-monthly-electricity-bill                   |                1000 |                 10 |
| 43403 | Indian-Liver-Patient-Patient-Records-KFolds-5folds   |                 583 |                 12 |
| 44203 | Intersectional-Bias-Assessment-(Testing-Data)        |                1000 |                 19 |
| 43582 | Pima-Indians-Diabetes                                |                 768 |                  9 |
| 43483 | Pima-Indians-Diabetes-Dataset                        |                 768 |                  9 |
| 43699 | QSAR-Bioconcentration-Classes-Data-Set               |                 779 |                 13 |
| 43581 | QSAR-Bioconcentration-classes-dataset                |                 779 |                 14 |
| 46585 | QSAR_Bioconcentration_classification                 |                 779 |                 13 |
| 47001 | QSAR_Bioconcentration_regression                     |                 779 |                 13 |
|  1028 | SWD                                                  |                1000 |                 11 |
| 42438 | Titanic                                              |                 891 |                  8 |
| 42351 | UCI-student-performance-por                          |                 649 |                 33 |
| 42464 | Waterstress                                          |                1188 |                 23 |
| 44994 | cars                                                 |                 804 |                 18 |
| 45054 | cmc                                                  |                1473 |                 10 |
| 43000 | cnn-stock-pred-dji                                   |                 522 |                 21 |
| 46917 | concrete_compressive_strength                        |                1030 |                  9 |
| 46572 | database                                             |                 768 |                  9 |
| 46555 | dataset_credit-approval                              |                 690 |                 16 |
| 46532 | dataset_credit-g                                     |                1000 |                 21 |
| 41553 | enb                                                  |                 768 |                 10 |
| 44960 | energy_efficiency                                    |                 768 |                  9 |
| 44962 | forest_fires                                         |                 517 |                 13 |
| 46587 | forestfires                                          |                 517 |                 13 |
|   595 | fri_c0_1000_10                                       |                1000 |                 11 |
|   654 | fri_c0_500_10                                        |                 500 |                 11 |
|   593 | fri_c1_1000_10                                       |                1000 |                 11 |
|   641 | fri_c1_500_10                                        |                 500 |                 11 |
|   606 | fri_c2_1000_10                                       |                1000 |                 11 |
|   627 | fri_c2_500_10                                        |                 500 |                 11 |
|   608 | fri_c3_1000_10                                       |                1000 |                 11 |
|   646 | fri_c3_500_10                                        |                 500 |                 11 |
|   623 | fri_c4_1000_10                                       |                1000 |                 11 |
|   604 | fri_c4_500_10                                        |                 500 |                 11 |
| 46604 | glioma_grading_clinical_and_mutation_features        |                 839 |                 24 |
| 46168 | grace                                                |                1000 |                  8 |
| 42999 | hungarian-chickenpox                                 |                 522 |                 21 |
| 41943 | ilpd-numeric                                         |                 583 |                 11 |
| 46761 | mental_health_detection                              |                 540 |                 16 |
| 43127 | mytestdataset                                        |                 992 |                 16 |
|   547 | no2                                                  |                 500 |                  8 |
|   522 | pm10                                                 |                 500 |                  8 |
| 46295 | qsar_aquatic_toxicity                                |                 546 |                  9 |
|   666 | rmftsa_ladata                                        |                 508 |                 11 |
|   223 | stock                                                |                 950 |                 10 |
| 46589 | student-performance-uci                              |                 649 |                 31 |
| 44029 | student_performance_por                              |                 649 |                 31 |
| 46731 | temperature_emissions_environmental_trends_2000_2024 |                1000 |                 10 |
| 42437 | titanic                                              |                 891 |                  8 |
| 42441 | titanic_1                                            |                 891 |                  8 |
| 42444 | titanic_2                                            |                 891 |                  8 |
| 42445 | titanic_3                                            |                 891 |                  8 |
| 42367 | treasury                                             |                1049 |                 16 |
| 42369 | weather_izmir                                        |                1461 |                 10 |

## Manually Filter Datasets
The following manual filtering steps were applied to the automatically filtered datasets:

1. **Numeric features**
    - TDA requires pointwise distance, which only works for numeric data

2. **Continuous features**
    - Distances are meaningless for categorical features, even when they are encoded as numbers
    - Removing datasets with categorical features

3. **Deduplication**  
    - Removing datasets that duplicate another dataset under a different name, keeping one copy of each

4. **Friedman datasets**  
    - Datasets represent random functional relationships
    - Unsuitable due to the lack of topological structure

In [3]:
# Manually curated OpenML dataset ids ('did') to exclude, grouped by reason.
# Ids that are not present in `datasets` have no effect on the filtering below.

# Datasets that contain non-numeric features.
non_numeric = [43440, 43807, 44203, 43699, 43581, 46585, 47001, 42351,
44962, 46587, 46604, 43127, 46589, 44029, 43540, 46815, 43563, 46731]

# Datasets that contain categorical features (46356 was previously listed twice).
categorical = [43465, 43701, 46416, 43338, 43808, 43682, 43588, 43403,
1028, 44994, 45054, 46555, 46532, 41553, 41943, 46761, 43582, 46356,
42438, 42464, 44960, 46168, 547, 522, 46295, 42437, 42441, 42444, 42445]

# Copies of a dataset that is kept under another id.
duplicates = [
    43448, 46917, # Keep: 4353
    46254, 43483, 46572, # Keep: 43384
]

# Friedman datasets.
friedman = [595, 609, 654, 649, 593, 612, 641, 631, 606, 599, 627, 597, 608, 628, 646, 617, 623, 604]

# Apply the exclusions one after another, logging the remaining count after each step.
for excluded_ids, summary in (
    (non_numeric, "datasets after removing datasets with non-numeric features"),
    (categorical, "datasets after removing datasets with categorical features"),
    (duplicates, "datasets after removing duplicates"),
    (friedman, "datasets after removing Friedman datasets"),
):
    datasets = datasets[~datasets['did'].isin(excluded_ids)]
    log(f"{len(datasets)} {summary}")

# Show the final selection, restricted to the DATASET_KEYS columns.
table([[row[k] for k in DATASET_KEYS] for _, row in datasets.iterrows()], DATASET_KEYS)

[32;20m[2025-06-19 10:15:54][37;20m | [0m52 datasets after removing datasets with non-numeric features
[32;20m[2025-06-19 10:15:54][37;20m | [0m23 datasets after removing datasets with categorical features
[32;20m[2025-06-19 10:15:54][37;20m | [0m18 datasets after removing duplicates
[32;20m[2025-06-19 10:15:54][37;20m | [0m8 datasets after removing Friedman datasets


|   did | name                 |   NumberOfInstances |   NumberOfFeatures |
|-------|----------------------|---------------------|--------------------|
|  4353 | Concrete_Data        |                1030 |                  9 |
| 43384 | Diabetes-Data-Set    |                 768 |                  9 |
| 43000 | cnn-stock-pred-dji   |                 522 |                 21 |
| 42999 | hungarian-chickenpox |                 522 |                 21 |
|   666 | rmftsa_ladata        |                 508 |                 11 |
|   223 | stock                |                 950 |                 10 |
| 42367 | treasury             |                1049 |                 16 |
| 42369 | weather_izmir        |                1461 |                 10 |