# Filtering suitable datasets from OpenML

In [1]:
# Third-party clients: `openml` for the dataset listing, scikit-learn's
# `fetch_openml` for downloading individual datasets.
import openml
from sklearn.datasets import fetch_openml

# Suppress every warning category for the rest of the kernel session.
import warnings
warnings.filterwarnings('ignore')

import sys
import os
# os.path.dirname('src') evaluates to '', so this prepends the absolute path of
# the parent of the current working directory to sys.path. It has to run before
# the `src.*` imports below (presumably the `src` package lives there — confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname('src'), '..')))

from src.logger import log
from src.visualize import table
# NOTE(review): wildcard import; names such as MIN_NUM_FEATURES, MAX_NUM_FEATURES,
# MIN_SAMPLES, MAX_SAMPLES and DATASET_KEYS used in later cells presumably come from here.
from src.constants import *

# Request the OpenML dataset listing as a pandas DataFrame and report its size.
datasets = openml.datasets.list_datasets(output_format='dataframe')
log(f'Found {len(datasets)} datasets on OpenML')

[32;20m[2025-09-29 18:26:02][37;20m | [0mFound 6285 datasets on OpenML


## Automatically Filter Datasets
The following filtering steps were applied to OpenML datasets:

1. **Initial Constraints**  
    - Status marked as **active**
    - Contain **no missing values**
    - Contain **no symbolic (categorical) features**

2. **Feature and Sample Bounds**  
    - Number of **numeric features** within a predefined range: `MIN_NUM_FEATURES` to `MAX_NUM_FEATURES`
    - Number of **instances (samples)** within a predefined range: `MIN_SAMPLES` to `MAX_SAMPLES`

3. **Deduplication**  
    - Sorting by dataset name (ascending) and version (descending)
    - Removing duplicate names while keeping the latest version

4. **Exclusion of Sparse Datasets**  
    - Removing datasets that `fetch_openml` fails to load (e.g. sparse datasets that could not be parsed)

In [2]:
# Step 1 - initial constraints: keep only active datasets that have no missing
# values and no symbolic (categorical) features.
datasets = datasets[
    (datasets['NumberOfMissingValues'] == 0) &
    (datasets['NumberOfSymbolicFeatures'] == 0) &
    (datasets['status'] == 'active')
]
log(f'{len(datasets)} active datasets with numeric features, no missing values, and no symbolic features')

# Step 2 - feature and sample bounds. `Series.between` is inclusive on both
# ends by default, matching the `>= MIN` / `<= MAX` limits.
datasets = datasets[
    datasets['NumberOfNumericFeatures'].between(MIN_NUM_FEATURES, MAX_NUM_FEATURES) &
    datasets['NumberOfInstances'].between(MIN_SAMPLES, MAX_SAMPLES)
]
log(f'{len(datasets)} datasets with {MIN_NUM_FEATURES}-{MAX_NUM_FEATURES} numerical features and {MIN_SAMPLES}-{MAX_SAMPLES} instances')

# Step 3 - deduplication: within each name the highest version sorts first,
# so keep='first' retains the latest version.
datasets = (
    datasets
    .sort_values(by=['name', 'version'], ascending=[True, False])
    .drop_duplicates(subset='name', keep='first')
)
log(f'{len(datasets)} datasets after removing duplicates')

# Step 4 - drop datasets that cannot be loaded as a DataFrame.
# This is a deliberate best-effort filter, so any exception excludes the
# dataset; the reason is logged so that transient failures (e.g. network
# errors) can be told apart from genuinely unparsable datasets.
unloadable = []
for did in datasets['did']:
    try:
        fetch_openml(data_id=did, as_frame=True)
    except Exception as e:
        unloadable.append(did)
        log(f'Skipping dataset {did}: {type(e).__name__}: {e}')
datasets = datasets[~datasets['did'].isin(unloadable)]
log(f'{len(datasets)} datasets after removing unloadable datasets')

# Show the surviving datasets ordered by OpenML id.
table([[row[k] for k in DATASET_KEYS] for _, row in datasets.sort_values(by='did').iterrows()], DATASET_KEYS)

[32;20m[2025-09-29 18:26:02][37;20m | [0m1499 active datasets with numeric features, no missing values, and no symbolic features
[32;20m[2025-09-29 18:26:02][37;20m | [0m177 datasets with 8-50 numerical features and 500-5000 instances
[32;20m[2025-09-29 18:26:02][37;20m | [0m152 datasets after removing duplicates
[32;20m[2025-09-29 18:26:10][37;20m | [0m148 datasets after removing unloadable datasets


|   did | name                                                                                                                         |   NumberOfInstances |   NumberOfFeatures |
|-------|------------------------------------------------------------------------------------------------------------------------------|---------------------|--------------------|
|   223 | stock                                                                                                                        |                 950 |                 10 |
|   504 | analcatdata_supreme                                                                                                          |                4052 |                  8 |
|   522 | pm10                                                                                                                         |                 500 |                  8 |
|   547 | no2                                                                                                                          |                 500 |                  8 |
|   581 | fri_c3_500_25                                                                                                                |                 500 |                 26 |
|   582 | fri_c1_500_25                                                                                                                |                 500 |                 26 |
|   584 | fri_c4_500_25                                                                                                                |                 500 |                 26 |
|   586 | fri_c3_1000_25                                                                                                               |                1000 |                 26 |
|   589 | fri_c2_1000_25                                                                                                               |                1000 |                 26 |
|   592 | fri_c4_1000_25                                                                                                               |                1000 |                 26 |
|   593 | fri_c1_1000_10                                                                                                               |                1000 |                 11 |
|   595 | fri_c0_1000_10                                                                                                               |                1000 |                 11 |
|   598 | fri_c0_1000_25                                                                                                               |                1000 |                 26 |
|   604 | fri_c4_500_10                                                                                                                |                 500 |                 11 |
|   606 | fri_c2_1000_10                                                                                                               |                1000 |                 11 |
|   608 | fri_c3_1000_10                                                                                                               |                1000 |                 11 |
|   620 | fri_c1_1000_25                                                                                                               |                1000 |                 26 |
|   623 | fri_c4_1000_10                                                                                                               |                1000 |                 11 |
|   627 | fri_c2_500_10                                                                                                                |                 500 |                 11 |
|   633 | fri_c0_500_25                                                                                                                |                 500 |                 26 |
|   641 | fri_c1_500_10                                                                                                                |                 500 |                 11 |
|   643 | fri_c2_500_25                                                                                                                |                 500 |                 26 |
|   646 | fri_c3_500_10                                                                                                                |                 500 |                 11 |
|   654 | fri_c0_500_10                                                                                                                |                 500 |                 11 |
|   666 | rmftsa_ladata                                                                                                                |                 508 |                 11 |
|  1028 | SWD                                                                                                                          |                1000 |                 11 |
|  4353 | Concrete_Data                                                                                                                |                1030 |                  9 |
|  4540 | ParkinsonSpeechDatasetwithMultipleTypesofSoundRecordings                                                                     |                1039 |                 29 |
|  4551 | WaveformDatabaseGenerator                                                                                                    |                5000 |                 22 |
| 23516 | debutanizer                                                                                                                  |                2394 |                  8 |
| 41491 | wq                                                                                                                           |                1060 |                 30 |
| 41553 | enb                                                                                                                          |                 768 |                 10 |
| 41943 | ilpd-numeric                                                                                                                 |                 583 |                 11 |
| 41944 | Sick_numeric                                                                                                                 |                3772 |                 30 |
| 42184 | Wine                                                                                                                         |                1599 |                 12 |
| 42188 | premier_league_with_tda                                                                                                      |                2565 |                 20 |
| 42197 | Premier_League_matches                                                                                                       |                2961 |                 17 |
| 42351 | UCI-student-performance-por                                                                                                  |                 649 |                 33 |
| 42367 | treasury                                                                                                                     |                1049 |                 16 |
| 42369 | weather_izmir                                                                                                                |                1461 |                 10 |
| 42437 | titanic                                                                                                                      |                 891 |                  8 |
| 42438 | Titanic                                                                                                                      |                 891 |                  8 |
| 42441 | titanic_1                                                                                                                    |                 891 |                  8 |
| 42444 | titanic_2                                                                                                                    |                 891 |                  8 |
| 42445 | titanic_3                                                                                                                    |                 891 |                  8 |
| 42464 | Waterstress                                                                                                                  |                1188 |                 23 |
| 42636 | Long                                                                                                                         |                4477 |                 20 |
| 42999 | hungarian-chickenpox                                                                                                         |                 522 |                 21 |
| 43000 | cnn-stock-pred-dji                                                                                                           |                 522 |                 21 |
| 43127 | mytestdataset                                                                                                                |                 992 |                 16 |
| 43222 | 22SafetyBehaviouDuringCOVID-19                                                                                               |                 515 |                 26 |
| 43252 | IEEE80211aa-GATS                                                                                                             |                3782 |                 34 |
| 43323 | CorporateCreditRating                                                                                                        |                2029 |                 31 |
| 43338 | Energy-Efficiency-Dataset                                                                                                    |                 768 |                 10 |
| 43344 | Corporate-Credit-Rating                                                                                                      |                2029 |                 31 |
| 43352 | PS4-Games                                                                                                                    |                1584 |                 10 |
| 43384 | Diabetes-Data-Set                                                                                                            |                 768 |                  9 |
| 43386 | Spotify---All-Time-Top-2000s-Mega-Dataset                                                                                    |                1994 |                 15 |
| 43402 | Stock-Market-NIFTY50-Index-Data                                                                                              |                3509 |                  9 |
| 43403 | Indian-Liver-Patient-Patient-Records-KFolds-5folds                                                                           |                 583 |                 12 |
| 43406 | Red-Wine-data-set                                                                                                            |                1599 |                 12 |
| 43437 | Gender-Recognition-by-Voice                                                                                                  |                3168 |                 21 |
| 43440 | Forest-Fire-Area                                                                                                             |                 517 |                 13 |
| 43442 | Is-this-a-good-customer                                                                                                      |                1723 |                 14 |
| 43448 | Calculate-Concrete-Strength                                                                                                  |                1030 |                  9 |
| 43465 | Boston-house-price-data                                                                                                      |                 506 |                 14 |
| 43483 | Pima-Indians-Diabetes-Dataset                                                                                                |                 768 |                  9 |
| 43540 | Emissions-by-Cars                                                                                                            |                 679 |                 13 |
| 43563 | Digital-currency---Time-series                                                                                               |                1000 |                 10 |
| 43581 | QSAR-Bioconcentration-classes-dataset                                                                                        |                 779 |                 14 |
| 43582 | Pima-Indians-Diabetes                                                                                                        |                 768 |                  9 |
| 43588 | Household-monthly-electricity-bill                                                                                           |                1000 |                 10 |
| 43623 | Boston-Weather-Data-Jan-2013---Apr-2018                                                                                      |                3749 |                 24 |
| 43682 | Heart-Disease-Dataset-(Comprehensive)                                                                                        |                1190 |                 12 |
| 43695 | Red-Wine-Quality                                                                                                             |                1599 |                 12 |
| 43696 | IBM-HR-Analytics-Employee-Attrition--Performance                                                                             |                1470 |                 35 |
| 43699 | QSAR-Bioconcentration-Classes-Data-Set                                                                                       |                 779 |                 13 |
| 43701 | Contraceptive-Method-Choice                                                                                                  |                1472 |                 10 |
| 43795 | The-2020-Pokemon-dataset                                                                                                     |                1013 |                 40 |
| 43807 | Forest-Fires-Data-Set-Portugal                                                                                               |                 517 |                 13 |
| 43808 | German-Credit-Data                                                                                                           |                1000 |                 21 |
| 43817 | English-Premier-League-Data-2009---2019                                                                                      |                3800 |                 22 |
| 43826 | Personal-Loan-Modeling                                                                                                       |                5000 |                 13 |
| 43894 | ibm-employee-attrition                                                                                                       |                1470 |                 35 |
| 44029 | student_performance_por                                                                                                      |                 649 |                 31 |
| 44203 | Intersectional-Bias-Assessment-(Testing-Data)                                                                                |                1000 |                 19 |
| 44212 | Reading_Hydro                                                                                                                |                1000 |                 27 |
| 44960 | energy_efficiency                                                                                                            |                 768 |                  9 |
| 44962 | forest_fires                                                                                                                 |                 517 |                 13 |
| 44971 | white_wine                                                                                                                   |                4898 |                 12 |
| 44972 | red_wine                                                                                                                     |                1599 |                 12 |
| 44994 | cars                                                                                                                         |                 804 |                 18 |
| 45033 | abalone                                                                                                                      |                4177 |                  8 |
| 45054 | cmc                                                                                                                          |                1473 |                 10 |
| 45536 | Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-9.0GHz(Urbinati)  |                2400 |                 31 |
| 45537 | Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-9.5GHz(Urbinati)  |                2400 |                 31 |
| 45538 | Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-10.0GHz(Urbinati) |                2400 |                 31 |
| 45539 | Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-10.5GHz(Urbinati) |                2400 |                 31 |
| 45540 | Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-11.0GHz(Urbinati) |                2400 |                 31 |
| 45929 | DATASETBANK                                                                                                                  |                2000 |                 12 |
| 45950 | Heart_Failure_Prediction                                                                                                     |                5000 |                 13 |
| 46131 | gbsg                                                                                                                         |                2232 |                  9 |
| 46142 | metabric                                                                                                                     |                1903 |                 11 |
| 46144 | support                                                                                                                      |                4000 |                 16 |
| 46168 | grace                                                                                                                        |                1000 |                  8 |
| 46254 | Diabetes_Dataset                                                                                                             |                 768 |                  9 |
| 46255 | Student_Performance_Dataset                                                                                                  |                2392 |                 15 |
| 46264 | mabbob_ela_as_2d_classify                                                                                                    |                1120 |                 46 |
| 46265 | mabbob_ela_as_2d_regression_DiagonalCMA                                                                                      |                1120 |                 46 |
| 46266 | mabbob_ela_as_2d_regression_DifferentialEvolution                                                                            |                1120 |                 46 |
| 46267 | mabbob_ela_as_2d_regression_modcma                                                                                           |                1120 |                 46 |
| 46268 | mabbob_ela_as_2d_regression_modde                                                                                            |                1120 |                 46 |
| 46269 | mabbob_ela_as_2d_regression_RCobyla                                                                                          |                1120 |                 46 |
| 46270 | mabbob_ela_as_5d_classify                                                                                                    |                1120 |                 46 |
| 46271 | mabbob_ela_as_5d_regression_DiagonalCMA                                                                                      |                1120 |                 46 |
| 46272 | mabbob_ela_as_5d_regression_DifferentialEvolution                                                                            |                1120 |                 46 |
| 46273 | mabbob_ela_as_5d_regression_modcma                                                                                           |                1120 |                 46 |
| 46274 | mabbob_ela_as_5d_regression_modde                                                                                            |                1120 |                 46 |
| 46275 | mabbob_ela_as_5d_regression_RCobyla                                                                                          |                1120 |                 46 |
| 46295 | qsar_aquatic_toxicity                                                                                                        |                 546 |                  9 |
| 46356 | GermanCreditData                                                                                                             |                1000 |                 22 |
| 46381 | APL_20_24                                                                                                                    |                3260 |                 27 |
| 46416 | Creditability-German-Credit-Data                                                                                             |                1000 |                 21 |
| 46502 | Credit_Approval_Classification                                                                                               |                1000 |                 51 |
| 46532 | dataset_credit-g                                                                                                             |                1000 |                 21 |
| 46551 | Corporate_Credit                                                                                                             |                2029 |                 31 |
| 46555 | dataset_credit-approval                                                                                                      |                 690 |                 16 |
| 46572 | database                                                                                                                     |                 768 |                  9 |
| 46585 | QSAR_Bioconcentration_classification                                                                                         |                 779 |                 13 |
| 46587 | forestfires                                                                                                                  |                 517 |                 13 |
| 46589 | student-performance-uci                                                                                                      |                 649 |                 31 |
| 46597 | Estimation_of_Obesity_Levels                                                                                                 |                2111 |                 17 |
| 46604 | glioma_grading_clinical_and_mutation_features                                                                                |                 839 |                 24 |
| 46607 | hepatitis_c_virus_hcv_for_egyptian_patients                                                                                  |                1385 |                 29 |
| 46618 | ECOLI70                                                                                                                      |                2000 |                 46 |
| 46620 | magic_niab                                                                                                                   |                2000 |                 44 |
| 46635 | Fetal_cardiotocography_dataset                                                                                               |                2126 |                 40 |
| 46731 | temperature_emissions_environmental_trends_2000_2024                                                                         |                1000 |                 10 |
| 46735 | sustainable_development_report_zero_hunger                                                                                   |                4140 |                 21 |
| 46762 | air-quality-and-pollution-assessment                                                                                         |                5000 |                 10 |
| 46764 | football-player-position                                                                                                     |                3611 |                 12 |
| 46815 | G20                                                                                                                          |                 627 |                 10 |
| 46883 | google_qa_answer_type_reason_explanation                                                                                     |                4863 |                 40 |
| 46884 | google_qa_question_type_reason_explanation                                                                                   |                4863 |                 40 |
| 46917 | concrete_compressive_strength                                                                                                |                1030 |                  9 |
| 47001 | QSAR_Bioconcentration_regression                                                                                             |                 779 |                 13 |
| 47012 | sample_parquet_zip                                                                                                           |                1000 |                 31 |
| 47038 | 001                                                                                                                          |                1716 |                 25 |

## Manually Filter Datasets
The following manual filtering steps were applied to the automatically filtered datasets:

1. **Numeric features**
    - TDA requires pointwise distance, which only works for numeric data

2. **Continuous features**
    - Distances are meaningless for categorical numeric features
    - Removing data with categorical features

3. **Deduplication**  
    - Removing duplicates while keeping the latest version

4. **Friedman datasets**  
    - Datasets represent random functional relationships
    - Unsuitable due to the lack of topological structure

In [3]:
def _drop_ids(frame, ids, reason):
    """Return `frame` without the rows whose `did` is in `ids`.

    Logs how many datasets remain, using `reason` to describe what was removed.
    """
    remaining = frame[~frame['did'].isin(ids)]
    log(f'{len(remaining)} datasets after removing {reason}')
    return remaining


# Datasets flagged by manual inspection as containing non-numeric features.
non_numeric = [42351, 43127, 43323, 43344, 43352, 43386, 43440, 43442, 43540, 43563, 43581, 43696, 43699, 
               43795, 43807, 43817, 43894, 44029, 44203, 44212, 44962, 46381, 46551, 46585, 46587, 46589,
               46597, 46604, 46635, 46731, 46735, 46815, 46883, 46884, 47001, 47012, 47038]
datasets = _drop_ids(datasets, non_numeric, 'datasets with non-numeric features')

# Datasets flagged by manual inspection as containing categorical features
# encoded as numbers.
# NOTE(review): 46761 is not among the automatically filtered datasets listed
# above, so it currently removes nothing - confirm it is not a typo (46762 is
# in that list and is kept).
categorical = [504, 522, 547, 1028, 4540, 41491, 41553, 41943, 41944, 42188, 42197, 42437, 42438, 42441,
               42444, 42445, 42464, 42636, 43222, 43252, 43338, 43403, 43465, 43582, 43588, 43682, 43701,
               43808, 43826, 44960, 44994, 45033, 45054, 45929, 45950, 46131, 46142, 46144, 46168, 46255,
               46295, 46356, 46416, 46502, 46532, 46555, 46607, 46618, 46761]
datasets = _drop_ids(datasets, categorical, 'datasets with categorical features')

# Same data published under several names; one representative is kept per group.
duplicates = [
    43448, 46917, # Keep: 4353 (Concrete Data)
    46254, 43483, 46572, # Keep: 43384 (Diabetes Dataset)
    43406, 42184, 44972, # Keep: 43695 (Red Wine Quality)
    45539, 45540, 45536, 45537 # Keep: 45538 (Microwaves Sensoring)
]
datasets = _drop_ids(datasets, duplicates, 'duplicates')

# NOTE(review): 45538 is the representative kept by `duplicates` above, but it
# is removed here, so no Microwaves dataset survives this cell.
synthetic_with_low_topology = [
    595, 598, 654, 633, 593, 620, 641, 582, 606, 589, 627, 643, 608, 586, 646, 581, 623, 592, 604, 584, # Friedman
    46264, 46265, 46266, 46269, 46267, 46268, 46270, 46271, 46272, 46275, 46273, 46274, # MA-BBOB
    4551, 45538, 46620 # WaveformDatabaseGenerator, Microwaves Sensing 10.0GHz, magic_niab
]
# Backward-compatible alias for the original (misspelled) variable name.
synthetic_with_low_lopology = synthetic_with_low_topology
datasets = _drop_ids(datasets, synthetic_with_low_topology, 'synthetic datasets with low topology')

# Show the final selection ordered by OpenML id.
table([[row[k] for k in DATASET_KEYS] for _, row in datasets.sort_values(by='did').iterrows()], DATASET_KEYS)

[32;20m[2025-09-29 18:26:10][37;20m | [0m111 datasets after removing datasets with non-numeric features
[32;20m[2025-09-29 18:26:10][37;20m | [0m63 datasets after removing datasets with categorical features
[32;20m[2025-09-29 18:26:10][37;20m | [0m51 datasets after removing duplicates
[32;20m[2025-09-29 18:26:10][37;20m | [0m16 datasets after removing synthetic datasets with low topology


|   did | name                                    |   NumberOfInstances |   NumberOfFeatures |
|-------|-----------------------------------------|---------------------|--------------------|
|   223 | stock                                   |                 950 |                 10 |
|   666 | rmftsa_ladata                           |                 508 |                 11 |
|  4353 | Concrete_Data                           |                1030 |                  9 |
| 23516 | debutanizer                             |                2394 |                  8 |
| 42367 | treasury                                |                1049 |                 16 |
| 42369 | weather_izmir                           |                1461 |                 10 |
| 42999 | hungarian-chickenpox                    |                 522 |                 21 |
| 43000 | cnn-stock-pred-dji                      |                 522 |                 21 |
| 43384 | Diabetes-Data-Set                       |                 768 |                  9 |
| 43402 | Stock-Market-NIFTY50-Index-Data         |                3509 |                  9 |
| 43437 | Gender-Recognition-by-Voice             |                3168 |                 21 |
| 43623 | Boston-Weather-Data-Jan-2013---Apr-2018 |                3749 |                 24 |
| 43695 | Red-Wine-Quality                        |                1599 |                 12 |
| 44971 | white_wine                              |                4898 |                 12 |
| 46762 | air-quality-and-pollution-assessment    |                5000 |                 10 |
| 46764 | football-player-position                |                3611 |                 12 |