In [None]:
'''
From the OpenML benchmarking documentation (https://docs.openml.org/benchmark/):

The OpenML100 was a predecessor of the OpenML-CC18, consisting of 100 classification datasets.
We recommend that you use the OpenML-CC18 instead,
because the OpenML100 suffers from some teething issues in the design of benchmark suites.

Following that recommendation, this notebook uses the OpenML-CC18 suite.
'''

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import openml
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Silence every warning issued through the `warnings` module from here on.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# the libraries imported above that may be worth seeing — consider narrowing it.
warnings.filterwarnings('ignore')


In [2]:
# Look up the OpenML-CC18 benchmark suite once. The original cell issued this
# identical call twice in a row; the second call only repeated the lookup.
benchmark_suite = openml.study.get_suite('OpenML-CC18')

# Overview: print the number of rows and feature columns of every dataset
# whose ID is listed in the suite.
for dataset_id in benchmark_suite.data:
    dataset = openml.datasets.get_dataset(dataset_id)
    # get_data() separates the default target column (y) from the features (X).
    # Only X.shape is used here; the other return values are unpacked so the
    # call reads the same as in the evaluation cell further down.
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    print(f"Dataset ID: {dataset_id}, Number of Instances: {X.shape[0]}, Number of Features: {X.shape[1]}")



Dataset ID: 3, Number of Instances: 3196, Number of Features: 36
Dataset ID: 6, Number of Instances: 20000, Number of Features: 16
Dataset ID: 11, Number of Instances: 625, Number of Features: 4
Dataset ID: 12, Number of Instances: 2000, Number of Features: 216
Dataset ID: 14, Number of Instances: 2000, Number of Features: 76
Dataset ID: 15, Number of Instances: 699, Number of Features: 9
Dataset ID: 16, Number of Instances: 2000, Number of Features: 64
Dataset ID: 18, Number of Instances: 2000, Number of Features: 6
Dataset ID: 22, Number of Instances: 2000, Number of Features: 47
Dataset ID: 23, Number of Instances: 1473, Number of Features: 9
Dataset ID: 28, Number of Instances: 5620, Number of Features: 64
Dataset ID: 29, Number of Instances: 690, Number of Features: 15
Dataset ID: 31, Number of Instances: 1000, Number of Features: 20
Dataset ID: 32, Number of Instances: 10992, Number of Features: 16
Dataset ID: 37, Number of Instances: 768, Number of Features: 8
Dataset ID: 44, Nu



Dataset ID: 6332, Number of Instances: 540, Number of Features: 37
Dataset ID: 1461, Number of Instances: 45211, Number of Features: 16
Dataset ID: 4538, Number of Instances: 9873, Number of Features: 32
Dataset ID: 1478, Number of Instances: 10299, Number of Features: 561
Dataset ID: 23381, Number of Instances: 500, Number of Features: 12
Dataset ID: 40499, Number of Instances: 5500, Number of Features: 40
Dataset ID: 40668, Number of Instances: 67557, Number of Features: 42




Dataset ID: 40966, Number of Instances: 1080, Number of Features: 77
Dataset ID: 40982, Number of Instances: 1941, Number of Features: 27




Dataset ID: 40994, Number of Instances: 540, Number of Features: 18
Dataset ID: 40983, Number of Instances: 4839, Number of Features: 5
Dataset ID: 40975, Number of Instances: 1728, Number of Features: 6
Dataset ID: 40984, Number of Instances: 2310, Number of Features: 16
Dataset ID: 40979, Number of Instances: 2000, Number of Features: 240
Dataset ID: 40996, Number of Instances: 70000, Number of Features: 784
Dataset ID: 41027, Number of Instances: 44819, Number of Features: 6
Dataset ID: 23517, Number of Instances: 96320, Number of Features: 21
Dataset ID: 40923, Number of Instances: 92000, Number of Features: 1024
Dataset ID: 40927, Number of Instances: 60000, Number of Features: 3072
Dataset ID: 40978, Number of Instances: 3279, Number of Features: 1558
Dataset ID: 40670, Number of Instances: 3186, Number of Features: 180
Dataset ID: 40701, Number of Instances: 5000, Number of Features: 20


In [3]:

# Fixed seed for the decision tree. Without it the classifier is constructed
# with random_state=None, so the accuracies printed below can change from one
# run of the notebook to the next.
RANDOM_STATE = 42

# Numeric columns: replace missing values with the column mean, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Evaluate a decision tree with 5-fold cross-validation on every dataset
# whose ID is listed in the benchmark suite.
for dataset_id in benchmark_suite.data:
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    # Boolean mask over the feature columns (True = categorical). The explicit
    # dtype keeps `~` a logical NOT even if the indicator list is empty, where
    # np.array([]) would otherwise default to float64 and `~` would raise.
    categorical_indicator = np.array(categorical_indicator, dtype=bool)

    # Route each column to the matching preprocessing pipeline via the mask.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, ~categorical_indicator),
            ('cat', categorical_transformer, categorical_indicator)
        ])

    # Preprocessing is part of the cross-validated pipeline, so imputation,
    # scaling and encoding are fitted on the training folds only.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', DecisionTreeClassifier(random_state=RANDOM_STATE))])

    scores = cross_val_score(clf, X, y, cv=5)
    print(f"Dataset ID: {dataset_id}, Cross-Validation Accuracy: {scores.mean():.3f}")



Dataset ID: 3, Cross-Validation Accuracy: 0.980
Dataset ID: 6, Cross-Validation Accuracy: 0.875
Dataset ID: 11, Cross-Validation Accuracy: 0.589
Dataset ID: 12, Cross-Validation Accuracy: 0.879
Dataset ID: 14, Cross-Validation Accuracy: 0.733
Dataset ID: 15, Cross-Validation Accuracy: 0.934
Dataset ID: 16, Cross-Validation Accuracy: 0.804
Dataset ID: 18, Cross-Validation Accuracy: 0.653
Dataset ID: 22, Cross-Validation Accuracy: 0.659
Dataset ID: 23, Cross-Validation Accuracy: 0.482
Dataset ID: 28, Cross-Validation Accuracy: 0.888
Dataset ID: 29, Cross-Validation Accuracy: 0.794
Dataset ID: 31, Cross-Validation Accuracy: 0.699
Dataset ID: 32, Cross-Validation Accuracy: 0.958
Dataset ID: 37, Cross-Validation Accuracy: 0.705
Dataset ID: 44, Cross-Validation Accuracy: 0.885
Dataset ID: 46, Cross-Validation Accuracy: 0.918
Dataset ID: 50, Cross-Validation Accuracy: 0.792
Dataset ID: 54, Cross-Validation Accuracy: 0.710
Dataset ID: 151, Cross-Validation Accuracy: 0.672
Dataset ID: 182, Cros



Dataset ID: 6332, Cross-Validation Accuracy: 0.530
Dataset ID: 1461, Cross-Validation Accuracy: 0.562
Dataset ID: 4538, Cross-Validation Accuracy: 0.365
Dataset ID: 1478, Cross-Validation Accuracy: 0.872
Dataset ID: 23381, Cross-Validation Accuracy: 0.516
Dataset ID: 40499, Cross-Validation Accuracy: 0.925
Dataset ID: 40668, Cross-Validation Accuracy: 0.563




Dataset ID: 40966, Cross-Validation Accuracy: 0.472
Dataset ID: 40982, Cross-Validation Accuracy: 0.533




Dataset ID: 40994, Cross-Validation Accuracy: 0.906
Dataset ID: 40983, Cross-Validation Accuracy: 0.976
Dataset ID: 40975, Cross-Validation Accuracy: 0.734
Dataset ID: 40984, Cross-Validation Accuracy: 0.903
Dataset ID: 40979, Cross-Validation Accuracy: 0.874
Dataset ID: 40996, Cross-Validation Accuracy: 0.794
Dataset ID: 41027, Cross-Validation Accuracy: 0.634
Dataset ID: 23517, Cross-Validation Accuracy: 0.505
Dataset ID: 40923, Cross-Validation Accuracy: 0.433
Dataset ID: 40927, Cross-Validation Accuracy: 0.263
Dataset ID: 40978, Cross-Validation Accuracy: 0.948
Dataset ID: 40670, Cross-Validation Accuracy: 0.906
Dataset ID: 40701, Cross-Validation Accuracy: 0.916
