In [1]:
from pathlib import Path

import category_encoders as ce
import numpy as np
import openml
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

In [2]:
# Pull the OpenML dataset catalogue as a DataFrame and preview the first rows
datasets = openml.datasets.list_datasets(
    output_format="dataframe",
)
datasets.head(5)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0


In [3]:
# Narrow the OpenML catalogue one filter at a time, reporting how many
# datasets survive each step.
MIN_INSTANCES = 300  # a dataset must have strictly more rows than this to be kept

datasets = datasets[datasets.status == 'active']
print('datasets with status active:', datasets.shape[0])
datasets = datasets[datasets.NumberOfInstancesWithMissingValues == 0]
print('datasets without missing values:', datasets.shape[0])
datasets = datasets[datasets.NumberOfInstances > MIN_INSTANCES]
# The message is built from the threshold so the two cannot drift apart
# (it used to say "100" while the filter compared against 300).
print(f'datasets with more than {MIN_INSTANCES} instances:', datasets.shape[0])
datasets = datasets[datasets.NumberOfSymbolicFeatures == 0]
print('datasets without symbolic features:', datasets.shape[0])

datasets with status active: 5401
datasets without missing values: 4531
datasets with more than 100 instances: 3121
datasets without symbolic features: 930


# Export every two-class dataset that survived the catalogue filters as a CSV
# holding the features plus an integer-encoded 'target' column.

# to_csv does not create missing directories, so make sure the output folder
# exists before any (slow) download starts.
Path('data/train-datasets').mkdir(parents=True, exist_ok=True)

binary_datasets = datasets[datasets.NumberOfClasses == 2.0]
# iterrows() has no length; passing total lets tqdm show progress and an ETA
for _, task in tqdm(binary_datasets.iterrows(), total=len(binary_datasets)):
    dataset = openml.datasets.get_dataset(task.did, download_data=True, download_qualities=True, download_features_meta_data=True)
    # No target is passed to get_data, so the returned frame still contains the class column
    x, *_ = dataset.get_data(dataset_format="dataframe")

    # Split the class column off from the features (without mutating in place)
    class_name = dataset.default_target_attribute
    y = x[class_name]
    x = x.drop(columns=[class_name])

    # One hot encode all categorical features
    categorical = x.select_dtypes(include=['category']).columns
    if len(categorical) > 0:
        ohe = OneHotEncoder()
        ohe_features = ohe.fit_transform(x[categorical])
        # Reuse x's index so the axis=1 concat lines rows up one-to-one rather
        # than relying on x happening to carry a default RangeIndex.
        ohe_df = pd.DataFrame(
            ohe_features.toarray(),
            index=x.index,
            columns=[f'cat_{i}' for i in range(ohe_features.shape[1])],
        )
        x = pd.concat([x.drop(columns=categorical), ohe_df], axis=1)

    # Label encode 'class' column
    x['target'] = LabelEncoder().fit_transform(y)

    x.to_csv('data/train-datasets/' + task['name'] + '.csv', index=False)

In [5]:
# Resolve the OpenML-CC18 benchmark suite and list its member tasks
benchmark = openml.study.get_suite('OpenML-CC18')
tasks = openml.tasks.list_tasks(
    task_id=benchmark.tasks,
    output_format="dataframe",
)
tasks

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,source_data,target_feature,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,TaskType.SUPERVISED_CLASSIFICATION,3,kr-vs-kp,Supervised Classification,active,10-fold Crossvalidation,3,class,1669,3.0,1527,2,37,3196,0,0,0,37
6,6,TaskType.SUPERVISED_CLASSIFICATION,6,letter,Supervised Classification,active,10-fold Crossvalidation,6,class,813,26.0,734,26,17,20000,0,0,16,1
11,11,TaskType.SUPERVISED_CLASSIFICATION,11,balance-scale,Supervised Classification,active,10-fold Crossvalidation,11,class,288,3.0,49,3,5,625,0,0,4,1
12,12,TaskType.SUPERVISED_CLASSIFICATION,12,mfeat-factors,Supervised Classification,active,10-fold Crossvalidation,12,class,200,10.0,200,10,217,2000,0,0,216,1
14,14,TaskType.SUPERVISED_CLASSIFICATION,14,mfeat-fourier,Supervised Classification,active,10-fold Crossvalidation,14,class,200,10.0,200,10,77,2000,0,0,76,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167121,167121,TaskType.SUPERVISED_CLASSIFICATION,40923,Devnagari-Script,Supervised Classification,active,10-fold Crossvalidation,40923,character,2000,46.0,2000,46,1025,92000,0,0,1024,1
167124,167124,TaskType.SUPERVISED_CLASSIFICATION,40927,CIFAR_10,Supervised Classification,active,10-fold Crossvalidation,40927,class,6000,10.0,6000,10,3073,60000,0,0,3072,1
167125,167125,TaskType.SUPERVISED_CLASSIFICATION,40978,Internet-Advertisements,Supervised Classification,active,10-fold Crossvalidation,40978,class,2820,2.0,459,2,1559,3279,0,0,3,1556
167140,167140,TaskType.SUPERVISED_CLASSIFICATION,40670,dna,Supervised Classification,active,10-fold Crossvalidation,40670,class,1654,3.0,765,3,181,3186,0,0,0,181


In [9]:
# Export every task of the OpenML-CC18 suite as a CSV: standardized numeric
# features, one-hot encoded categorical features and an integer 'target'.
benchmark = openml.study.get_suite('OpenML-CC18')
tasks = openml.tasks.list_tasks(output_format="dataframe", task_id=benchmark.tasks)

# to_csv does not create missing directories, so make sure the output folder
# exists before any (slow) download starts.
Path('data/OpenML-CC18').mkdir(parents=True, exist_ok=True)

# iterrows() has no length; passing total lets tqdm show progress and an ETA
for _, task in tqdm(tasks.iterrows(), total=len(tasks)):
    dataset = openml.datasets.get_dataset(task.did, download_data=True, download_qualities=True, download_features_meta_data=True)
    # No target is passed to get_data, so the returned frame still contains the class column
    x, *_ = dataset.get_data(dataset_format="dataframe")

    # Split the class column off from the features (without mutating in place)
    class_name = dataset.default_target_attribute
    y = x[class_name]
    x = x.drop(columns=[class_name])

    # One hot encode all categorical features. The encoded columns are held
    # aside and re-attached after normalization, so the indicator columns are
    # not standardized together with the remaining features.
    categorical = x.select_dtypes(include=['category']).columns
    if len(categorical) > 0:
        one_hot = ce.OneHotEncoder()
        encoded_features = one_hot.fit_transform(x[categorical])
        encoded_features.columns = [f'cat_{i}' for i in range(encoded_features.shape[1])]
        x = x.drop(columns=categorical)

    # Drop columns that contain missing values
    x = x.dropna(axis=1)
    # Normalizing and removing zero variance columns: a constant column has
    # std 0, so its z-score is NaN and the second dropna removes it.
    x = (x - x.mean()) / x.std()
    x = x.dropna(axis=1)

    if len(categorical) > 0:
        x = pd.concat([x, encoded_features], axis=1)

    # Label encode 'class' column
    x['target'] = LabelEncoder().fit_transform(y)

    x.to_csv('data/OpenML-CC18/' + task['name'] + '.csv', index=False)

72it [15:04, 12.57s/it] 


In [None]:
# Export every task of OpenML suite 353 as a CSV under data/OpenML-CTR23:
# categorical features are leave-one-out encoded against the target, then
# every remaining column is standardized.
benchmark = openml.study.get_suite(353)
tasks = openml.tasks.list_tasks(output_format="dataframe", task_id=benchmark.tasks)

# to_csv does not create missing directories, so make sure the output folder
# exists before any (slow) download starts.
Path('data/OpenML-CTR23').mkdir(parents=True, exist_ok=True)

# iterrows() has no length; passing total lets tqdm show progress and an ETA
for _, task in tqdm(tasks.iterrows(), total=len(tasks)):
    dataset = openml.datasets.get_dataset(task.did, download_data=True, download_qualities=True, download_features_meta_data=True)
    # No target is passed to get_data, so the returned frame still contains the target column
    x, *_ = dataset.get_data(dataset_format="dataframe")

    # Keep the target inside the frame, under a uniform column name
    class_name = dataset.default_target_attribute
    x = x.rename(columns={class_name: 'target'})

    # Leave-one-out encode all categorical features
    categorical = x.select_dtypes(include=['category']).columns.tolist()
    if task['name'] == 'forest_fires':
        # These two columns are always encoded for this dataset; only add the
        # ones not already selected so no column is listed twice.
        categorical += [col for col in ('month', 'day') if col not in categorical]

    if len(categorical) > 0:
        loo = ce.LeaveOneOutEncoder()
        encoded_features = loo.fit_transform(x[categorical], x['target'])
        encoded_features.columns = [f'cat_{i}' for i in range(encoded_features.shape[1])]

        # Swap the raw categorical columns for their encoded counterparts
        x = pd.concat([x.drop(columns=categorical), encoded_features], axis=1)

    # Drop columns that contain missing values
    x = x.dropna(axis=1)
    # Normalizing and removing zero variance columns: a constant column has
    # std 0, so its z-score is NaN and the second dropna removes it.
    # Note that 'target' is still part of x here, so it is standardized too.
    x = (x - x.mean()) / x.std()
    x = x.dropna(axis=1)

    x.to_csv('data/OpenML-CTR23/' + task['name'] + '.csv', index=False)