In [1]:
import openml
import os
import pandas as pd
import math

## For loading data
from pathlib import Path
from typing import Union

In [2]:
def load_dataset(path: Union[Path, str]) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, using its first column as the index.

    Parameters
    ----------
    path : Path or str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed table, indexed by the first CSV column.
    """
    frame = pd.read_csv(path, index_col=0)
    return frame


def load_rankings(path: Union[Path, str]) -> pd.DataFrame:
    """Read a rankings CSV whose columns are described by four header rows.

    The first CSV column becomes the row index and the first four rows are
    combined into a four-level column MultiIndex, whose levels are labelled
    ``dataset``, ``model``, ``tuning`` and ``scoring``.

    Parameters
    ----------
    path : Path or str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed table with named column levels.
    """
    out = pd.read_csv(path, index_col=0, header=[0, 1, 2, 3])
    # A MultiIndex carries one label per level in ``names``; assigning the
    # singular ``name`` attribute would leave the level names unset.
    out.columns.names = ["dataset", "model", "tuning", "scoring"]
    return out

In [3]:
# Directory that holds the raw input files (relative to the working directory)
dir_data = '../../data/raw/'

# Name of the raw dataset file inside dir_data
filename_dataset = 'dataset.csv'

# Join directory and file name into the path handed to the loader
filepath_dataset = os.path.join(dir_data, filename_dataset)

# Read the raw CSV via load_dataset (first CSV column becomes the index)
dataset = load_dataset(filepath_dataset)

In [4]:
# Distinct values of the 'dataset' column, in order of first appearance
unique_datasets = dataset["dataset"].unique()

# Collect data from datasets

In [5]:
%%time

def _summarise_features(features, names_of_targets):
    """Aggregate category statistics over the features of one dataset.

    Parameters
    ----------
    features : mapping
        Mapping whose values expose ``name``, ``data_type`` and, for nominal
        entries, ``nominal_values`` (here: ``OpenMLDataset.features``).
    names_of_targets : list of str
        Names of the target attributes. Entries whose name is listed here are
        treated as targets, all others as input features.

    Returns
    -------
    dict
        ``min_number_of_categories`` / ``max_number_of_categories``:
            smallest / largest number of categories among nominal input
            features (``math.inf`` / ``-math.inf`` if there are none).
        ``sum_of_categories``: total number of categories over all nominal
            input features.
        ``count_cat_features``: number of nominal input features, or ``None``
            if there are none.
        ``sum_of_target_categories``: total number of categories over all
            nominal targets.
        ``count_of_cat_targets`` / ``count_of_non_cat_targets``: number of
            nominal / non-nominal targets.
    """
    categories_per_cat_feature = []
    type_and_count = {}
    sum_of_target_categories = 0
    count_of_cat_targets = 0
    count_of_non_cat_targets = 0

    for feature in features.values():
        is_nominal = feature.data_type == 'nominal'

        # Operations on targets
        if feature.name in names_of_targets:
            if is_nominal:
                sum_of_target_categories += len(feature.nominal_values)
                count_of_cat_targets += 1
            else:
                count_of_non_cat_targets += 1
            continue

        # Operations on features
        if is_nominal:
            categories_per_cat_feature.append(len(feature.nominal_values))
        # Count features per data type
        type_and_count[feature.data_type] = type_and_count.get(feature.data_type, 0) + 1

    return {
        'min_number_of_categories': min(categories_per_cat_feature, default=math.inf),
        'max_number_of_categories': max(categories_per_cat_feature, default=-math.inf),
        'sum_of_categories': sum(categories_per_cat_feature),
        # Deliberately None (not 0) when no nominal feature exists
        'count_cat_features': type_and_count.get('nominal'),
        'sum_of_target_categories': sum_of_target_categories,
        'count_of_cat_targets': count_of_cat_targets,
        'count_of_non_cat_targets': count_of_non_cat_targets,
    }


# One entry per dataset, in the order of unique_datasets
list_count_target = []
list_count_features = []
list_min_number_of_categories = []
list_max_number_of_categories = []
list_sum_of_categories = []
list_count_cat_features = []
# NOTE(review): collected but not included in meta_df below -- confirm whether
# a 'count_non_cat_features' column is wanted.
list_count_non_cat_feature = []
list_ratio = []
list_avg_categories_per_cat_feature = []
list_sum_of_target_categories = []
list_count_of_cat_targets = []
list_count_of_non_cat_targets = []

for dataset_id in unique_datasets:
    print(dataset_id)
    # Get dataset:
    dataset_3 = openml.datasets.get_dataset(dataset_id=int(dataset_id))

    # Target variables (comma separated); a dataset without a default target
    # yields an empty list instead of failing on None.split
    target_attribute = dataset_3.default_target_attribute
    names_of_targets = [] if target_attribute is None else target_attribute.split(',')
    count_target = len(names_of_targets)

    # Count of all features
    count_features = len(dataset_3.features) - count_target

    # Category statistics of features and targets
    summary = _summarise_features(dataset_3.features, names_of_targets)
    count_cat_features = summary['count_cat_features']
    sum_of_categories = summary['sum_of_categories']

    if count_cat_features is None:
        # No categorical feature at all
        count_non_cat_feature = count_features
        ratio = 0
        avg_categories_per_cat_feature = 0
    else:
        # Count of non-categorical features
        count_non_cat_feature = count_features - count_cat_features
        # Ratio categorical / all features
        ratio = count_cat_features / count_features
        # Avg number of categories of categorical features
        avg_categories_per_cat_feature = sum_of_categories / count_cat_features

    # Update lists
    list_count_target.append(count_target)
    list_count_features.append(count_features)
    list_min_number_of_categories.append(summary['min_number_of_categories'])
    list_max_number_of_categories.append(summary['max_number_of_categories'])
    list_sum_of_categories.append(sum_of_categories)
    list_count_cat_features.append(count_cat_features)
    list_count_non_cat_feature.append(count_non_cat_feature)
    list_ratio.append(ratio)
    list_avg_categories_per_cat_feature.append(avg_categories_per_cat_feature)
    list_sum_of_target_categories.append(summary['sum_of_target_categories'])
    list_count_of_cat_targets.append(summary['count_of_cat_targets'])
    list_count_of_non_cat_targets.append(summary['count_of_non_cat_targets'])

# Create pandas df
data = {
    'dataset': unique_datasets,
    'count_target': list_count_target,
    'sum_of_target_categories': list_sum_of_target_categories,
    'count_of_cat_targets': list_count_of_cat_targets,
    'count_of_non_cat_targets': list_count_of_non_cat_targets,
    'count_features': list_count_features,
    'min_number_of_categories': list_min_number_of_categories,
    'max_number_of_categories': list_max_number_of_categories,
    'sum_of_categories': list_sum_of_categories,
    'count_cat_features': list_count_cat_features,
    'ratio_categorical_all': list_ratio,
    'avg_categories_per_cat_feature': list_avg_categories_per_cat_feature
}
meta_df = pd.DataFrame(data)

3
29
31
38
50
51
56
333
334
451
470
881
956
959
981
1037
1111
1112
1114
1169
1235
1461
1463
1486
1506
1511
1590
6332
23381
40536
40945
40981
40999
41005
41007
41162
41224


Could not download file from http://openml1.win.tue.nl/dataset41224/dataset_41224.pq: Bucket does not exist or is private.


42178
42343
42344
42738
42750
43098
43607
43890
43892
43896
43897
43900
43922
CPU times: user 103 ms, sys: 21.6 ms, total: 124 ms
Wall time: 276 ms


In [6]:
# Preview the collected meta-features (up to the first 50 rows)
meta_df.head(50)

Unnamed: 0,dataset,count_target,sum_of_target_categories,count_of_cat_targets,count_of_non_cat_targets,count_features,min_number_of_categories,max_number_of_categories,sum_of_categories,count_cat_features,ratio_categorical_all,avg_categories_per_cat_feature
0,3,1,2,1,0,36,2.0,3.0,74,36.0,1.0,2.055556
1,29,1,2,1,0,15,2.0,14.0,41,9.0,0.6,4.555556
2,31,1,2,1,0,20,2.0,11.0,56,13.0,0.65,4.307692
3,38,1,2,1,0,29,1.0,5.0,46,22.0,0.758621,2.090909
4,50,1,2,1,0,9,3.0,3.0,27,9.0,1.0,3.0
5,51,1,5,1,0,13,2.0,4.0,19,7.0,0.538462,2.714286
6,56,1,2,1,0,16,2.0,2.0,32,16.0,1.0,2.0
7,333,1,2,1,0,6,2.0,4.0,17,6.0,1.0,2.833333
8,334,1,2,1,0,6,2.0,4.0,17,6.0,1.0,2.833333
9,451,1,2,1,0,5,2.0,10.0,15,3.0,0.6,5.0


# Analyze the new data 

Look for:
- constant columns $\rightarrow$ then drop them
- inf and -inf $\rightarrow$ replace them with 0
- Null values $\rightarrow$ depending on column
    - in 'count_cat_features' replace with 0 
    - so far I did not see any more NaN values

In [7]:
# Drop constant columns: keep a column only if at least one row differs from
# the value in the first row
first_row = meta_df.iloc[0]
column_varies = (meta_df != first_row).any()
clean_meta_df = meta_df.loc[:, column_varies]

In [8]:
# Replace inf and -inf with 0 (these are the sentinels left in the min/max
# category columns when a dataset has no categorical feature)
clean_meta_df = clean_meta_df.replace(to_replace={math.inf: 0, -math.inf: 0})

In [9]:
# Check for null values: one boolean per column, True if any entry is missing
clean_meta_df.isna().any()

dataset                           False
sum_of_target_categories          False
count_of_cat_targets              False
count_of_non_cat_targets          False
count_features                    False
min_number_of_categories          False
max_number_of_categories          False
sum_of_categories                 False
count_cat_features                 True
ratio_categorical_all             False
avg_categories_per_cat_feature    False
dtype: bool

In [10]:
# Fill missing values with 0. 'count_cat_features' is missing for datasets
# without any nominal feature, because the collection loop stores None there.
clean_meta_df = clean_meta_df.fillna(0)

In [11]:
# Preview the cleaned meta-features (up to the first 50 rows)
clean_meta_df.head(50)

Unnamed: 0,dataset,sum_of_target_categories,count_of_cat_targets,count_of_non_cat_targets,count_features,min_number_of_categories,max_number_of_categories,sum_of_categories,count_cat_features,ratio_categorical_all,avg_categories_per_cat_feature
0,3,2,1,0,36,2.0,3.0,74,36.0,1.0,2.055556
1,29,2,1,0,15,2.0,14.0,41,9.0,0.6,4.555556
2,31,2,1,0,20,2.0,11.0,56,13.0,0.65,4.307692
3,38,2,1,0,29,1.0,5.0,46,22.0,0.758621,2.090909
4,50,2,1,0,9,3.0,3.0,27,9.0,1.0,3.0
5,51,5,1,0,13,2.0,4.0,19,7.0,0.538462,2.714286
6,56,2,1,0,16,2.0,2.0,32,16.0,1.0,2.0
7,333,2,1,0,6,2.0,4.0,17,6.0,1.0,2.833333
8,334,2,1,0,6,2.0,4.0,17,6.0,1.0,2.833333
9,451,2,1,0,5,2.0,10.0,15,3.0,0.6,5.0


In [12]:
# Summary statistics of every column of the cleaned meta-features
clean_meta_df.describe()

Unnamed: 0,dataset,sum_of_target_categories,count_of_cat_targets,count_of_non_cat_targets,count_features,min_number_of_categories,max_number_of_categories,sum_of_categories,count_cat_features,ratio_categorical_all,avg_categories_per_cat_feature
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,18828.9,1.92,0.9,0.1,46.74,13.54,1947.62,5682.36,17.9,0.547164,259.324921
std,20535.49872,0.804071,0.303046,0.303046,82.95556,72.671531,5349.504783,17532.846483,23.628286,0.325347,779.446982
min,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,956.75,2.0,1.0,0.0,9.0,2.0,3.0,21.0,5.0,0.255601,2.64011
50%,1508.5,2.0,1.0,0.0,16.0,2.0,10.5,55.0,8.5,0.563859,4.403846
75%,41939.5,2.0,1.0,0.0,40.5,2.0,105.5,301.25,25.0,0.828125,12.28125
max,43922.0,5.0,1.0,1.0,477.0,513.0,21838.0,71504.0,136.0,1.0,4376.6


## Findings and manual checks

Manual checks: 
- Check datasets 1111, 1112, 1114: they all have the same values
  $\Rightarrow$ matches with the information given online (by visiting the website)
- Check datasets 40999, 41005, 41007: they all have the same values
  $\Rightarrow$ matches with the information given online (by visiting the website)

Findings: 
- Mostly we are dealing with classification problems. Only 5 datasets are designed for regression problems. 
- Only a few datasets have more than 100 features

## Save dataframe

In [13]:
# Persist the cleaned meta-features as CSV (path relative to the working directory)
clean_meta_df.to_csv('../../data/preprocessed/dataset_info.csv')