In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import openml
import os
import pandas as pd
import math

## For loading data
from pathlib import Path
from typing import Union

In [3]:
# Functions to read data
def load_dataset(path: Union[Path, str]) -> pd.DataFrame:
    return pd.read_csv(path, index_col=0)


def load_rankings(path: Union[Path, str]) -> pd.DataFrame:
    out = pd.read_csv(path, index_col=0, header=[0, 1, 2, 3])
    out.columns.name = ("dataset", "model", "tuning", "scoring")
    return out

In [4]:
dir_data = '../../data/raw/'

# File names
filename_dataset = 'dataset.csv'

# Create paths for given files
filepath_dataset = os.path.join(dir_data, filename_dataset)

# Load data
dataset = load_dataset(filepath_dataset)

In [5]:
unique_encoders = dataset.encoder.unique()
unique_models = dataset.model.unique()
unique_scoring = dataset.scoring.unique()
unique_datasets = dataset.dataset.unique()
# unique_tuning = dataset.tuning.unique()

# Idea explained

The basic idea is to generate a bunch of features. 
The best features will later be selected by a feature selection algorithm like RFECV, MRMR or something else.
The starting point is the ```dataset``` feature, which indicates the id of the dataset in [openml.org](https://www.openml.org/). 
Therefore, the [openml API](https://openml.github.io/openml-python/main/api.html#) is used. 

The first concept for creating the features is: 

![image](../../data/dataset_FE.svg)

### ToDos
[ ] Research and test openMLStudy

[ ] Research and test openMLTask

[ ] Research and test openMLRun

[ ] Research and test openml.datasets.list_qualities

# dataset_agg

### Get lists of possible attributes and an intersection

In [6]:
# Get intersection of keys which are in all datasets
list_of_keys = [set(openml.datasets.get_dataset(dataset_id=int(dataset_id)).qualities.keys()) for dataset_id in unique_datasets]
intersection = set.intersection(*list_of_keys)
intersection

Could not download file from http://openml1.win.tue.nl/dataset41224/dataset_41224.pq: Bucket does not exist or is private.


{'AutoCorrelation',
 'Dimensionality',
 'MajorityClassPercentage',
 'MajorityClassSize',
 'MinorityClassPercentage',
 'MinorityClassSize',
 'NumberOfBinaryFeatures',
 'NumberOfClasses',
 'NumberOfFeatures',
 'NumberOfInstances',
 'NumberOfInstancesWithMissingValues',
 'NumberOfMissingValues',
 'NumberOfNumericFeatures',
 'NumberOfSymbolicFeatures',
 'PercentageOfBinaryFeatures',
 'PercentageOfInstancesWithMissingValues',
 'PercentageOfMissingValues',
 'PercentageOfNumericFeatures',
 'PercentageOfSymbolicFeatures'}

In [7]:
# List of all attributes given by opeml
attribute_set = set()
for dataset_id in unique_datasets:
    attribute_list = list(openml.datasets.get_dataset(dataset_id=int(dataset_id)).qualities.keys())
    attribute_set.update(attribute_list)
attribute_set

Could not download file from http://openml1.win.tue.nl/dataset41224/dataset_41224.pq: Bucket does not exist or is private.


{'AutoCorrelation',
 'CfsSubsetEval_DecisionStumpAUC',
 'CfsSubsetEval_DecisionStumpErrRate',
 'CfsSubsetEval_DecisionStumpKappa',
 'CfsSubsetEval_NaiveBayesAUC',
 'CfsSubsetEval_NaiveBayesErrRate',
 'CfsSubsetEval_NaiveBayesKappa',
 'CfsSubsetEval_kNN1NAUC',
 'CfsSubsetEval_kNN1NErrRate',
 'CfsSubsetEval_kNN1NKappa',
 'ClassEntropy',
 'DecisionStumpAUC',
 'DecisionStumpErrRate',
 'DecisionStumpKappa',
 'Dimensionality',
 'EquivalentNumberOfAtts',
 'J48.00001.AUC',
 'J48.00001.ErrRate',
 'J48.00001.Kappa',
 'J48.0001.AUC',
 'J48.0001.ErrRate',
 'J48.0001.Kappa',
 'J48.001.AUC',
 'J48.001.ErrRate',
 'J48.001.Kappa',
 'MajorityClassPercentage',
 'MajorityClassSize',
 'MaxAttributeEntropy',
 'MaxKurtosisOfNumericAtts',
 'MaxMeansOfNumericAtts',
 'MaxMutualInformation',
 'MaxNominalAttDistinctValues',
 'MaxSkewnessOfNumericAtts',
 'MaxStdDevOfNumericAtts',
 'MeanAttributeEntropy',
 'MeanKurtosisOfNumericAtts',
 'MeanMeansOfNumericAtts',
 'MeanMutualInformation',
 'MeanNoiseToSignalRatio',


At first I will keep __all__ features and not just the ones, which are in every dataset present. 
Threrefore, I will create additional features in the dataset_agg table. 

Create a mapping of the attributes I want to create to the ones given by openml.

| My feature idea | Related feature from openml | Description |
| :- | :- | :- |
| row_count | NumberOfInstances | The number of instances = The number of rows in the dataset |
| column_count | NumberOfFeatures | The total number of features + targets |
| null_value_count | NumberOfMissingValues | Number of occuring null values |
| rows_with_null_values_count | NumberOfInstancesWithMissingValues | Number of rows with null values |
| columns_with_null_values_count |  | Number of features containing null values |
| ratio_of_null_values_to_all |  | $ = \dfrac{\text{null_value_count}}{\text{row_count} \times \text{total_feature_count}}$ |
| categorical_features_count |  | Self explaining. Give a __suggestion__ by calculation. But it has to be checked manually, since there can also be numerical features, which are just other category names. (e.g. the *geo_level_1_id* in the earthquake dataset)  |
| non_categorical_features_count |  | Self explaining. But has also to be checked manually. |
| ratio_of_categorical_features_to_all |  | $ = \dfrac{\text{categorical_features_count}}{\text{total_feature_count}} $ |
| sum_of_all_categories |  | Sum of the number of categories over all categorical values. Has to be checked manually. |
| categorical_target_variables_count |  | The number of classification tasks |
| non_categorical_target_variables_count |  | The number of regression tasks |
| categorical_target_values_sum | NumberOfClasses | The sum of classes to predict over all target variables |
| total_feature_count |  | The number of features to predict the target(s) |
| min_number_of_categories_per_cat_feature |  | Min number of categories in a categorical feature |
| max_number_of_categories_per_cat_feature |  | Max number of categories in a categorical feature |
| avg_number_of_categories_per_cat_feature |  | Avg number of categories per categorical feature |

### Create dataset and save it

In [8]:
# Init empty lists for feature values
list_dataset_id = []
list_row_count = []
list_column_count = []
list_null_value_count = []
list_rows_with_null_values_count = []
list_columns_with_null_values_count = []
list_ratio_of_null_values_to_all = []
list_categorical_features_count = []
list_non_categorical_features_count = []
list_ratio_of_categorical_features_to_all = []
list_sum_of_all_categories = []
list_categorical_target_variables_count = []
list_non_categorical_target_variables_count = []
list_categorical_target_values_sum = []
list_total_feature_count = []
list_min_number_of_categories_per_cat_feature = []
list_max_number_of_categories_per_cat_feature = []
list_avg_number_of_categories_per_cat_feature = []

In [9]:
# Remove the features already used above
attributs_to_remove_from_feature_set = set(["NumberOfInstances", "NumberOfMissingValues", "NumberOfInstancesWithMissingValues", "NumberOfClasses", "NumberOfFeatures"])

# Create lists for all attributes in the set
add_feature_list = attribute_set - attributs_to_remove_from_feature_set

# Create dict with lists for the features to add
feature_list_dict = {}
for feature_name in add_feature_list:
    feature_list_dict[feature_name] = []

In [10]:
def row_count(dataset):
    """
    Returns the count of rows in the provided dataset.

            Parameters:
                    dataset (openml.datasets.OpenMLDataset): A dataset object from openml.org

            Returns:
                    row_count (int): The number of rows in the provided dataset object 
    """
    return dataset.qualities.get('NumberOfInstances')

In [11]:
def column_count(dataset):
    return dataset.qualities.get('NumberOfFeatures')

In [12]:
def null_value_count(dataset):
    return dataset.qualities.get('NumberOfMissingValues')

In [13]:
def rows_with_null_values_count(dataset):
    return dataset.qualities.get('NumberOfInstancesWithMissingValues')

In [14]:
def columns_with_null_values_count(X):
    return sum(X.isna().any())

In [15]:
def ratio_of_null_values_to_all(dataset, X):
    return (null_value_count(dataset)) / (total_feature_count(X) * row_count(dataset))

In [16]:
def categorical_features_count(dataset):
    categorical_features_count = 0
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name not in dataset.default_target_attribute.split(','):
            if dataset.features[k].data_type in ['nominal', 'string']:
                categorical_features_count += 1
    
    #return sum(categorical_indicator)
    return categorical_features_count

In [17]:
def non_categorical_features_count(X, dataset):
    return total_feature_count(X) - categorical_features_count(dataset)

In [18]:
def ratio_of_categorical_features_to_all(X, dataset):
    return categorical_features_count(dataset) / total_feature_count(X)

In [19]:
def sum_of_all_categories(dataset, attribute_names):
    # ToDo: Maybe use the categorical indicator map
    sum_of_categories = 0
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name not in dataset.default_target_attribute.split(','):
            # Update min and max number of categories per features
            if dataset.features[k].data_type == 'nominal':
                sum_of_categories += len(dataset.features[k].nominal_values)
            if dataset.features[k].data_type == 'string':
                if dataset.features[k].name in attribute_names:
                    tmp = X[dataset.features[k].name].unique()
                    sum_of_categories += len(tmp)
    
    return sum_of_categories

In [20]:
def categorical_target_variables_count(dataset):
    count_of_cat_targets = 0
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name in dataset.default_target_attribute.split(','):
            if dataset.features[k].data_type in ['nominal', 'string']:
                count_of_cat_targets += 1
    
    return count_of_cat_targets

In [21]:
def non_categorical_target_variables_count(dataset):
    count_of_non_cat_targets = 0
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name in dataset.default_target_attribute.split(','):
            if dataset.features[k].data_type not in ['nominal', 'string']:
                count_of_non_cat_targets += 1
    
    return count_of_non_cat_targets

In [22]:
def categorical_target_values_sum(dataset):
    return dataset.qualities.get('NumberOfClasses')

In [23]:
def total_feature_count(X):
    return X.shape[1]

In [24]:
def min_number_of_categories_per_cat_feature(dataset, X, attribute_names):
    min_number_of_categories = math.inf
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name not in dataset.default_target_attribute.split(','):
            # Update min and max number of categories per features
            if dataset.features[k].data_type == 'nominal':
                if len(dataset.features[k].nominal_values) < min_number_of_categories:
                    min_number_of_categories = len(dataset.features[k].nominal_values)
            if dataset.features[k].data_type == 'string':
                if dataset.features[k].name in attribute_names:
                    tmp = X[dataset.features[k].name].unique()
                    if len(tmp) < min_number_of_categories:
                        min_number_of_categories = len(tmp)
    
    return min_number_of_categories

In [25]:
def max_number_of_categories_per_cat_feature(dataset, X, attribute_names):
    max_number_of_categories = -math.inf
    
    for k in dataset.features:
        # Operations on features
        if dataset.features[k].name not in dataset.default_target_attribute.split(','):
            # Update min and max number of categories per features
            if dataset.features[k].data_type == 'nominal':
                if len(dataset.features[k].nominal_values) > max_number_of_categories:
                    max_number_of_categories = len(dataset.features[k].nominal_values)
            if dataset.features[k].data_type == 'string':
                if dataset.features[k].name in attribute_names:
                    tmp = X[dataset.features[k].name].unique()
                    if len(tmp) > max_number_of_categories:
                        max_number_of_categories = len(tmp)
    
    return max_number_of_categories

In [26]:
def avg_number_of_categories_per_cat_feature(dataset, categorical_indicator, attribute_names):
    return sum_of_all_categories(dataset, attribute_names) / categorical_features_count(dataset)

In [27]:
def get_predefined_feature(dataset, feature_name):
    return dataset.qualities.get(feature_name)

In [28]:
# Traverse all unique datasets, call the functions and collect the information
for dataset_id in unique_datasets:
    print(dataset_id)
    
    # Get openml dataset object with the current id
    dataset = openml.datasets.get_dataset(dataset_id=int(dataset_id))
    
    # Get dataset
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format="dataframe"
    )
    
    # Apply functions
    list_dataset_id.append(dataset_id)
    list_row_count.append(row_count(dataset))
    list_column_count.append(column_count(dataset))
    list_null_value_count.append(null_value_count(dataset))
    list_rows_with_null_values_count.append(rows_with_null_values_count(dataset))
    list_columns_with_null_values_count.append(columns_with_null_values_count(X))
    list_ratio_of_null_values_to_all.append(ratio_of_null_values_to_all(dataset, X))
    list_categorical_features_count.append(categorical_features_count(dataset))
    list_non_categorical_features_count.append(non_categorical_features_count(X, dataset))
    list_ratio_of_categorical_features_to_all.append(ratio_of_categorical_features_to_all(X, dataset))
    list_sum_of_all_categories.append(sum_of_all_categories(dataset, attribute_names))
    list_categorical_target_variables_count.append(categorical_target_variables_count(dataset))
    list_non_categorical_target_variables_count.append(non_categorical_target_variables_count(dataset))
    list_categorical_target_values_sum.append(categorical_target_values_sum(dataset))
    list_total_feature_count.append(total_feature_count(X))
    list_min_number_of_categories_per_cat_feature.append(min_number_of_categories_per_cat_feature(dataset, X, attribute_names))
    list_max_number_of_categories_per_cat_feature.append(max_number_of_categories_per_cat_feature(dataset, X, attribute_names))
    list_avg_number_of_categories_per_cat_feature.append(avg_number_of_categories_per_cat_feature(dataset, categorical_indicator, attribute_names))
    
    # Iterate over the attributes in qualities
    for feature_name in add_feature_list:
        updated_list = feature_list_dict[feature_name]
        updated_list.append(get_predefined_feature(dataset, feature_name))
        feature_list_dict[feature_name] = updated_list

3
29
31
38
50
51
56
333
334
451
470
881
956
959
981
1037
1111
1112
1114
1169
1235
1461
1463
1486
1506
1511
1590
6332
23381
40536
40945
40981
40999
41005
41007
41162
41224


Could not download file from http://openml1.win.tue.nl/dataset41224/dataset_41224.pq: Bucket does not exist or is private.


42178
42343
42344
42738
42750
43098
43607
43890
43892
43896
43897
43900
43922


In [29]:
# Create a pandas dataframe and save it
feature_list_dict['dataset_id'] = list_dataset_id
feature_list_dict['row_count'] = list_row_count
feature_list_dict['column_count'] = list_column_count
feature_list_dict['null_value_count'] = list_null_value_count
feature_list_dict['rows_with_null_values_count'] = list_rows_with_null_values_count
feature_list_dict['columns_with_null_values_count'] = list_columns_with_null_values_count
feature_list_dict['ratio_of_null_values_to_all'] = list_ratio_of_null_values_to_all
feature_list_dict['categorical_features_count'] = list_categorical_features_count
feature_list_dict['non_categorical_features_count'] = list_non_categorical_features_count
feature_list_dict['ratio_of_categorical_features_to_all'] = list_ratio_of_categorical_features_to_all
feature_list_dict['sum_of_all_categories'] = list_sum_of_all_categories
feature_list_dict['categorical_target_variables_count'] = list_categorical_target_variables_count
feature_list_dict['non_categorical_target_variables_count'] = list_non_categorical_target_variables_count
feature_list_dict['categorical_target_values_sum'] = list_categorical_target_values_sum
feature_list_dict['total_feature_count'] = list_total_feature_count
feature_list_dict['min_number_of_categories_per_cat_feature'] = list_min_number_of_categories_per_cat_feature
feature_list_dict['max_number_of_categories_per_cat_feature'] = list_max_number_of_categories_per_cat_feature
feature_list_dict['avg_number_of_categories_per_cat_feature'] = list_avg_number_of_categories_per_cat_feature

dataset_agg = pd.DataFrame(feature_list_dict)

In [30]:
dataset_agg.head(50)

Unnamed: 0,CfsSubsetEval_kNN1NKappa,Quartile2KurtosisOfNumericAtts,REPTreeDepth2Kappa,CfsSubsetEval_DecisionStumpErrRate,MeanNominalAttDistinctValues,Quartile3KurtosisOfNumericAtts,MeanStdDevOfNumericAtts,REPTreeDepth1Kappa,RandomTreeDepth1AUC,MaxSkewnessOfNumericAtts,...,non_categorical_features_count,ratio_of_categorical_features_to_all,sum_of_all_categories,categorical_target_variables_count,non_categorical_target_variables_count,categorical_target_values_sum,total_feature_count,min_number_of_categories_per_cat_feature,max_number_of_categories_per_cat_feature,avg_number_of_categories_per_cat_feature
0,0.881192,,0.972417,0.059136,2.027027,,,0.972417,0.945136,,...,0,1.0,74,1,0,2.0,36,2,3,2.055556
1,0.710175,15.348781,0.710515,0.143478,4.2,91.789567,901.509141,0.710515,0.767221,13.140655,...,6,0.6,41,1,0,2.0,15,2,14,4.555556
2,0.305697,0.919781,0.222543,0.273,4.0,1.649274,407.047619,0.222543,0.658024,1.949628,...,7,0.65,56,1,0,2.0,20,2,11,4.307692
3,0.775972,8.871304,0.871287,0.025186,2.086957,90.942488,19.053878,0.871287,0.785261,13.882653,...,7,0.758621,46,1,0,2.0,29,1,5,2.090909
4,0.419222,,0.529614,0.23904,2.9,,,0.529614,0.770363,,...,0,1.0,27,1,0,2.0,9,3,3,3.0
5,0.542654,0.810318,0.494327,0.210884,2.625,3.379606,19.599081,0.494327,0.755854,1.548824,...,6,0.538462,19,1,0,2.0,13,2,4,2.714286
6,0.903872,,0.903237,0.045977,2.0,,,0.903237,0.962917,,...,0,1.0,32,1,0,2.0,16,2,2,2.0
7,0.492806,,0.758993,0.253597,2.714286,,,0.758993,0.934598,,...,0,1.0,17,1,0,2.0,6,2,4,2.833333
8,0.0,,-0.011301,0.342762,2.714286,,,-0.011301,0.623651,,...,0,1.0,17,1,0,2.0,6,2,4,2.833333
9,0.987864,-0.449467,1.0,0.128,4.25,-0.398137,15.395027,1.0,0.946626,0.446142,...,2,0.6,15,1,0,2.0,5,2,10,5.0


In [31]:
dataset_agg.to_csv('../../data/preprocessed/dataset_agg.csv')

In [32]:
dataset_agg.isna().any()

CfsSubsetEval_kNN1NKappa                     True
Quartile2KurtosisOfNumericAtts               True
REPTreeDepth2Kappa                           True
CfsSubsetEval_DecisionStumpErrRate           True
MeanNominalAttDistinctValues                 True
                                            ...  
categorical_target_values_sum               False
total_feature_count                         False
min_number_of_categories_per_cat_feature    False
max_number_of_categories_per_cat_feature    False
avg_number_of_categories_per_cat_feature    False
Length: 120, dtype: bool

# features

In [33]:
list_feature_id = []
list_dataset_id = []
list_null_values_count = []
list_non_null_values_count = []
list_ratio_of_nan_values_to_all = []
list_min_value = []
list_max_value = []
list_avg_value = []
list_median_value = []
list_25_quantile = []
list_75_quantile = []
list_unique_values_count = []
list_data_type = []
list_is_target = []

In [69]:
def feature_id(feature, dataset_id):
    return str(dataset_id) + str(feature.index)

In [35]:
def null_values_count(feature):
    return feature.number_missing_values

In [36]:
def non_null_values_count(feature, X):
    return X.shape[0] - null_values_count(feature)

In [37]:
def ratio_of_nan_values_to_all(feature, X):
    return null_values_count(feature) / X.shape[0]

In [38]:
def min_value(feature, X):
    min_value = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            min_value = min(X[feature.name].dropna())
    
    return min_value

In [39]:
def max_value(feature, X):
    max_value = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            max_value = max(X[feature.name].dropna())
    
    return max_value

In [40]:
def avg_value(feature, X):
    avg_value = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            avg_value = X[feature.name].mean()
    
    return avg_value

In [41]:
def median_value(feature, X):
    median = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            median = X[feature.name].median()
    
    return median

In [42]:
def quantile_25(feature, X):
    quantile_25 = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            quantile_25 = X[feature.name].quantile(q=0.25)
    
    return quantile_25

In [43]:
def quantile_75(feature, X):
    quantile_75 = None
    if feature.data_type not in ['nominal', 'string']:
        if feature.name in X.columns and len(X[feature.name].dropna()) != 0:
            quantile_75 = X[feature.name].quantile(q=0.75)
    
    return quantile_75

In [44]:
def unique_values_count(feature, X, y, dataset):
    unique_values_count = 0
    if feature.name in dataset.default_target_attribute.split(','):
        unique_values_count = len(y.unique())
    else:
        if feature.name in X.columns:
            unique_values_count = len(X[feature.name].unique())
    
    return unique_values_count

In [45]:
def data_type(feature):
    return feature.data_type

In [46]:
def is_target(feature, dataset):
    return feature in dataset.default_target_attribute.split(',') 

In [47]:
for dataset_id in unique_datasets:
    print(dataset_id)
    
    # Get openml dataset object with the current id
    dataset = openml.datasets.get_dataset(dataset_id=int(dataset_id))
    
    # Get dataset
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format="dataframe"
    )
    
    # Iterate over features
    for k in dataset.features:
        feature = dataset.features[k]
        
        # Call functions
        list_feature_id.append(feature_id(feature, dataset_id))
        list_dataset_id.append(dataset_id)
        list_null_values_count.append(null_values_count(feature))
        list_non_null_values_count.append(non_null_values_count(feature, X))
        list_ratio_of_nan_values_to_all.append(ratio_of_nan_values_to_all(feature, X))
        list_min_value.append(min_value(feature, X))
        list_max_value.append(max_value(feature, X))
        list_avg_value.append(avg_value(feature, X))
        list_median_value.append(median_value(feature, X))
        list_25_quantile.append(quantile_25(feature, X))
        list_75_quantile.append(quantile_75(feature, X))
        list_unique_values_count.append(unique_values_count(feature, X, y, dataset))
        list_data_type.append(data_type(feature))
        list_is_target.append(is_target(feature, dataset))

3
29
31
38
50
51
56
333
334
451
470
881
956
959
981
1037
1111
1112
1114
1169
1235
1461
1463
1486
1506
1511
1590
6332
23381
40536
40945
40981
40999
41005
41007
41162


Could not download file from http://openml1.win.tue.nl/dataset41224/dataset_41224.pq: Bucket does not exist or is private.


41224
42178
42343
42344
42738
42750
43098
43607
43890
43892
43896
43897
43900
43922


In [48]:
features_data = {}

features_data['feature_id'] = list_feature_id
features_data['dataset_id'] = list_dataset_id
features_data['null_values_count'] = list_null_values_count
features_data['non_null_values_count'] = list_non_null_values_count
features_data['ratio_of_nan_values_to_all'] = list_ratio_of_nan_values_to_all
features_data['min_value'] = list_min_value
features_data['max_value'] = list_max_value
features_data['avg_value'] = list_avg_value
features_data['median_value'] = list_median_value
features_data['25_quantile'] = list_25_quantile
features_data['75_quantile'] = list_75_quantile
features_data['unique_values_count'] = list_unique_values_count
features_data['data_type'] = list_data_type
features_data['is_target'] = list_is_target

features = pd.DataFrame(features_data)

In [49]:
features.head()

Unnamed: 0,feature_id,dataset_id,null_values_count,non_null_values_count,ratio_of_nan_values_to_all,min_value,max_value,avg_value,median_value,25_quantile,75_quantile,unique_values_count,data_type,is_target
0,0,3,0,3196,0.0,,,,,,,2,nominal,False
1,1,3,0,3196,0.0,,,,,,,2,nominal,False
2,2,3,0,3196,0.0,,,,,,,2,nominal,False
3,3,3,0,3196,0.0,,,,,,,2,nominal,False
4,4,3,0,3196,0.0,,,,,,,2,nominal,False


In [50]:
features.isna().sum()

feature_id                       0
dataset_id                       0
null_values_count                0
non_null_values_count            0
ratio_of_nan_values_to_all       0
min_value                     1081
max_value                     1081
avg_value                     1081
median_value                  1081
25_quantile                   1081
75_quantile                   1081
unique_values_count              0
data_type                        0
is_target                        0
dtype: int64

In [51]:
features.shape

(2387, 14)

In [60]:
len(features['feature_id'].unique())

478

In [52]:
features.to_csv('../../data/preprocessed/features.csv')

# correlations

Correlation metrics


| Data type of feature 1 | Data type of feature 2 | Metric |
| :- | :- | :- |
| Binary | Binary | Cramers V |
| Binary | Categorical | Cramers V |
| Binary | Numeric/Continous | F-value from ANOVA, Point biserial |
| Categorical | Categorical | Cramers V |
| Categorical | Numeric/Continous | F-value from ANOVA |
| Numeric/Continous | Numeric/Continous | Pearson, Spearman |

Mutual Information is used for all

In [101]:
correlations_data = {
    'feature_1_id': list(),
    'feature_2_id': list(),
    'method'      : list(),
    'correlation' : list()
}

list_feature_1_id = []
list_feature_2_id = []
list_method = []
list_correlation = []

In [72]:
from sklearn import preprocessing
from scipy.stats import chi2_contingency
import numpy as np


def cramers_V(feature_1, feature_2):
    label_enc = preprocessing.LabelEncoder()
    feature_1 = label_enc.fit_transform(feature_1)
    feature_2 = label_enc.fit_transform(feature_2)
    
    crosstab = np.array(pd.crosstab(feature_1, feature_2, rownames=None, colnames=None))  # Cross table building
    stat = chi2_contingency(crosstab)[0]  # Keeping of the test statistic of the Chi2 test
    obs = np.sum(crosstab)  # Number of observations
    mini = min(crosstab.shape) - 1  # Take the minimum value between the columns and the rows of the cross table
    
    return (stat / (obs * mini))

In [73]:
import numpy as np


def pearson_correlation(x, y):
    # Calculate the mean of x and y
    mean_x = np.mean(x)
    mean_y = np.mean(y)

    # Calculate the differences from the mean
    diff_x = x - mean_x
    diff_y = y - mean_y

    # Calculate the numerator and denominator of the correlation coefficient formula
    numerator = np.sum(diff_x * diff_y)
    denominator = np.sqrt(np.sum(diff_x**2) * np.sum(diff_y**2))

    # Calculate the Pearson correlation coefficient
    correlation = numerator / denominator

    return correlation

In [74]:
from scipy.stats import pointbiserialr


def point_biserialr(bin_feature, cont_feature):
    return pointbiserialr(bin_feature, cont_feature)

In [100]:
import numpy as np
from scipy import stats


def ANOVA_f_value(data, cat_feature_name, cont_feature_name):
    grouped_data = data.groupby(cat_feature_name)[cont_feature_name].apply(list)
    if len(grouped_data) > 1:
        res = stats.f_oneway(*grouped_data)[0]
    else:
        res = math.nan
    
    return res

In [None]:
for dataset_id in unique_datasets:
    print(dataset_id)
    
    # Get openml dataset object with the current id
    dataset = openml.datasets.get_dataset(dataset_id=int(dataset_id))
    
    # Get dataset
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format="dataframe"
    )
    
    # Iterate over the features
    for k in dataset.features:
        feature_1 = dataset.features[k]
        
        for i in range(k, len(dataset.features)):
            feature_2 = dataset.features[i]
            
            if feature_1.name in X.columns and feature_2.name in X.columns:
                # Differ between data types
                if feature_1.data_type in ['string', 'nominal'] and feature_2.data_type in ['string', 'nominal']:
                    # Both are binary or categorical -> Cramers V
                    method = "Cramers V"
                    corr = cramers_V(X[feature_1.name], X[feature_2.name])

                elif feature_1.data_type in ['string', 'nominal']:
                    # Other value is not categorical, so its numerical
                    # --> F value from ANOVA
                    method = "ANOVA"
                    corr = ANOVA_f_value(data=X[[feature_1.name, feature_2.name]], cat_feature_name=feature_1.name, cont_feature_name=feature_2.name)
                    
                elif feature_2.data_type in ['string', 'nominal']:
                    # Other value is not categorical, so its numerical
                    # --> F value from ANOVA
                    method = "ANOVA"
                    corr = ANOVA_f_value(data=X[[feature_1.name, feature_2.name]], cat_feature_name=feature_2.name, cont_feature_name=feature_1.name)
                    
                else:
                    # Only numerical 
                    # --> Use pearson
                    method = "Pearson"
                    corr = pearson_correlation(X[feature_1.name], X[feature_2.name])

                # Write feature ids
                #correlations_data['feature_1_id'] = correlations_data['feature_1_id'].append(feature_id(feature_1, dataset_id))
                #correlations_data['feature_2_id'] = correlations_data['feature_2_id'].append(feature_id(feature_2, dataset_id))
                #correlations_data['method'] = correlations_data['method'].append(method)
                #correlations_data['correlation'] = correlations_data['correlation'].append(corr)
                list_feature_1_id.append(feature_id(feature_1, dataset_id))
                list_feature_2_id.append(feature_id(feature_2, dataset_id))
                list_method.append(method)
                list_correlation.append(corr)
                
                # Write also the other way 
                if feature_1.name != feature_2.name:
                    list_feature_1_id.append(feature_id(feature_2, dataset_id))
                    list_feature_2_id.append(feature_id(feature_1, dataset_id))
                    list_method.append(method)
                    list_correlation.append(corr)

3
29




31




38


  correlation = numerator / denominator
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  correlation = numerator / denominator
  return (stat / (obs * mini))
  correlation = numerator / denominator
  return (stat / (obs * mini))
  correlation = numerator / denominator
  return (stat / (obs * mini))
  correlation = numerator / denominator
  return (stat / (obs * mini))
  correlation = numerator / denominator
  return (stat / (obs * mini))
  return (stat / (obs * mini))
  correlation = numerator / denominator


50
51


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


56
333
334
451
470
881
956
959
981
1037




1111


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator
  correlation = numerator / denominator


In [None]:
# ToDo: Create pandas dataframe
correlations_data = {}

correlations_data['feature_1_id'] = list_feature_1_id
correlations_data['feature_2_id'] = list_feature_2_id
correlations_data['method'] = list_method
correlations_data['correlation'] = list_correlation

correlations = pd.DataFrame(correlations_data)

In [None]:
correlations.to_csv('../../data/preprocessed/correlations.csv')

# evaluation 

- No tuning is used for all
- Standard parameters used for all

After some tests I found out, that one can only generate tests on active datasets. 

In [6]:
datasets_active_state = openml.datasets.check_datasets_active(unique_datasets)

print("Inactive datasets: ")
for d in datasets_active_state:
    if not datasets_active_state[d]:
        print(f"Need manual experiment setup for dataset id {d}")

Inactive datasets: 
Need manual experiment setup for dataset id 41224


At first I will perform tests on all other datasets. If there is time in the end, I will also look at a manual approach for dataset 41224.

At first authenticate to openml as described in https://openml.github.io/openml-python/develop/examples/20_basic/introduction_tutorial.html#sphx-glr-examples-20-basic-introduction-tutorial-py



### Iteration over all

In [7]:
unique_scoring

array(['ACC', 'AUC', 'F1'], dtype=object)

In [87]:
list_dataset_id = []
list_model = []  # unique_models = ['DTC' 'KNC' 'LGBMC' 'LR' 'SVC']
list_encoding = []
list_scoring = []
list_folds = []
list_cv_score = []
list_std_dev = []
list_dropped_na = []

In [86]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

import numpy as np
import time


datasets_active_state = openml.datasets.check_datasets_active(unique_datasets)
#eval_measures = ['predictive_accuracy', 'area_under_roc_curve', 'f_measure']
evaluation_metrics = {
    'ACC': 'predictive_accuracy',
    'AUC': 'area_under_roc_curve',
    'F1' : 'f_measure'
}


for dataset_id in unique_datasets:
    print(dataset_id)
    
    if datasets_active_state[d]:
        # Dataset is active
        # Define the classifier and preprocessing steps
        imputer = SimpleImputer(strategy='most_frequent')
        # ToDo: Implement the encoders and iterate over them
        encoder = OneHotEncoder(categories='auto', sparse_output=False, handle_unknown='ignore')
        encoder_string = "OHE"
        scaler = StandardScaler()
        
        for model_string in unique_models:
            print(f"  {model_string}")
            
            # Choose classifier
            if model_string == "DTC":
                classifier = DecisionTreeClassifier()
            elif model_string == "KNC":
                classifier = KNeighborsClassifier()
            elif model_string == "LGBMC":
                classifier = LGBMClassifier()
            elif model_string == "LR":
                classifier = LinearRegression()
            elif model_string == "SVC":
                classifier = SVC()
            else:
                print(f"Classifier '{model_string}' is not implemented!")
                continue
            
            # Set up the pipeline
            pipeline = Pipeline(steps=[
                ('imputer', imputer),
                ('encoder', encoder),
                ('scaler', scaler),
                ('classifier', classifier)
            ])

            # Select task if it does not exist
            existing_tasks = openml.tasks.list_tasks(
                task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
            )
            
            for scoring in evaluation_metrics.keys():
                print(f"    {scoring} --- {evaluation_metrics[scoring]}")
                
                # Prepare filter
                task_filter = ((existing_tasks['did'] == dataset_id) & (existing_tasks['estimation_procedure'] == '10-fold Crossvalidation') & (existing_tasks['evaluation_measures'] == evaluation_metrics[scoring]))
                #filtered_tasks = tasks[task_filter]

                # Check if task exists
                if existing_tasks[task_filter].shape[0] > 0:
                    # task exists

                    task = openml.tasks.get_task(task_id=existing_tasks[task_filter].iloc[0].tid)
                else:
                    # Create a new task and publish it
                    print("Create task")
                    new_task = openml.tasks.create_task(task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
                                                    dataset_id=3,
                                                    target_name="class",
                                                    evaluation_measure="f_measure",
                                                    estimation_procedure_id=1   # '10-fold Crossvalidation',
                    )
                    new_task.publish()

                    # Now list tasks again and select the new task
                    existing_tasks = openml.tasks.list_tasks(
                        task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
                    )
                    task = openml.tasks.get_task(task_id=existing_tasks[task_filter].iloc[0].tid)

                # Check if the run has already be performed, if yes load it    
                list_runs = openml.runs.list_runs(task=[task.id])
                list_runs_values = list(list_runs.values())
                if len(list_runs_values) == 0:
                    # Run the task
                    run = openml.runs.run_model_on_task(model=pipeline, task=task, seed=42)
                    run.publish()
                    run_id = run.id
                    
                else:
                    # Get the task which has already be run
                    run_id = list_runs_values[0]['run_id']
                
                # Run the model on the task
                run_result = openml.runs.get_run(run_id)
                
                # For some reason sometimes the results are contained in the run object and sometimes not
                if run_result.fold_evaluations is not None:
                    # Calculate results to get cv_score and std dev
                    cv_score = np.mean(list(run_result.fold_evaluations[evaluation_metrics[scoring]][0].values()))
                    std_dev = np.std(list(run_result.fold_evaluations[evaluation_metrics[scoring]][0].values()))
                else:
                    # Maybe needs to sleep for short after publishing the run
                    time.sleep(3)
                    
                    # Get evaluations
                    list_evaluations = openml.evaluations.list_evaluations(function=evaluation_metrics[scoring], runs=[run_id])
                    eval_result = list(list_evaluations.values())[0]
                    cv_score = eval_result.value
                    std_dev = math.nan
                    
                # Append results to list
                list_dataset_id.append(dataset_id)
                list_model.append(model_string)
                list_encoding.append(encoder_string)
                list_scoring.append(scoring)
                list_folds.append(10)
                list_cv_score.append(cv_score)
                list_std_dev.append(std_dev)

3
  DTC
    ACC --- predictive_accuracy
    AUC --- area_under_roc_curve
    F1 --- f_measure
  KNC
    ACC --- predictive_accuracy
    AUC --- area_under_roc_curve
    F1 --- f_measure
  LGBMC
    ACC --- predictive_accuracy
    AUC --- area_under_roc_curve
    F1 --- f_measure


KeyboardInterrupt: 

In [None]:
evaluations_data = {}

evaluations_data['dataset_id'] = list_dataset_id
evaluations_data['model'] = list_model
evaluations_data['encoding'] = list_encoding
evaluations_data['scoring'] = list_scoring
evaluations_data['folds'] = list_folds
evaluations_data['cv_score'] = list_cv_score
evaluations_data['std_dev'] = list_std_dev

evaluations = pd.DataFrame(data=evaluations_data)

In [None]:
evaluations.to_csv('../../data/preprocessed/evaluations.csv')