In [1]:
!pip install pathlib openml pandas pqdm oslo.concurrency


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from __future__ import annotations
import pickle
import os

# from pqdm.processes import pqdm
from pathlib import Path
from typing import List, Union

import openml
import pandas as pd
from pqdm.threads import pqdm
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Benchmarks

In [144]:
suites = openml.study.list_suites(output_format="dataframe", status="all")

In [145]:
suites.head(4)

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator
14,14,OpenML100,task,"Collaborative, reproducible benchmarking and a...",in_preparation,2019-02-21 18:40:13,1
99,99,OpenML-CC18,task,OpenML-CC18 Curated Classification benchmark,active,2019-02-21 18:47:13,1
218,218,AutoML-Benchmark,task,AutoML Benchmark,in_preparation,2019-05-02 13:35:08,869
219,219,FOREX,task,Forex,in_preparation,2019-06-04 00:45:17,1


In [143]:
# testecc18
suites[suites["alias"] == "testecc18"]

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator,datasets_related_to_benchmark
253,253,testecc18,task,TesteCC18,in_preparation,2020-09-01 00:57:54,8598,"[3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29,..."


In [133]:
len(openml.study.get_suite(14).tasks) == len(openml.study.get_suite(14).data)

True

In [41]:
openml.study.get_suite(14).tasks == openml.study.get_suite(14).data

False

In [42]:
def get_suite_tasks(suite_id: int) -> List[int] | None:
    # return openml.study.get_suite(suite_id).data
    return openml.study.get_suite(suite_id).tasks

In [43]:
tqdm.pandas()
suites["datasets_related_to_benchmark"] = suites["id"].progress_apply(
    lambda x: get_suite_tasks(x)
)

100%|██████████| 91/91 [00:10<00:00,  8.92it/s]


# Datasets

## Get all data

In [44]:
def get_dataset_description(
    dataset_id, download_data=False
) -> openml.datasets.dataset.OpenMLDataset:
    """
    Get the dataset description from OpenML using the dataset id

    Input: dataset_id (int) : The dataset id

    Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML
    """
    # TODO : Check for objects that do not have qualities being not downloaded properly
    # try:

    data = openml.datasets.get_dataset(
        dataset_id=dataset_id,
        download_data=download_data,
        download_qualities=True,
        download_features_meta_data=True,
    )

    return data

In [45]:
# install the package oslo.concurrency to ensure thread safety
def get_all_metadata_from_openml(n_jobs=10, download_data=False) -> Union[List, List]:
    """
    Description: Gets all the metadata from OpenML for the type of data specified in the config.

    This uses parallel threads (pqdm) and so to ensure thread safety, install the package oslo.concurrency.

    Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.
    """

    save_filename = f"kaggle_all_dataset_metadata.pkl"
    # Check if the file already exists
    if os.path.exists(save_filename):
        print("[INFO] File already exists. Loading from file.")
        with open(save_filename, "rb") as f:
            openml_data_object, data_id, all_objects = pickle.load(f)
        return openml_data_object, data_id, all_objects
    else:
        # the id column name is different for dataset and flow, so we need to handle that
        id_column_name = "did"

        # Gather all OpenML objects of the type of data
        print("[INFO] Getting dataset metadata.")
        all_objects = openml.datasets.list_datasets(output_format="dataframe")

        print("[INFO] Checking downloaded files and skipping them.")

        data_id = [
            int(all_objects.iloc[i][id_column_name]) for i in range(len(all_objects))
        ]

        # Initialize cache before using parallel (following OpenML python API documentation)
        print("[INFO] Initializing cache.")
        get_dataset_description(data_id[0])

        # Get all object metadata using n_jobs parallel threads from openml
        print("[INFO] Getting dataset metadata from OpenML.")
        openml_data_object = pqdm(data_id, get_dataset_description, n_jobs=10)

        # Save the metadata to a file
        print("[INFO] Saving metadata to file.")
        with open(save_filename, "wb") as f:
            pickle.dump((openml_data_object, data_id, all_objects), f)

        return openml_data_object, data_id, all_objects

## Create dataframe with combined attributes

In [46]:
def extract_attribute(attribute, attr_name):
    """
    Description: Extract an attribute from the OpenML object.

    Input: attribute (object) : The OpenML object

    Returns: The attribute value if it exists, else an empty string.
    """
    return getattr(attribute, attr_name, "")

In [47]:
def create_combined_information_df(
    data_id, descriptions, joined_qualities, joined_features
):
    """
    Description: Create a dataframe with the combined information of the OpenML object.

    Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object

    Returns: The dataframe with the combined information of the OpenML object.
    """
    return pd.DataFrame(
        {
            "did": data_id,
            "description": descriptions,
            "qualities": joined_qualities,
            "features": joined_features,
        }
    )

In [48]:
def combine_metadata(all_dataset_metadata, all_data_description_df):
    """
    Description: Combine the descriptions with the metadata table.

    Input: all_dataset_metadata (pd.DataFrame) : The metadata table,
    all_data_description_df (pd.DataFrame) : The descriptions

    Returns: The combined metadata table.
    """
    # Combine the descriptions with the metadata table
    all_dataset_metadata = pd.merge(
        all_dataset_metadata, all_data_description_df, on="did", how="inner"
    )

    # Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"

    # all_dataset_metadata["Combined_information"] = all_dataset_metadata.apply(
    #     merge_all_columns_to_string, axis=1
    # )
    return all_dataset_metadata

In [49]:
def join_attributes(attribute, attr_name):
    """
    Description: Join the attributes of the OpenML object.

    Input: attribute (object) : The OpenML object

    Returns: The joined attributes if they exist, else an empty string.
    example: "column - value, column - value, ..."
    """

    return (
        " ".join([f"{k} : {v}," for k, v in getattr(attribute, attr_name, {}).items()])
        if hasattr(attribute, attr_name)
        else ""
    )

In [50]:
def parse_attributes(attribute_str):
    """
    Reverse the join_attributes function
    """
    attributes = {}
    for item in attribute_str.split(","):
        if ":" in item:
            try:
                k, v = item.split(":")
                attributes[k.strip()] = v.strip()
            except ValueError:
                pass
    return attributes

In [51]:
def create_metadata_dataframe(
    openml_data_object, data_id, all_dataset_metadata, use_cache=False
):
    """
    Description: Creates a dataframe with all the metadata, joined columns with all information for the type of data specified in the config.

    Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, config (dict) : The config dictionary

    Returns: The combined metadata dataframe and the metadata table.
    """
    if use_cache == True:
        # If we are not training, we do not need to recreate the cache and can load the metadata from the files. If the files do not exist, raise an exception.
        try:
            with open(f"kaggle_all_dataset_metadata.csv", "r") as f:
                all_data_description_df = pd.read_csv(f)
            return all_data_description_df
        except:
            raise Exception(
                "Metadata files do not exist. Please run without use_cache."
            )
    else:
        descriptions = [
            extract_attribute(attr, "description") for attr in openml_data_object
        ]
        joined_qualities = [
            join_attributes(attr, "qualities") for attr in openml_data_object
        ]
        joined_features = [
            join_attributes(attr, "features") for attr in openml_data_object
        ]

        all_data_description_df = create_combined_information_df(
            data_id, descriptions, joined_qualities, joined_features
        )
        all_dataset_metadata = combine_metadata(
            all_dataset_metadata, all_data_description_df
        )

        # Expand the qualities column into multiple columns
        qualities_expanded = all_dataset_metadata["qualities"].apply(parse_attributes)
        expanded_df = pd.DataFrame(qualities_expanded.tolist())
        expanded_df = pd.concat([all_dataset_metadata, expanded_df], axis=1)

        expanded_df.to_csv(f"kaggle_all_dataset_metadata.csv", index=False)

        return expanded_df

## Comparison

In [52]:
def compare_dataset_hash(ds1, ds2, hash_fn):
    return hash_fn(ds1.to_string()) == hash_fn(ds2.to_string())

In [53]:
def get_dataset_sum(df, column_list):
    """
    Description: Get the sum of the columns in the dataframe
    """
    # convert all columns to numeric
    # df[column_list] = df[column_list].apply(pd.to_numeric, errors='coerce')
    df.loc[:, column_list] = df.loc[:, column_list].apply(
        pd.to_numeric, errors="coerce"
    )

    return abs(df[column_list].values.sum())

In [54]:
def compare_by_func(df1, df2, id1, id2, column_list1, column_list2, func):
    """
    Description: Compare two dataframes using a function given two ids and a list of columns to compare
    TODO : For df2, change the id column to whatever is needed, or add an argument
    """
    return func(df1[df1["did"] == id1], column_list1) == func(
        df2[df2["did"] == id2], column_list2
    )

# Collate everything
- This takes a bit of time, even with the dataframes downloaded. 

In [19]:
openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(
    n_jobs=10, download_data=False
)

[INFO] Getting dataset metadata.
[INFO] Checking downloaded files and skipping them.
[INFO] Initializing cache.
[INFO] Getting dataset metadata from OpenML.


QUEUEING TASKS | : 100%|██████████| 5703/5703 [00:00<00:00, 100991.82it/s]
PROCESSING TASKS | : 100%|██████████| 5703/5703 [00:41<00:00, 138.33it/s]
COLLECTING RESULTS | : 100%|██████████| 5703/5703 [00:00<00:00, 340946.37it/s]


[INFO] Saving metadata to file.


In [110]:
# Create the combined metadata dataframe
metadata_df = create_metadata_dataframe(
    openml_data_object, data_id, all_metadata, use_cache=False
)

In [111]:
metadata_df[metadata_df["name"] == "wine-quality-red"]

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2AUC,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa
2466,40691,wine-quality-red,1,869,active,ARFF,681.0,6.0,10.0,6.0,...,,,,,,,0.0,,,


In [None]:
tasks = openml.tasks.list_tasks(output_format="dataframe", status="all")

## Getting matching benchmarks

In [103]:
tasks

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,...,NumberOfNumericFeatures,NumberOfSymbolicFeatures,number_samples,cost_matrix,source_data_labeled,target_feature_event,target_feature_left,target_feature_right,quality_measure,target_value
0,1,TaskType.SUPERVISED_CLASSIFICATION,1,anneal,Supervised Classification,deactivated,10-fold Crossvalidation,predictive_accuracy,1,class,...,6.0,33.0,,,,,,,,
1,2,TaskType.SUPERVISED_CLASSIFICATION,2,anneal,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,2,class,...,6.0,33.0,,,,,,,,
2,3,TaskType.SUPERVISED_CLASSIFICATION,3,kr-vs-kp,Supervised Classification,active,10-fold Crossvalidation,,3,class,...,0.0,37.0,,,,,,,,
3,4,TaskType.SUPERVISED_CLASSIFICATION,4,labor,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,4,class,...,8.0,9.0,,,,,,,,
4,5,TaskType.SUPERVISED_CLASSIFICATION,5,arrhythmia,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,5,class,...,206.0,74.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259589,362076,TaskType.SUPERVISED_REGRESSION,46139,Cancer_Drug_Response_methylation,Supervised Regression,active,10-fold Crossvalidation,root_mean_squared_error,46139,Methotrexate_GDSC_1008,...,809.0,0.0,,,,,,,,
259590,362077,TaskType.SUPERVISED_REGRESSION,46140,Cancer_Drug_Response_mutation,Supervised Regression,active,10-fold Crossvalidation,root_mean_squared_error,46140,Methotrexate_GDSC_1008,...,34674.0,0.0,,,,,,,,
259591,362078,TaskType.SUPERVISED_REGRESSION,46141,Cancer_Drug_Response_copynumber,Supervised Regression,active,10-fold Crossvalidation,root_mean_squared_error,46141,Methotrexate_GDSC_1008,...,711.0,0.0,,,,,,,,
259592,362079,TaskType.SUPERVISED_REGRESSION,44963,physiochemical_protein,Supervised Regression,active,10-fold Crossvalidation,root_mean_squared_error,44963,RMSD,...,10.0,0.0,,,,,,,,


In [104]:
grouped_count_tasks = tasks.groupby("did").count().reset_index()
grouped_count_tasks.head()

Unnamed: 0,did,tid,ttid,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,...,NumberOfNumericFeatures,NumberOfSymbolicFeatures,number_samples,cost_matrix,source_data_labeled,target_feature_event,target_feature_left,target_feature_right,quality_measure,target_value
0,1,29,29,29,29,29,26,9,29,17,...,29,29,8,0,0,0,0,0,0,0
1,2,52,52,52,52,52,31,9,52,41,...,52,52,10,1,0,0,0,0,20,20
2,3,44,44,44,44,44,34,11,44,33,...,44,44,10,0,0,0,0,0,8,8
3,4,29,29,29,29,29,28,9,29,18,...,29,29,11,0,0,0,0,0,0,0
4,5,32,32,32,32,32,31,9,32,21,...,32,32,12,0,0,0,0,0,0,0


In [101]:
grouped_count[grouped_count["did"] == 43555]

Unnamed: 0,did,tid,ttid,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,...,NumberOfNumericFeatures,NumberOfSymbolicFeatures,number_samples,cost_matrix,source_data_labeled,target_feature_event,target_feature_left,target_feature_right,quality_measure,target_value


In [102]:
grouped_count[grouped_count["did"] == 40691]

Unnamed: 0,did,tid,ttid,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,...,NumberOfNumericFeatures,NumberOfSymbolicFeatures,number_samples,cost_matrix,source_data_labeled,target_feature_event,target_feature_left,target_feature_right,quality_measure,target_value
19612,40691,24,24,24,24,24,23,3,24,13,...,24,24,8,0,0,0,0,0,0,0


In [117]:
datasets_with_tasks = grouped_count["did"].to_list()

In [112]:
metadata_df

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2AUC,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa
0,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,...,0.9296999989655875,0.0801781737193764,0.7953250436852635,0.9296999989655875,0.0801781737193764,0.7953250436852635,1.5576059718800395,0.8721948540771287,0.06347438752783964,0.8261102938928316
1,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,...,0.9451363376145694,0.05444305381727159,0.8908498653739637,0.9451363376145694,0.05444305381727159,0.8908498653739637,0.1643989873053572,0.9728427352982838,0.06914893617021277,0.8609676573571107
2,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,...,0.7500711237553342,0.2982456140350877,0.3376623376623376,0.7500711237553342,0.2982456140350877,0.3376623376623376,0.5270462766947299,0.7675675675675676,0.21052631578947367,0.5581395348837209
3,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,...,0.6217055780491048,0.5066371681415929,0.23296726097846543,0.6217055780491048,0.5066371681415929,0.23296726097846543,1.3341969312340396,0.5985703851111869,0.4646017699115044,0.21277866242038226
4,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,...,0.9080139521161272,0.1769,0.816017080292,0.9080139521161272,0.1769,0.816017080292,0.0,0.9731018928391187,0.05785,0.939833878217649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5698,46271,mabbob_ela_as_5d_regression_DiagonalCMA,1,43372,active,arff,,,,0.0,...,,,,,,,,,,
5699,46272,mabbob_ela_as_5d_regression_DifferentialEvolution,1,43372,active,arff,,,,0.0,...,,,,,,,,,,
5700,46273,mabbob_ela_as_5d_regression_modcma,1,43372,active,arff,,,,0.0,...,,,,,,,,,,
5701,46274,mabbob_ela_as_5d_regression_modde,1,43372,active,arff,,,,0.0,...,,,,,,,,,,


In [105]:
def get_openml_url_from_task_id(task_id):
    return f"https://www.openml.org/search?type=task&sort=runs&id={task_id}"

In [106]:
def get_openml_url_from_dataset_id(did):
    return f"https://www.openml.org/search?type=data&sort=runs&id={did}"

In [107]:
get_openml_url_from_task_id(297)

'https://www.openml.org/search?type=task&sort=runs&id=297'

In [108]:
get_openml_url_from_dataset_id(22)

'https://www.openml.org/search?type=data&sort=runs&id=22'

In [125]:
for did in tqdm(datasets_with_tasks):
    metadata_df.loc[metadata_df["did"] == did, "has_linked_task"] = True
metadata_df["has_linked_task"] = metadata_df["has_linked_task"].fillna(False)

  0%|          | 0/21491 [00:00<?, ?it/s]

100%|██████████| 21491/21491 [00:03<00:00, 7065.45it/s]
  metadata_df["has_linked_task"] = metadata_df["has_linked_task"].fillna(False)


In [129]:
metadata_df["openml_url"] = metadata_df["did"].apply(get_openml_url_from_dataset_id)

In [130]:
metadata_df.head(5)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa,has_linked_task,openml_url
0,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,...,0.7953250436852635,0.9296999989655876,0.0801781737193764,0.7953250436852635,1.5576059718800397,0.8721948540771287,0.0634743875278396,0.8261102938928316,True,https://www.openml.org/search?type=data&sort=r...
1,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,...,0.8908498653739637,0.9451363376145694,0.0544430538172715,0.8908498653739637,0.1643989873053572,0.9728427352982838,0.0691489361702127,0.8609676573571107,True,https://www.openml.org/search?type=data&sort=r...
2,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,...,0.3376623376623376,0.7500711237553342,0.2982456140350877,0.3376623376623376,0.5270462766947299,0.7675675675675676,0.2105263157894736,0.5581395348837209,True,https://www.openml.org/search?type=data&sort=r...
3,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,...,0.2329672609784654,0.6217055780491048,0.5066371681415929,0.2329672609784654,1.3341969312340396,0.5985703851111869,0.4646017699115044,0.2127786624203822,True,https://www.openml.org/search?type=data&sort=r...
4,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,...,0.816017080292,0.9080139521161272,0.1769,0.816017080292,0.0,0.9731018928391189,0.05785,0.939833878217649,True,https://www.openml.org/search?type=data&sort=r...


In [127]:
metadata_df[metadata_df["name"] == "wine-quality-red"]

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa,has_linked_task
2466,40691,wine-quality-red,1,869,active,ARFF,681.0,6.0,10.0,6.0,...,,,,,,0.0,,,,True


In [128]:
metadata_df[metadata_df["name"] == "COVID-19-biotech-companies-on-stock-exchange(2020)"]

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa,has_linked_task
3744,43555,COVID-19-biotech-companies-on-stock-exchange(2...,1,30126,active,arff,,,,,...,,,,,,,,,,False


In [149]:
metadata_df.to_csv("kaggle_all_dataset_metadata_with_linked_tasks.csv", index=False)

# Mushroom

## Getting the metadata

In [22]:
# version 1 is the oldest one on openml
mushroom = metadata_df[
    (metadata_df["name"] == "mushroom") & (metadata_df["version"] == 1)
]
mushroom.to_csv("mushroom_metadata.csv", index=False)

In [25]:
mushroom

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2AUC,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa
19,24,mushroom,1,1,active,ARFF,4208.0,12.0,3916.0,2.0,...,0.999525,0.000492,0.999014,0.999525,0.000492,0.999014,3.180971,1.0,0.0,1.0


## Get the actual CSV

In [61]:
downloaded_data = get_dataset_description(24, download_data=True)
X, y, _, _ = downloaded_data.get_data(dataset_format="dataframe")

In [95]:
X.to_csv("mushroom_data.csv", index=False)

## Compare dataset hash

In [96]:
# Test using python inbuilt hash function, feel free to use your own
compare_dataset_hash(X, X.T, hash)

False

In [33]:
comparision_list = ["NumberOfInstances", "NumberOfFeatures", "NumberOfMissingValues"]

In [35]:
# Testing for different datasets
compare_by_func(
    metadata_df,
    metadata_df,
    24,
    25,
    comparision_list,
    comparision_list,
    get_dataset_sum,
)

False