In [94]:
!pip install pathlib openml pandas pqdm oslo.concurrency

Collecting oslo.concurrency
  Using cached oslo.concurrency-6.0.0-py3-none-any.whl (48 kB)
Collecting fasteners>=0.7.0
  Using cached fasteners-0.19-py3-none-any.whl (18 kB)
Collecting oslo.utils>=3.33.0
  Using cached oslo.utils-7.1.0-py3-none-any.whl (101 kB)
Collecting oslo.i18n>=3.15.3
  Using cached oslo.i18n-6.3.0-py3-none-any.whl (46 kB)
Collecting oslo.config>=5.2.0
  Using cached oslo.config-9.4.0-py3-none-any.whl (128 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Using cached pbr-6.0.0-py2.py3-none-any.whl (107 kB)
Collecting debtcollector>=1.2.0
  Using cached debtcollector-3.0.0-py3-none-any.whl (23 kB)
Collecting rfc3986>=1.2.0
  Using cached rfc3986-2.0.0-py2.py3-none-any.whl (31 kB)
Collecting stevedore>=1.20.0
  Using cached stevedore-5.2.0-py3-none-any.whl (49 kB)
Collecting netaddr>=0.7.18
  Downloading netaddr-1.3.0-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m334.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

In [6]:
import pickle
import os
# from pqdm.processes import pqdm
from pathlib import Path
from typing import List, Union

import openml
import pandas as pd
from pqdm.threads import pqdm

## Get all data

In [7]:
def get_dataset_description(dataset_id, download_data = False) -> openml.datasets.dataset.OpenMLDataset:
    """
    Fetch a single dataset object from OpenML by its id.

    Qualities and feature metadata are always requested; the data itself is
    only requested when download_data is true.

    Input: dataset_id (int) : The dataset id
           download_data (bool) : Forwarded to openml.datasets.get_dataset

    Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML
    """
    # TODO : Check for objects that do not have qualities being not downloaded properly
    request_options = dict(
        dataset_id=dataset_id,
        download_data=download_data,
        download_qualities=True,
        download_features_meta_data=True,
    )
    return openml.datasets.get_dataset(**request_options)

In [8]:
# install the package oslo.concurrency to ensure thread safety
def get_all_metadata_from_openml(n_jobs = 10, download_data = False) -> Union[List, List]:
    """
    Description: Gets the metadata of every dataset listed on OpenML, cached in a local pickle file.

    If "kaggle_all_dataset_metadata.pkl" exists in the working directory it is loaded and
    returned as is. Otherwise the dataset listing is fetched, every dataset description is
    retrieved with parallel threads (pqdm) and the result is pickled to that file.
    To ensure thread safety, install the package oslo.concurrency.

    Input: n_jobs (int) : Number of parallel pqdm jobs used to fetch the descriptions
           download_data (bool) : Passed to get_dataset_description for every dataset

    Returns: a 3-tuple (openml_data_object, data_id, all_objects): the list of per-dataset
    results produced by pqdm, the list of dataset ids (int) and the dataframe returned by
    openml.datasets.list_datasets.
    NOTE: the return annotation is kept unchanged for compatibility; it does not describe
    this 3-tuple.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from functools import partial

    save_filename = "kaggle_all_dataset_metadata.pkl"

    # Reuse the cached result when present.
    # NOTE: pickle.load can execute arbitrary code - only keep this file if it was
    # produced locally by this notebook.
    if os.path.exists(save_filename):
        print("[INFO] File already exists. Loading from file.")
        with open(save_filename, "rb") as f:
            openml_data_object, data_id, all_objects = pickle.load(f)
        return openml_data_object, data_id, all_objects

    # the id column name is different for dataset and flow, so we need to handle that
    id_column_name = "did"

    # Gather all OpenML objects of the type of data
    print("[INFO] Getting dataset metadata.")
    all_objects = openml.datasets.list_datasets(output_format="dataframe")

    print("[INFO] Checking downloaded files and skipping them.")

    # Read the id column once instead of materialising every row with iloc.
    data_id = [int(did) for did in all_objects[id_column_name]]

    # Initialize cache before using parallel (following OpenML python API documentation).
    # Skipped when the listing is empty so that data_id[0] cannot raise IndexError.
    if data_id:
        print("[INFO] Initializing cache.")
        get_dataset_description(data_id[0])

    # Get all object metadata using n_jobs parallel threads from openml.
    # pqdm forwards extra keyword arguments to tqdm, not to the worker function, so
    # download_data is bound to the worker with partial instead of being passed to pqdm.
    # NOTE(review): with pqdm's default exception handling, failed fetches appear to be
    # returned as exception objects inside the result list - confirm and filter if needed.
    print("[INFO] Getting dataset metadata from OpenML.")
    fetch_description = partial(get_dataset_description, download_data=download_data)
    openml_data_object = pqdm(data_id, fetch_description, n_jobs=n_jobs)

    # Save the metadata to a file
    print("[INFO] Saving metadata to file.")
    with open(save_filename, "wb") as f:
        pickle.dump((openml_data_object, data_id, all_objects), f)

    return openml_data_object, data_id, all_objects

## Create dataframe with combined attributes

In [9]:
def extract_attribute(attribute, attr_name):
    """
    Description: Look up a named attribute on an OpenML object.

    Input: attribute (object) : The OpenML object
           attr_name (str) : Name of the attribute to read

    Returns: The attribute value if it exists, else an empty string.
    """
    fallback = ""
    value = getattr(attribute, attr_name, fallback)
    return value

In [10]:
def create_combined_information_df(
    data_id, descriptions, joined_qualities, joined_features
):
    """
    Description: Build a dataframe holding the combined information of the OpenML objects.

    Input: data_id : The data id(s), stored in column "did"
           descriptions (list) : The descriptions, stored in column "description"
           joined_qualities (list) : The joined qualities, stored in column "qualities"
           joined_features (list) : The joined features, stored in column "features"

    Returns: The dataframe with the four columns above, in that order.
    """
    column_names = ("did", "description", "qualities", "features")
    column_values = (data_id, descriptions, joined_qualities, joined_features)
    return pd.DataFrame(dict(zip(column_names, column_values)))

In [11]:
def combine_metadata(all_dataset_metadata, all_data_description_df):
    """
    Description: Attach the descriptions to the metadata table.

    Input: all_dataset_metadata (pd.DataFrame) : The metadata table,
    all_data_description_df (pd.DataFrame) : The descriptions

    Returns: The inner join of both tables on the "did" column.
    """
    # Only rows whose "did" occurs in both tables survive the inner join.
    merged_metadata = all_dataset_metadata.merge(
        all_data_description_df, how="inner", on="did"
    )

    # A further step that would add a single "Combined_information" column of the form
    # "column - value, column - value, ... description" is currently disabled.
    return merged_metadata

In [12]:

def join_attributes(attribute, attr_name):
    """
    Description: Join the entries of a mapping attribute of the OpenML object into one string.

    Input: attribute (object) : The OpenML object
           attr_name (str) : Name of the mapping attribute to join (e.g. "qualities")

    Returns: The joined attributes if they exist, else an empty string. An attribute that
    is missing or set to None also yields an empty string.
    example: "column : value, column : value,"
    """
    values = getattr(attribute, attr_name, None)

    # The attribute can exist but be unset (None) - presumably when the metadata was not
    # downloaded for this object; treat that like a missing attribute instead of crashing
    # on None.items().
    if values is None:
        return ""

    return " ".join(f"{k} : {v}," for k, v in values.items())

In [13]:
def parse_attributes(attribute_str):
    """
    Reverse the join_attributes function.

    Input: attribute_str (str) : A string of the form "key : value, key : value,"

    Returns: A dict mapping each stripped key to its stripped value (both as strings).
    Items without a ':' are skipped; anything that is not a string (e.g. None or NaN)
    yields an empty dict.

    Limitation: items are separated on ',', so a value that itself contains a comma
    cannot be recovered from the joined format.
    """
    attributes = {}
    if not isinstance(attribute_str, str):
        return attributes

    for item in attribute_str.split(','):
        # Split on the first ':' only, so values that contain a colon are kept
        # instead of being silently dropped.
        key, separator, value = item.partition(':')
        if separator:
            attributes[key.strip()] = value.strip()
    return attributes

In [14]:
def create_metadata_dataframe(
    openml_data_object, data_id, all_dataset_metadata, use_cache = False
):
    """
    Description: Creates a dataframe with all the metadata and the joined description,
    qualities and features columns, and stores it as "kaggle_all_dataset_metadata.csv".

    Input: openml_data_object (list) : The list of OpenML objects
           data_id (list) : The list of data ids
           all_dataset_metadata (pd.DataFrame) : The metadata table
           use_cache (bool) : When true, the CSV file is read and returned and the
           other arguments are not used

    Returns: The combined metadata dataframe, with the parsed qualities appended as columns.

    Raises: FileNotFoundError if use_cache is true and the CSV file does not exist.
    """
    csv_filename = "kaggle_all_dataset_metadata.csv"

    if use_cache:
        # Load the previously saved result instead of rebuilding it. Reading by path lets
        # pandas use its default UTF-8 decoding, matching what to_csv writes below.
        try:
            return pd.read_csv(csv_filename)
        except FileNotFoundError as err:
            # Only the "file is missing" case is translated; other read errors propagate.
            raise FileNotFoundError(
                "Metadata files do not exist. Please run without use_cache."
            ) from err

    descriptions = [
        extract_attribute(attr, "description") for attr in openml_data_object
    ]
    joined_qualities = [
        join_attributes(attr, "qualities") for attr in openml_data_object
    ]
    joined_features = [
        join_attributes(attr, "features") for attr in openml_data_object
    ]

    all_data_description_df = create_combined_information_df(
        data_id, descriptions, joined_qualities, joined_features
    )
    all_dataset_metadata = combine_metadata(
        all_dataset_metadata, all_data_description_df
    )

    # Expand the qualities column into multiple columns.
    # Both frames carry a default 0..n-1 index here (a merge result and a frame built
    # from a list), so the axis=1 concat pairs the rows positionally.
    # NOTE(review): quality names that already exist as columns of the metadata table
    # will appear twice in the result - confirm whether that is intended.
    qualities_expanded = all_dataset_metadata['qualities'].apply(parse_attributes)
    expanded_df = pd.DataFrame(qualities_expanded.tolist())
    expanded_df = pd.concat([all_dataset_metadata, expanded_df], axis=1)

    expanded_df.to_csv(csv_filename, index=False)

    return expanded_df

## Comparison

In [15]:
def compare_dataset_hash(ds1, ds2, hash_fn):
    """
    Description: Check whether two datasets hash to the same value.

    Each dataset is rendered with its to_string() method and the resulting text is
    passed to hash_fn; the two results are compared with ==.

    Input: ds1, ds2 : Objects providing to_string() (e.g. dataframes)
           hash_fn (callable) : Function applied to each rendered string

    Returns: The result of comparing the two hash values for equality.
    """
    first_digest = hash_fn(ds1.to_string())
    second_digest = hash_fn(ds2.to_string())
    return first_digest == second_digest

In [32]:
def get_dataset_sum(df, column_list):
    """
    Description: Get the absolute value of the sum over the given columns of the dataframe.

    The selected columns are converted to numeric on a temporary copy, so the dataframe
    passed in is left untouched (callers usually pass a filtered slice of a larger frame).

    Input: df (pd.DataFrame) : The dataframe to read from
           column_list (list) : The columns to sum

    Returns: abs() of the sum of all values in the selected columns. Values that cannot be
    parsed become NaN (errors='coerce') and a NaN makes the whole result NaN.
    """
    # convert all columns to numeric without writing back into the caller's dataframe
    numeric_values = df[column_list].apply(pd.to_numeric, errors='coerce')

    return abs(numeric_values.values.sum())

In [17]:

def compare_by_func(df1, df2,id1, id2, column_list1, column_list2, func, id_column1="did", id_column2="did"):
    """
    Description: Compare two dataframes using a function given two ids and a list of columns to compare

    Input: df1, df2 (pd.DataFrame) : The dataframes to select rows from
           id1, id2 : The id values selecting the rows of df1 and df2
           column_list1, column_list2 (list) : The columns handed to func for each side
           func (callable) : Called as func(rows, column_list) for each side
           id_column1, id_column2 (str) : Name of the id column in df1 and df2, "did" by default

    Returns: The result of comparing both func results with ==.
    """
    # Hand func an independent copy of the selected rows so that a func which modifies
    # its input can neither touch the caller's frames nor trigger SettingWithCopyWarning.
    rows1 = df1[df1[id_column1] == id1].copy()
    rows2 = df2[df2[id_column2] == id2].copy()
    return func(rows1, column_list1) == func(rows2, column_list2)


# Collate everything
- This takes a bit of time, even with the dataframes downloaded. 

In [19]:
# Fetch (or load from the local pickle cache) the per-dataset objects, the list of
# dataset ids and the dataset listing table. The data itself is not downloaded here.
openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(
        n_jobs=10, download_data=False
    )


[INFO] File already exists. Loading from file.


In [21]:
# Create the combined metadata dataframe.
# With use_cache=True the function reads "kaggle_all_dataset_metadata.csv" and returns
# it; the other arguments are not used on that path. Use use_cache=False to rebuild it.
metadata_df  = create_metadata_dataframe(
    openml_data_object, data_id, all_metadata, use_cache=True
)


# Mushroom

## Getting the metadata

In [22]:
# Select the metadata row(s) named 'mushroom' with version 1 and save them to a CSV.
# version 1 is the oldest one on openml
mushroom = metadata_df[(metadata_df['name'] == 'mushroom') & (metadata_df['version'] == 1)]
mushroom.to_csv('mushroom_metadata.csv', index=False)

In [25]:
# Display the selected mushroom metadata row(s).
mushroom

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,...,RandomTreeDepth2AUC,RandomTreeDepth2ErrRate,RandomTreeDepth2Kappa,RandomTreeDepth3AUC,RandomTreeDepth3ErrRate,RandomTreeDepth3Kappa,StdvNominalAttDistinctValues,kNN1NAUC,kNN1NErrRate,kNN1NKappa
19,24,mushroom,1,1,active,ARFF,4208.0,12.0,3916.0,2.0,...,0.999525,0.000492,0.999014,0.999525,0.000492,0.999014,3.180971,1.0,0.0,1.0


## Get the actual CSV

In [61]:
# Fetch dataset id 24 (the "did" of the mushroom row shown above), this time with the data.
# get_data returns four values; only the first two (X, y) are kept here.
downloaded_data = get_dataset_description(24, download_data=True)
X, y, _, _ = downloaded_data.get_data(dataset_format='dataframe')

In [95]:
# Save the downloaded dataframe X to a CSV file, without the index column.
X.to_csv('mushroom_data.csv', index=False)

## Compare dataset hash

In [96]:
# Test using python inbuilt hash function, feel free to use your own
# X is compared against its transpose, so the result is expected to be False.
# Note: the built-in hash of a str is salted per Python process by default, so these
# hash values are only comparable within one session.
compare_dataset_hash(X, X.T, hash)

False

In [33]:
# Columns of the metadata table that are summed when comparing two datasets below.
comparision_list = ['NumberOfInstances', 'NumberOfFeatures', 'NumberOfMissingValues']

In [35]:
# Testing for different datasets
# Compares the get_dataset_sum result over comparision_list for dataset ids 24 and 25,
# both looked up in metadata_df.
compare_by_func(metadata_df, metadata_df, 24, 25, comparision_list, comparision_list, get_dataset_sum)

False