In [1]:
import os
import pandas as pd
import yaml
import pyarrow.feather as feather

In [14]:
# Path to the benchmark directory
benchmark_path = os.path.expanduser("~/benchmark")

# Initialize an empty list to store metadata
metadata_list = []

# Traverse the benchmark directory
for dataset_name in os.listdir(benchmark_path):
    dataset_path = os.path.join(benchmark_path, dataset_name)

    # Check if the current item is a directory
    if os.path.isdir(dataset_path):
        # Find the TSV file in the directory
        tsv_file = next(
            (f for f in os.listdir(dataset_path) if f.endswith(".tsv.gz")), None
        )

        # Check if a TSV file was found
        if tsv_file:
            tsv_file_path = os.path.join(dataset_path, tsv_file)

            # Read the TSV file with pandas to get the number of rows and columns
            df = pd.read_csv(tsv_file_path, sep="\t", compression="gzip")
            y = df.iloc[:, -1]
            X = df.iloc[:, :-1]
            n, p = X.shape
            n_unique_df = X.drop_duplicates().shape[0]

            # Count unique values of y
            n_unique_y = y.nunique()

            # Count unique rows of X
            n_unique_X = X.drop_duplicates().shape[0]
            n_unique_X_avg = X.nunique().mean()
            n_constant = sum(X.nunique() == 1)
            n_binary = sum(X.nunique() == 2)
            n_categorical = sum((X.nunique() >= 3) & (X.nunique() <= 5))
            n_continuous = sum(X.nunique() > 5)

            # Column index
            idx_constant = [
                idx + 1 for idx, col in enumerate(X.columns) if X[col].nunique() == 1
            ]
            idx_binary = [
                idx + 1 for idx, col in enumerate(X.columns) if X[col].nunique() == 2
            ]
            idx_categorical = [
                idx + 1
                for idx, col in enumerate(X.columns)
                if 3 <= X[col].nunique() <= 5
            ]
            idx_continuous = [
                idx + 1 for idx, col in enumerate(X.columns) if X[col].nunique() > 5
            ]

            # Find the task in the metadata.yaml file
            metadata_file_path = os.path.join(dataset_path, "metadata.yaml")
            with open(metadata_file_path, "r") as metadata_file:
                metadata_content = yaml.safe_load(metadata_file)
                task = metadata_content.get("task", "N/A")

            # Append metadata to the list
            metadata_list.append(
                {
                    "dataset_name": dataset_name,
                    "n": n,
                    "p": p,
                    "task": task,
                    "n_unique_df": n_unique_df,
                    "n_unique_y": n_unique_y,
                    "n_unique_X": n_unique_X,
                    "n_unique_X_avg": n_unique_X_avg,
                    "n_constant": n_constant,
                    "n_binary": n_binary,
                    "n_categorical": n_categorical,
                    "n_continuous": n_continuous,
                    "p_unique_df": n_unique_df / n,
                    "p_unique_y": n_unique_y / n,
                    "p_unique_X": n_unique_X / n,
                    "p_unique_X_avg": n_unique_X_avg / n,
                    "p_binary": n_binary / p,
                    "p_categorical": n_categorical / p,
                    "p_continuous": n_continuous / p,
                    "idx_constant": idx_constant,
                    "idx_binary": idx_binary,
                    "idx_categorical": idx_categorical,
                    "idx_continuous": idx_continuous,
                }
            )

# Create a DataFrame from the metadata list
metadata_df = pd.DataFrame(metadata_list)

In [17]:
metadata_df_small = metadata_df[
    (metadata_df["p_unique_y"] >= 0.9)
    & (metadata_df["p_continuous"] == 1)
    & ((metadata_df["p"] >= 20) | ((metadata_df["p"] >= 10) & (metadata_df["n"] < 200)))
]
metadata_df_small = metadata_df_small[
    metadata_df_small["dataset_name"] != "207_autoPrice"
].reset_index(drop=True)
metadata_df_small

Unnamed: 0,dataset_name,n,p,task,n_unique_df,n_unique_y,n_unique_X,n_unique_X_avg,n_constant,n_binary,...,p_unique_y,p_unique_X,p_unique_X_avg,p_binary,p_categorical,p_continuous,idx_constant,idx_binary,idx_categorical,idx_continuous
0,651_fri_c0_100_25,100,25,regression,100,100,100,100.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,626_fri_c2_500_50,500,50,regression,500,500,500,500.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,621_fri_c0_100_10,100,10,regression,100,100,100,100.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
3,583_fri_c1_1000_50,1000,50,regression,1000,1000,1000,1000.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,633_fri_c0_500_25,500,25,regression,500,500,500,500.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
5,586_fri_c3_1000_25,1000,25,regression,1000,1000,1000,1000.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
6,645_fri_c3_500_50,500,50,regression,500,500,500,500.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
7,584_fri_c4_500_25,500,25,regression,500,500,500,500.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
8,618_fri_c3_1000_50,1000,50,regression,1000,1000,1000,1000.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
9,620_fri_c1_1000_25,1000,25,regression,1000,1000,1000,1000.0,0,0,...,1.0,1.0,1.0,0.0,0.0,1.0,[],[],[],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


In [18]:
feather.write_feather(
    metadata_df,
    os.path.normpath("../results_blackbox/metadata.feather"),
)