In [1]:
# A script that walks through all the ChemPile datasets on Hugging Face.
# The ultimate goal is to standardize the terminology used across the datasets and ensure consistency.

! pip install datasets
import datasets
import pandas as pd
import numpy as np



In [2]:
from huggingface_hub import get_collection
from datasets import get_dataset_config_names
from datasets import get_dataset_split_names

# Fetch the ChemPile collection from the Hugging Face Hub.
collection = get_collection('maomlab/chempile-65ca7cc30077b4e77c2d8fa1')

# Keep only the repository name (the part after the namespace) of each entry.
# A collection can also contain models, spaces, and papers; those are filtered
# out so that only real datasets are later passed to the `datasets` loaders.
maom_datasets = [
    item.item_id.split('/')[-1]
    for item in collection.items
    if item.item_type == 'dataset'
]

In [3]:
from datasets import load_dataset

# Survey the schema of every dataset / configuration / split in the collection.
# Each row appended to `data` is:
#     [dataset, config, split, column_name, column_type]

# Some datasets have ~3000 columns; the first few are enough for this survey.
MAX_COLUMNS_PER_SPLIT = 10

# Listed in the collection but not hosted under the 'maomlab' namespace.
SKIPPED_DATASETS = {'DrugMap_Ligandability'}


def _column_type_name(feature):
    """Return a printable type for a single feature of a split.

    Feature objects that expose a ``dtype`` attribute report it directly.
    Nested features declared as a plain ``dict`` or ``list`` have no ``dtype``,
    so the name of the container type is reported instead of raising.
    """
    return getattr(feature, 'dtype', type(feature).__name__)


# Reset on every run so re-executing the cell does not duplicate rows.
data = []

for dataset in maom_datasets:
    if dataset in SKIPPED_DATASETS:
        continue

    dataset_name = f"maomlab/{dataset}"

    # Listing configurations hits the network too; a failure for one dataset
    # must not abort the survey of the remaining ones.
    try:
        configs = get_dataset_config_names(dataset_name)
    except Exception as e:
        print(f"Failed to list configurations for dataset '{dataset}': {e}")
        continue

    for config in configs:
        try:
            # Streaming avoids downloading the data; only the schema is needed.
            loaded_dataset = load_dataset(dataset_name, name=config, streaming=True)

            for split, split_dataset in loaded_dataset.items():
                features = split_dataset.features
                if features is None:
                    # A streaming split can come back without a declared schema.
                    print(f"No declared features for dataset '{dataset}' with configuration '{config}', split '{split}'; skipping.")
                    continue

                for column_name in list(features)[:MAX_COLUMNS_PER_SPLIT]:
                    column_type = _column_type_name(features[column_name])
                    data.append([dataset, config, split, column_name, column_type])

        except Exception as e:
            # Best-effort survey: report the failure and move on to the next configuration.
            print(f"Failed to load or process dataset '{dataset}' with configuration '{config}': {e}")


Downloading readme:   0%|          | 0.00/7.68k [00:00<?, ?B/s]

In [6]:
from pathlib import Path

# Output location, relative to the notebook's working directory so the notebook
# runs on any machine. Point this elsewhere if the file should live in another folder.
OUTPUT_CSV = Path('ChemPile_structure_new.csv')

# One row per surveyed column: which dataset / config / split it belongs to and its type.
df = pd.DataFrame(data, columns=['dataset', 'config', 'split', 'column_name', 'column_type'])
df.to_csv(OUTPUT_CSV, index=False)