In [None]:
!pip install git+https://github.com/monarch-initiative/curate-gpt.git
!pip install huggingface_hub pyyaml pandas pyarrow

In [20]:
# Import necessary libraries
from huggingface_hub import HfApi, create_repo
import yaml

In [None]:
!wget https://data.monarchinitiative.org/monarch-kg/latest/monarch-kg.tar.gz
!tar -xvzf monarch-kg.tar.gz

In [None]:
!curategpt index -p stagedb -c monarch_kg -m openai: data/monarch-kg_nodes.tsv

In [None]:
import pandas as pd
from curate_gpt import ChromaDBAdapter

def fetch_embeddings_from_chromadb(path, collection):
    # Initialize the database adapter
    db = ChromaDBAdapter(path)
    
    # Fetch embeddings from the specified collection using get
    collection_obj = db.client.get_collection(name=collection)
    results = collection_obj.get(include=["embeddings"])
    
    # Extract embeddings
    embeddings = []
    for metadata in results['metadatas']:
        if 'embeddings' in metadata:
            embeddings.append(metadata['embeddings'])
        else:
            raise KeyError(f"Embeddings not found in metadata: {metadata}")
    
    return embeddings

def export_embeddings_to_parquet(path, collection, output_file):
    # Fetch embeddings
    embeddings = fetch_embeddings_from_chromadb(path, collection)
    
    # Convert embeddings to DataFrame
    df_embeddings = pd.DataFrame(embeddings)
    
    # Export DataFrame to Parquet file
    df_embeddings.to_parquet(output_file, engine='pyarrow')
    print(f"Embeddings have been successfully exported to {output_file}")

# Example usage
path_to_chromadb = './stagedb'
collection_name = 'monarch_kg'
output_parquet_file = 'monarch_text_embeddings.parquet'
output_parquet_file = '~/PythonProject/curate-gpt/notebooks/command-line/'

export_embeddings_to_parquet(path_to_chromadb, collection_name, output_parquet_file)

In [21]:
# Generate metadata in venomx format
metadata = {
    'description': 'Embeddings of the Monarch KG nodes, generated using curategpt and the nodes.tsv file from the Monarch KG version 2024-07-12',
    'model': {
        'name': 'text-embedding-ada-002'
    },
    'dataset': {
        'name': 'Monarch KG 2024-07-12',
        'url': 'https://data.monarchinitiative.org/monarch-kg/2024-07-12/'
    }
}

# Save the metadata to a YAML file
metadata_file_path = './metadata.yaml'
with open(metadata_file_path, 'w') as f:
    yaml.dump(metadata, f)

print(f"Metadata saved to {metadata_file_path}")

Metadata saved to ./metadata.yaml


In [19]:
# Upload to Hugging Face
repo_id = "biomedical-translator/monarch_kg_node_text_embeddings"
create_repo(repo_id, repo_type="dataset")

this_notebook_path = "index-monarch-kg.ipynb"

api = HfApi()
files_to_upload = [output_parquet_file, metadata_file_path, this_notebook_path]

for file in files_to_upload:
    api.upload_file(
        path_or_fileobj=file,
        path_in_repo=file,
        repo_id=repo_id,
        repo_type="dataset"
    )

print(f"Files uploaded to Hugging Face in repository: {repo_id}")

ValueError: Provided path: 'monarch_text_embeddings.parquet' is not a file on the local file system