In [20]:
import requests
import json
import pandas as pd
import time

In [2]:
def fetch_all_study_set(base_url="https://api.microbiomedata.org/nmdcschema/study_set", max_page_size=20, timeout=30):
    """Download every document from a paginated collection endpoint.

    Requests pages from ``base_url`` one after another, forwarding the
    ``next_page_token`` of each response as the ``page_token`` of the next
    request, until a response carries no further token.

    Args:
        base_url: Collection endpoint to query.
        max_page_size: Value sent as the ``max_page_size`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list with the concatenated ``resources`` entries of all pages.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    all_resources = []
    page_token = None

    while True:
        # The first request carries no token; later ones continue where
        # the previous page stopped.
        params = {"max_page_size": max_page_size}
        if page_token:
            params["page_token"] = page_token

        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()  # Surface HTTP errors instead of parsing an error body
        data = response.json()

        # `or []` covers both a missing key and an explicit null value
        all_resources.extend(data.get("resources") or [])

        # A missing/empty token marks the final page
        page_token = data.get("next_page_token")
        if not page_token:
            break

    return all_resources

In [3]:
# Fetch all documents: the helper keeps requesting pages until the API
# stops returning a next_page_token, so this holds the full collection.
study_set_data = fetch_all_study_set()

In [4]:
# Sanity check: report how many documents the paginated fetch returned.
print(f"Retrieved {len(study_set_data)} documents from study_set collection.")

Retrieved 32 documents from study_set collection.


In [5]:
# Persist the raw study documents as pretty-printed JSON (4-space indent)
with open("study_set_data.json", "w") as out_handle:
    json.dump(obj=study_set_data, fp=out_handle, indent=4)
print("Saved study_set_data.json")

Saved study_set_data.json


In [18]:
# Convert main study set data to TSV
#
# Each study record is flattened into three row collections:
#   * tsv_rows                - one row per study (multi-valued fields pipe-joined)
#   * credit_association_rows - one row per (study, credited person)
#   * dois_rows               - one row per (study, associated DOI)
# publication_dois additionally collects the DOI values whose category is
# "publication_doi".
columns = [
    "id", "name", "description", "ecosystem", "ecosystem_category",
    "ecosystem_type", "ecosystem_subtype", "specific_ecosystem", "title",
    "study_category", "funding_sources", "gold_study_identifiers",
    "part_of", "websites", "pi_name", "pi_email", "pi_orcid", "pi_profile_image_url", "study_dois"
]

tsv_rows = []
credit_association_rows = []
dois_rows = []
publication_dois = []

for record in study_set_data:
    study_id = record.get("id", "")
    # `or {}` / `or []` cover keys that are present but null as well as
    # keys that are missing altogether.
    pi = record.get("principal_investigator") or {}
    associated_dois = record.get("associated_dois") or []
    study_dois = "|".join([doi.get("doi_value") or "" for doi in associated_dois])
    row = {
        "id": study_id,
        "name": record.get("name", ""),
        "description": record.get("description", ""),
        "ecosystem": record.get("ecosystem", ""),
        "ecosystem_category": record.get("ecosystem_category", ""),
        "ecosystem_type": record.get("ecosystem_type", ""),
        "ecosystem_subtype": record.get("ecosystem_subtype", ""),
        "specific_ecosystem": record.get("specific_ecosystem", ""),
        "title": record.get("title", ""),
        "study_category": record.get("study_category", ""),
        "funding_sources": "|".join(record.get("funding_sources") or []),
        "gold_study_identifiers": "|".join(record.get("gold_study_identifiers") or []),
        "part_of": "|".join(record.get("part_of") or []),
        "websites": "|".join(record.get("websites") or []),
        "pi_name": pi.get("has_raw_value", ""),
        "pi_email": pi.get("email", ""),
        "pi_orcid": pi.get("orcid", ""),
        "pi_profile_image_url": pi.get("profile_image_url", ""),
        "study_dois": study_dois
    }
    tsv_rows.append(row)

    # Process credit associations separately
    for credit in record.get("has_credit_associations") or []:
        # A credit entry without a person yields blank fields instead of
        # aborting the whole export with a KeyError.
        person = credit.get("applies_to_person") or {}
        credit_row = {
            "study_id": study_id,
            "name": person.get("name", ""),
            "email": person.get("email", ""),
            "orcid": person.get("orcid", ""),
            "applied_roles": "|".join(credit.get("applied_roles") or [])
        }
        credit_association_rows.append(credit_row)

    # Process associated DOIs separately
    for doi in associated_dois:
        doi_value = doi.get("doi_value")
        doi_row = {
            "study_id": study_id,
            "doi_value": doi_value or "",
            "doi_category": doi.get("doi_category", ""),
            "doi_provider": doi.get("doi_provider", "")
        }
        dois_rows.append(doi_row)

        # Filter publication DOIs. Skip entries without a value and DOIs
        # already collected from another study, so each is listed once
        # (first-seen order is preserved).
        if (
            doi.get("doi_category") == "publication_doi"
            and doi_value
            and doi_value not in publication_dois
        ):
            publication_dois.append(doi_value)

# Write each collected table to its own tab-separated file (no index column).

# Main study table, restricted to / ordered by `columns`
ts_file_path = "study_set_data.tsv"
df = pd.DataFrame(data=tsv_rows, columns=columns)
df.to_csv(ts_file_path, sep="\t", index=False)
print("Saved study_set_data.tsv")

# Credit associations: one row per study/person pair
credit_ts_file_path = "credit_associations.tsv"
df_credit = pd.DataFrame(
    data=credit_association_rows,
    columns=["study_id", "name", "email", "orcid", "applied_roles"],
)
df_credit.to_csv(credit_ts_file_path, sep="\t", index=False)
print("Saved credit_associations.tsv")

# Associated DOIs: one row per study/DOI pair
dois_ts_file_path = "associated_dois.tsv"
df_dois = pd.DataFrame(
    data=dois_rows,
    columns=["study_id", "doi_value", "doi_category", "doi_provider"],
)
df_dois.to_csv(dois_ts_file_path, sep="\t", index=False)
print("Saved associated_dois.tsv")


Saved study_set_data.tsv
Saved credit_associations.tsv
Saved associated_dois.tsv


In [22]:
# Query CrossRef for citation metadata
def get_crossref_metadata(doi, timeout=30):
    """Fetch metadata from CrossRef API for a given DOI.

    Args:
        doi: DOI string appended to the CrossRef ``/works/`` endpoint.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``message`` object of the CrossRef response (an empty dict if the
        response has no ``message``), or ``None`` when the request fails or
        returns a status other than 200. Failures are reported via ``print``
        rather than raised, so one bad DOI does not abort a batch.
    """
    from urllib.parse import quote  # local import: only this function needs it

    # Percent-encode characters that would otherwise change the URL's
    # meaning (e.g. '#', '?', spaces); '/' and ':' are kept literal so
    # ordinary DOIs produce the same URL as plain interpolation.
    encoded_doi = quote(doi, safe="/:")
    url = f"https://api.crossref.org/works/{encoded_doi}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure (DNS, connection reset, timeout, ...)
        print(f"Failed to retrieve metadata for {doi} ({exc})")
        return None

    if response.status_code == 200:
        return response.json().get("message", {})  # Keep the entire message payload

    print(f"Failed to retrieve metadata for {doi} (Status Code: {response.status_code})")
    return None

# Metadata keyed by DOI; a failed lookup is stored as None
citations = dict()

# Query CrossRef once per publication DOI, pausing between requests
for doi in publication_dois:
    print(f"Fetching metadata for {doi}")
    metadata = get_crossref_metadata(doi)
    citations[doi] = metadata
    time.sleep(1)  # Avoid rate limits

# Write everything that was collected to a pretty-printed JSON file
with open("doi_metadata.json", "w") as out_handle:
    json.dump(obj=citations, fp=out_handle, indent=4)

print("Saved DOI metadata to doi_metadata.json")

Fetching metadata for doi:10.1128/mSystems.00045-18
Fetching metadata for doi:10.1101/2022.12.12.520098
Fetching metadata for doi:10.1038/s41564-022-01266-x
Fetching metadata for doi:10.1111/1462-2920.16314
Fetching metadata for doi:10.1111/mec.16891
Fetching metadata for doi:10.1021/acs.estlett.0c00748
Fetching metadata for doi:10.1128/msystems.00768-19
Fetching metadata for doi:10.1371/journal.pone.0228165
Fetching metadata for doi:10.1016/j.geoderma.2021.115674
Fetching metadata for doi:10.1029/2022JG006889
Fetching metadata for doi:10.1002/ppp.2200
Fetching metadata for doi:10.1038/s41467-023-36515-y
Fetching metadata for doi:10.1002/lno.11306
Fetching metadata for doi:10.1038/s41597-024-03069-7
Fetching metadata for doi:10.1038/s41564-020-00861-0
Saved DOI metadata to doi_metadata.json


In [13]:
# Google Sheet URL (STREAMS guidelines -> STREAMS_final), using the CSV export endpoint
sheet_url = "https://docs.google.com/spreadsheets/d/1lrwZZCHf9ik-TRrMEyN-NJrmZ8uhG4NyDuPH9xl5G1E/export?format=csv&gid=2118203602"

# Destination for the tab-separated copy
tsv_file_path = "streams_final.tsv"

# Load the sheet and drop the columns at zero-based positions 5 and 6
# (the 6th and 7th columns). Per the original author's note these are
# "Yes/No/NA" and "Comments or location in manuscript", which still leaves
# "Example(s)", "Present in the manuscript?" and
# "Comments or location in manuscript" in the output.
df = pd.read_csv(sheet_url).pipe(
    lambda sheet: sheet.drop(columns=sheet.columns[[5, 6]])
)
df.to_csv(tsv_file_path, sep="\t", index=False)

print(f"Saved data to {tsv_file_path}")


Saved data to streams_final.tsv
