In [1]:
import requests
import json
import pandas as pd
import time
import fitz
import os
import re
from datetime import datetime

In [2]:
def fetch_all_study_set(base_url="https://api.microbiomedata.org/nmdcschema/study_set", max_page_size=20):
    all_resources = []
    page_token = None

    while True:
        # Construct request URL with pagination
        params = {"max_page_size": max_page_size}
        if page_token:
            params["page_token"] = page_token

        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise error for failed requests
        data = response.json()

        # Store results
        if "resources" in data:
            all_resources.extend(data["resources"])

        # Check for next_page_token
        page_token = data.get("next_page_token")
        if not page_token:
            break  # Exit loop when no more pages

    return all_resources

In [3]:
# Fetch all documents
study_set_data = fetch_all_study_set()

In [4]:
print(f"Retrieved {len(study_set_data)} documents from study_set collection.")

Retrieved 32 documents from study_set collection.


In [5]:
# Save to JSON file
with open("study_set_data.json", "w") as json_file:
    json.dump(study_set_data, json_file, indent=4)
print("Saved study_set_data.json")

Saved study_set_data.json


In [6]:
# Convert main study set data to TSV
columns = [
    "id", "name", "description", "ecosystem", "ecosystem_category",
    "ecosystem_type", "ecosystem_subtype", "specific_ecosystem", "title",
    "study_category", "funding_sources", "gold_study_identifiers",
    "part_of", "websites", "pi_name", "pi_email", "pi_orcid", "pi_profile_image_url", "study_dois"
]

tsv_rows = []
credit_association_rows = []
dois_rows = []
publication_dois = []

for record in study_set_data:
    pi = record.get("principal_investigator", {})
    study_dois = "|".join([doi.get("doi_value", "") for doi in record.get("associated_dois", [])])
    row = {
        "id": record.get("id", ""),
        "name": record.get("name", ""),
        "description": record.get("description", ""),
        "ecosystem": record.get("ecosystem", ""),
        "ecosystem_category": record.get("ecosystem_category", ""),
        "ecosystem_type": record.get("ecosystem_type", ""),
        "ecosystem_subtype": record.get("ecosystem_subtype", ""),
        "specific_ecosystem": record.get("specific_ecosystem", ""),
        "title": record.get("title", ""),
        "study_category": record.get("study_category", ""),
        "funding_sources": "|".join(record.get("funding_sources", [])),
        "gold_study_identifiers": "|".join(record.get("gold_study_identifiers", [])),
        "part_of": "|".join(record.get("part_of", [])),
        "websites": "|".join(record.get("websites", [])),
        "pi_name": pi.get("has_raw_value", ""),
        "pi_email": pi.get("email", ""),
        "pi_orcid": pi.get("orcid", ""),
        "pi_profile_image_url": pi.get("profile_image_url", ""),
        "study_dois": study_dois
    }
    tsv_rows.append(row)

    # Process credit associations separately
    for credit in record.get("has_credit_associations", []):
        credit_row = {
            "study_id": record.get("id", ""),
            "name": credit["applies_to_person"].get("name", ""),
            "email": credit["applies_to_person"].get("email", ""),
            "orcid": credit["applies_to_person"].get("orcid", ""),
            "applied_roles": "|".join(credit.get("applied_roles", []))
        }
        credit_association_rows.append(credit_row)

    # Process associated DOIs separately
    for doi in record.get("associated_dois", []):
        doi_row = {
            "study_id": record.get("id", ""),
            "doi_value": doi.get("doi_value", ""),
            "doi_category": doi.get("doi_category", ""),
            "doi_provider": doi.get("doi_provider", "")
        }
        dois_rows.append(doi_row)

        # Filter publication DOIs
        if doi.get("doi_category") == "publication_doi":
            publication_dois.append(doi.get("doi_value"))

# Save main study set TSV
df = pd.DataFrame(tsv_rows, columns=columns)
ts_file_path = "study_set_data.tsv"
df.to_csv(ts_file_path, sep="\t", index=False)
print("Saved study_set_data.tsv")

# Save credit associations TSV
df_credit = pd.DataFrame(credit_association_rows, columns=["study_id", "name", "email", "orcid", "applied_roles"])
credit_ts_file_path = "credit_associations.tsv"
df_credit.to_csv(credit_ts_file_path, sep="\t", index=False)
print("Saved credit_associations.tsv")

# Save associated DOIs TSV
df_dois = pd.DataFrame(dois_rows, columns=["study_id", "doi_value", "doi_category", "doi_provider"])
dois_ts_file_path = "associated_dois.tsv"
df_dois.to_csv(dois_ts_file_path, sep="\t", index=False)
print("Saved associated_dois.tsv")


Saved study_set_data.tsv
Saved credit_associations.tsv
Saved associated_dois.tsv


In [7]:
def get_crossref_metadata(doi):
    """Fetch metadata from CrossRef API for a given DOI."""
    doi = doi.replace("doi:", "")  # Ensure consistency in DOI format
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)

    if response.status_code == 200:
        return response.json().get("message", {})

    print(f"Failed to retrieve metadata for {doi} (Status Code: {response.status_code})")
    return None

In [8]:
eloe_fadrosh_dois = [
    "doi:10.1128/MRA.01361-19",
    "doi:10.1128/mra.01080-23",
    "doi:10.1038/s41597-019-0132-4",
    "doi:10.1038/s41597-024-04013-5",
    "doi:10.1186/s40168-020-00889-8",
]

In [9]:
misc_dois = [
    "doi:10.1093/nar/gkab990",
    "doi:10.1126/sciadv.adg7888",
    "doi:10.1371/journal.pone.0228165",
    "doi:10.3390/microorganisms9020357",
    "doi:10.3897/tdwgproceedings.1.20637",
    "doi:10.5194/acp-23-15783-2023",
]

In [10]:
combined_dois = publication_dois + eloe_fadrosh_dois + misc_dois

In [11]:
# Dictionary to store metadata
citations = {}

# Fetch metadata and handle "is-preprint-of"
for doi in combined_dois:
    clean_doi = doi.replace("doi:", "")  # Remove "doi:" prefix
    print(f"Fetching metadata for {clean_doi}")
    metadata = get_crossref_metadata(clean_doi)

    updated_doi = None
    updated_metadata = None

    if metadata:
        citations[clean_doi] = metadata

        # Extract "is-preprint-of" DOI from relation field
        relation = metadata.get("relation", {}).get("is-preprint-of", [])
        if relation and isinstance(relation, list) and isinstance(relation[0], dict):
            updated_doi = relation[0].get("id")
            if updated_doi:
                print(f"{clean_doi} was found to be a pre-print of {updated_doi}")
                print(f"Fetching metadata for published version: {updated_doi}")
                updated_metadata = get_crossref_metadata(updated_doi)
                if updated_metadata:
                    citations[updated_doi] = updated_metadata  # Save updated citation metadata

    time.sleep(1)  # Avoid rate limits

# Save the citation data as JSON
with open("doi_metadata.json", "w") as json_file:
    json.dump(citations, json_file, indent=4)
print("Saved DOI metadata to doi_metadata.json")



Fetching metadata for 10.1128/mSystems.00045-18
Fetching metadata for 10.1101/2022.12.12.520098
10.1101/2022.12.12.520098 was found to be a pre-print of 10.3389/fsoil.2023.1120425
Fetching metadata for published version: 10.3389/fsoil.2023.1120425
Fetching metadata for 10.1038/s41564-022-01266-x
Fetching metadata for 10.1111/1462-2920.16314
Fetching metadata for 10.1111/mec.16891
Fetching metadata for 10.1021/acs.estlett.0c00748
Fetching metadata for 10.1128/msystems.00768-19
Fetching metadata for 10.1371/journal.pone.0228165
Fetching metadata for 10.1016/j.geoderma.2021.115674
Fetching metadata for 10.1029/2022JG006889
Fetching metadata for 10.1002/ppp.2200
Fetching metadata for 10.1038/s41467-023-36515-y
Fetching metadata for 10.1002/lno.11306
Fetching metadata for 10.1038/s41597-024-03069-7
Fetching metadata for 10.1038/s41564-020-00861-0
Fetching metadata for 10.1128/MRA.01361-19
Fetching metadata for 10.1128/mra.01080-23
Fetching metadata for 10.1038/s41597-019-0132-4
Fetching met

In [15]:
# Prepare list to store rows for TSV
rows = []

# Ranked date fields for extracting Year
DATE_FIELDS = [
    "published-print", "published", "issued", "posted", "accepted", "created", "indexed"
]

for doi in combined_dois:  # Only include original DOIs in the TSV
    clean_doi = doi.replace("doi:", "")
    metadata = citations.get(clean_doi, {})
    updated_doi = None

    # Check if there's an updated DOI
    relation = metadata.get("relation", {}).get("is-preprint-of", [])
    if relation and isinstance(relation, list) and isinstance(relation[0], dict):
        updated_doi = relation[0].get("id")

    # Prefer metadata from the updated DOI if available
    source_metadata = citations.get(updated_doi, metadata)

    # Extract the best available Year
    year = ""
    year_type = ""
    for field in DATE_FIELDS:
        date_parts = source_metadata.get(field, {}).get("date-parts", [[None]])
        if date_parts and date_parts[0][0]:
            year = str(int(date_parts[0][0]))  # Convert to integer-like string
            year_type = field
            break

    row = {
        "DOI": clean_doi,
        "Updated DOI": updated_doi if updated_doi else "",
        "Title": "; ".join(source_metadata.get("title", [])),
        "Journal": "; ".join(source_metadata.get("container-title", [])),
        "Year": year,
        "Year Type": year_type,
        "Publisher": source_metadata.get("publisher", ""),
        "Authors": "; ".join([f"{a.get('given', '')} {a.get('family', '')}" for a in source_metadata.get("author", [])]),
        "Volume": source_metadata.get("volume", ""),
        "Issue": source_metadata.get("issue", ""),
        "Pages": source_metadata.get("page", ""),
        "DOI URL": source_metadata.get("URL", ""),
    }
    rows.append(row)

# Convert to DataFrame
df = pd.DataFrame(rows)

# Save as TSV
df.to_csv("doi_metadata.tsv", sep="\t", index=False)
print("Saved parsed citations to doi_metadata.tsv")

Saved parsed citations to parsed_citations.tsv


In [12]:
def extract_pdf_metadata(pdf_path):
    """Extract metadata from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata  # Extract metadata dictionary
        return metadata if metadata else {}
    except Exception as e:
        print(f"Error extracting metadata from {pdf_path}: {e}")
        return {}

def parse_pdf_date(pdf_date):
    """Convert Adobe PDF date format to YYYY-MM-DD."""
    match = re.match(r"D:(\d{4})(\d{2})(\d{2})", pdf_date)
    if match:
        year, month, day = match.groups()
        return f"{year}-{month}-{day}"
    return ""

def extract_doi(text):
    """Extract DOI from text."""
    match = re.search(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", text)
    return match.group(0) if match else ""

def extract_journal_info(text):
    """Extract volume, issue, and page numbers from subject field."""
    match = re.search(r"(\d{4})\.(\d+):?([\w\d-]*)", text)
    if match:
        year, volume, pages = match.groups()
        return year, volume, pages
    return "", "", ""




In [13]:
# Directory containing PDFs
pdf_dir = "../../../local/pdfs/"

# Collect metadata from all PDFs
pdf_metadata = {}

for filename in os.listdir(pdf_dir):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, filename)
        print(f"Processing: {filename}")
        pdf_metadata[filename] = extract_pdf_metadata(pdf_path)

# Save metadata to JSON
json_output_path = "pdf_metadata.json"
with open(json_output_path, "w") as json_file:
    json.dump(pdf_metadata, json_file, indent=4)

print(f"Saved PDF metadata to {json_output_path}")

# Convert to TSV format
rows = []
for filename, metadata in pdf_metadata.items():
    year, volume, pages = extract_journal_info(metadata.get("subject", ""))
    row = {
        "Filename": filename,
        "DOI": extract_doi(metadata.get("subject", "")),
        "Year": year,
        "Volume": volume,
        "Pages": pages,
        "Title": metadata.get("title", ""),
        "Author": metadata.get("author", ""),
        "Subject": metadata.get("subject", ""),
        "Keywords": metadata.get("keywords", ""),
        "Creation Date": parse_pdf_date(metadata.get("creationDate", "")),
        "Modification Date": parse_pdf_date(metadata.get("modDate", "")),
        "Producer": metadata.get("producer", ""),
        "Creator": metadata.get("creator", ""),
    }
    rows.append(row)

# Save as TSV
df = pd.DataFrame(rows)
tsv_output_path = "pdf_metadata.tsv"
df.to_csv(tsv_output_path, sep="\t", index=False)

print(f"Saved PDF metadata to {tsv_output_path}")


Processing: acp-23-15783-2023.pdf
Processing: bell-et-al-2020-metatranscriptomic-sequencing-of-a-cyanobacterial-soil-surface-consortium-with-and-without-a-diverse.pdf
Processing: Soil_Bacterial_Diversity_Is_Positively_Correlated_.pdf
Processing: BISS_article_20637.pdf
Processing: s40168-020-00889-8.pdf
Processing: sciadv.adg7888.pdf
Processing: s41597-024-04013-5.pdf
Processing: s41564-020-00861-0.pdf
Processing: 41564_2019_Article_449.pdf
Processing: gkab990.pdf
Processing: blair-et-al-2018-exploration-of-the-biosynthetic-potential-of-the-populus-microbiome.pdf
Processing: s41597-019-0132-4.pdf
Processing: Limnology   Oceanography - 2019 - Linz - Time‐series metatranscriptomes reveal conserved patterns between phototrophic and.pdf
Processing: file.pdf
Processing: microorganisms-09-00357-v2.pdf
Processing: Environmental Microbiology - 2022 - Olmsted - Environmental predictors of electroactive bacterioplankton in small boreal.pdf
Processing: 1-s2.0-S0016706121007540-main.pdf
Processing:

In [14]:
# Google Sheet URL (STREAMS guidelines -> STREAMS_final)
sheet_url = "https://docs.google.com/spreadsheets/d/1lrwZZCHf9ik-TRrMEyN-NJrmZ8uhG4NyDuPH9xl5G1E/export?format=csv&gid=2118203602"

# Load the Google Sheet into a Pandas DataFrame
df = pd.read_csv(sheet_url)

# Save as TSV file
tsv_file_path = "streams_final.tsv"


# Drop the 6th and 7th columns (index-based, zero-indexed)
# deleting: Yes/No/NA,Comments or location in manuscript
# which still leaves: Example(s), Present in the manuscript?, Comments or location in manuscript
df = df.drop(df.columns[[5, 6]], axis=1)
df.to_csv(tsv_file_path, sep="\t", index=False)

print(f"Saved data to {tsv_file_path}")


Saved data to streams_final.tsv
