In [73]:
import pandas as pd

df = pd.read_csv('/content/SB_publication_PMC.csv')
# Load the PMCIDs from the provided CSV file
pmcids_df = pd.read_csv('/content/pmcids_for_api_extraction.csv')


In [74]:
import requests
import time
import xml.etree.ElementTree as ET
from google.colab import userdata # Import userdata to access secrets

def get_pubmed_details(pmcid):
    """Fetches details for a given PMCID using NCBI BioC API and EFetch."""
    # BioC API URL for other details
    bioc_base_url = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/"
    bioc_url = f"{bioc_base_url}/{pmcid}/unicode"

    # EFetch URL structure for Journal metadata
    efetch_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    db = "pmc"
    retmode = "xml"
    # Get the NCBI API key from Colab secrets
    ncbi_api_key = userdata.get('NCBI_API_KEY')
    tool = "my_data_extractor" # Replace with a name for your tool
    email = "your_email@example.com" # Replace with your email

    efetch_url = f"{efetch_base_url}db={db}&id={pmcid}&retmode={retmode}"
    if ncbi_api_key:
        efetch_url += f"&api_key={ncbi_api_key}"
    # Adding tool and email as recommended by NCBI
    efetch_url += f"&tool={tool}&email={email}"


    title = None
    authors = []
    abstract = None
    year = None
    journal = None
    doi_url = None


    # --- Use BioC API for most details ---
    try:
        bioc_response = requests.get(bioc_url)
        bioc_response.raise_for_status()

        bioc_root = ET.fromstring(bioc_response.content)

        # Extract details from BioC XML
        # Find the passage with section_type 'TITLE'
        title_passage = None
        for passage in bioc_root.findall('.//document/passage'):
            if passage.findtext('infon[@key="section_type"]') == 'TITLE':
                title_passage = passage
                break

        if title_passage is not None:
            title = title_passage.findtext('text')

            # Extract Year and DOI from the title passage infons
            for infon in title_passage.findall('infon'):
                if infon.get('key') == 'year':
                    year = infon.text
                elif infon.get('key') == 'article-id_doi':
                    doi = infon.text
                    if doi:
                        doi_url = f"https://doi.org/{doi}"

            # Extract Authors from title passage infons
            for infon in title_passage.findall('infon'):
                if infon.get('key', '').startswith('name_'):
                    name_value = infon.text
                    # Parse author names (surname and given names)
                    name_parts = name_value.split(';')
                    surname = None
                    given_names = None
                    for part in name_parts:
                        if part.startswith('surname:'):
                            surname = part.replace('surname:', '').strip()
                        elif part.startswith('given-names:'):
                            given_names = part.replace('given-names:', '').strip()
                    if surname or given_names:
                        author_name = f"{surname}, {given_names}" if surname and given_names else surname if surname else given_names
                        if author_name:
                            authors.append(author_name)

        # Find the passage with section_type 'ABSTRACT'
        abstract_passage = None
        for passage in bioc_root.findall('.//document/passage'):
            if passage.findtext('infon[@key="section_type"]') == 'ABSTRACT':
                abstract_passage = passage
                break

        if abstract_passage is not None:
            abstract = abstract_passage.findtext('text')


    except requests.exceptions.RequestException as e:
        print(f"HTTP Error fetching details from BioC for {pmcid}: {e}")
        # Continue to fetch journal even if BioC fails
    except Exception as e:
        print(f"Error processing BioC details for {pmcid}: {e}")
        # Continue to fetch journal even if BioC fails


    # --- Use EFetch API for Journal Title ---
    try:
        efetch_response = requests.get(efetch_url)
        efetch_response.raise_for_status()

        efetch_root = ET.fromstring(efetch_response.content)

        # Extract Journal Title using the specified path from EFetch XML
        journal_title_element = efetch_root.find('.//journal-title')
        if journal_title_element is not None:
            journal = journal_title_element.text

    except requests.exceptions.RequestException as e:
        print(f"HTTP Error fetching journal from EFetch for {pmcid}: {e}")
        journal = None # Ensure journal is None if EFetch fails
    except Exception as e:
        print(f"Error processing EFetch journal for {pmcid}: {e}")
        journal = None # Ensure journal is None if EFetch fails


    extracted_data = {
        'PMCID': pmcid,
        'Title': title,
        'Authors': "; ".join(authors) if authors else None,
        'Year': year,
        'Journal': journal,
        'DOI_URL': doi_url,
        'Abstract': abstract,
    }

    # Add a small delay to avoid overwhelming the API
    time.sleep(0.5)

    return extracted_data

In [75]:
# Apply the function to fetch details for the first 100 PMCIDs
paper_details = pmcids_df['PMCID'].head(100).apply(get_pubmed_details)

# Convert the results to a list of dictionaries
paper_details_list = paper_details.tolist()

# Filter out entries where the extraction failed (e.g., Title is None)
filtered_paper_details_list = [details for details in paper_details_list if details and details.get('Title') is not None]

# Create a new DataFrame from the filtered list of dictionaries
extracted_df = pd.DataFrame(filtered_paper_details_list)

# Display all rows of the new DataFrame
display(extracted_df)

Error processing BioC details for PMC3177255: syntax error: line 1, column 0
Error processing BioC details for PMC2915878: syntax error: line 1, column 0
Error processing BioC details for PMC6371294: syntax error: line 1, column 0
Error processing BioC details for PMC4826010: syntax error: line 1, column 0
Error processing BioC details for PMC6048781: syntax error: line 1, column 0
Error processing BioC details for PMC3005423: syntax error: line 1, column 0
Error processing BioC details for PMC3190158: syntax error: line 1, column 0
Error processing BioC details for PMC3508904: syntax error: line 1, column 0
Error processing BioC details for PMC3430326: syntax error: line 1, column 0
Error processing BioC details for PMC5018776: syntax error: line 1, column 0
Error processing BioC details for PMC4896697: syntax error: line 1, column 0
Error processing BioC details for PMC3774184: syntax error: line 1, column 0
Error processing BioC details for PMC4378170: syntax error: line 1, column 0

Unnamed: 0,PMCID,Title,Authors,Year,Journal,DOI_URL,Abstract
0,PMC4136787,Mice in Bion-M 1 Space Mission: Training and S...,"Andreev-Andrievskiy, Alexander; Popova, Anfisa...",2014,PLoS ONE,https://doi.org/10.1371/journal.pone.0104830,"After a 16-year hiatus, Russia has resumed its..."
1,PMC3630201,Microgravity Induces Pelvic Bone Loss through ...,"Blaber, Elizabeth A.; Dvorochkin, Natalya; Lee...",2013,PLoS ONE,https://doi.org/10.1371/journal.pone.0061372,Bone is a dynamically remodeled tissue that re...
2,PMC11988870,Microgravity and Cellular Biology: Insights in...,"López Garzón, Nelson Adolfo; Pinzón-Fernández,...",2025,International Journal of Molecular Sciences,https://doi.org/10.3390/ijms26073058,"Microgravity, defined by minimal gravitational..."
3,PMC7998608,Selective Proliferation of Highly Functional A...,"Mashiko, Takanobu; Kanayama, Koji; Saito, Nats...",2021,Cells,https://doi.org/10.3390/cells10030560,Therapeutic effects of adult stem-cell transpl...
4,PMC5587110,Microgravity validation of a novel system for ...,"Parra, Macarena; Jung, Jimmy; Hoang, Dzung; Hy...",2017,PLoS ONE,https://doi.org/10.1371/journal.pone.0183480,The International Space Station (ISS) National...
...,...,...,...,...,...,...,...
75,PMC5826609,Effects of spaceflight on the immunoglobulin r...,"Ward, Claire; Rettig, Trisha A.; Hlavacek, Sav...",2019,Life sciences in space research,https://doi.org/10.1016/j.lssr.2017.11.003,Spaceflight has been shown to suppress the ada...
76,PMC6366624,A comparison of unamplified and massively mult...,"Rettig, Trisha A.; Pecaut, Michael J.; Chapes,...",2020,FASEB bioAdvances,,Sequencing antibody repertoires has steadily b...
77,PMC11929063,"Preclinical development of TAK-754, a high-per...","Lengler, Johannes; Weiller, Markus; Rottenstei...",2025,Molecular Therapy. Methods & Clinical Development,https://doi.org/10.1016/j.omtm.2025.101424,This report concerns the preclinical developme...
78,PMC11929063,"Preclinical development of TAK-754, a high-per...","Lengler, Johannes; Weiller, Markus; Rottenstei...",2025,Molecular Therapy. Methods & Clinical Development,https://doi.org/10.1016/j.omtm.2025.101424,This report concerns the preclinical developme...


In [76]:
extracted_df.to_csv('cleaned.csv', index=False)
print("Filtered data saved to cleaned.csv")

Filtered data saved to cleaned.csv
