In [2]:
# pubmed_fetcher/module.py
import csv
import logging
import re
import time
from typing import List, Dict
import xml.etree.ElementTree as ET
import requests

# Module-level logger. It is only given a handler/level when gather_papers()
# is called with debug=True (which calls logging.basicConfig).
logger = logging.getLogger(__name__)

# Keywords for filtering affiliations.
# Both lists are matched as case-insensitive *substrings* of the affiliation
# text (see affiliation_is_academic / affiliation_is_corporate), so every
# entry must be lowercase. In extract_metadata an affiliation that hits any
# academic term is never counted as corporate, even if it also hits a
# corporate term.
ACADEMIC_TERMS = ["university", "college", "institute", "school", "faculty", "department", "center"]
CORPORATE_TERMS = ["pharma", "biotech", "therapeutics", "laboratories", "inc", "corp", "llc", "gmbh", "ltd"]

def affiliation_is_academic(affiliation: str) -> bool:
    """Report whether *affiliation* mentions any academic keyword.

    The check is a case-insensitive substring test against every entry of
    ``ACADEMIC_TERMS``.

    Args:
        affiliation: Free-text affiliation string.

    Returns:
        True as soon as one keyword is found, otherwise False.
    """
    normalized = affiliation.lower()
    for keyword in ACADEMIC_TERMS:
        if keyword in normalized:
            return True
    return False

def affiliation_is_corporate(affiliation: str) -> bool:
    """Report whether *affiliation* mentions any corporate keyword.

    The check is a case-insensitive substring test against every entry of
    ``CORPORATE_TERMS``.

    Args:
        affiliation: Free-text affiliation string.

    Returns:
        True as soon as one keyword is found, otherwise False.
    """
    normalized = affiliation.lower()
    for keyword in CORPORATE_TERMS:
        if keyword in normalized:
            return True
    return False

def get_pubmed_ids(query: str, max_results: int = 100, timeout: float = 30.0) -> List[str]:
    """Search PubMed through the NCBI ESearch endpoint and return matching IDs.

    Args:
        query: PubMed search expression, passed as the ``term`` parameter.
        max_results: Upper bound on the number of IDs requested (``retmax``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The list of PubMed IDs as strings; empty when the response carries no
        ``esearchresult.idlist``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    search_endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    parameters = {"db": "pubmed", "term": query, "retmode": "json", "retmax": max_results}
    response = requests.get(search_endpoint, params=parameters, timeout=timeout)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])

def retrieve_paper_details(pubmed_id: str, timeout: float = 30.0) -> Dict[str, str]:
    """Fetch one PubMed record via EFetch and convert it to a metadata dict.

    Args:
        pubmed_id: The PubMed ID to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The dictionary produced by ``extract_metadata`` for the record.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within *timeout* seconds.
        ValueError: If the response contains no ``PubmedArticle`` element.
    """
    fetch_endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    parameters = {"db": "pubmed", "id": pubmed_id, "retmode": "xml"}
    response = requests.get(fetch_endpoint, params=parameters, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so the parser honours the encoding named in the XML
    # declaration instead of whatever charset requests guessed for .text.
    tree = ET.fromstring(response.content)
    article = tree.find(".//PubmedArticle")
    if article is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(f"No PubmedArticle record returned for ID {pubmed_id}")
    return extract_metadata(article)

def extract_metadata(article) -> Dict[str, str]:
    """Build one flat, CSV-ready record from a ``PubmedArticle`` element.

    Every affiliation of every author is inspected. An author is listed as
    non-academic when at least one of their affiliations matches a corporate
    keyword and no academic keyword (per ``affiliation_is_corporate`` /
    ``affiliation_is_academic``).

    Args:
        article: An ``xml.etree.ElementTree.Element`` for a ``PubmedArticle``.

    Returns:
        A dict with the keys ``PubmedID``, ``Title``, ``Publication Date``,
        ``Non-academic Author(s)``, ``Company Affiliation(s)`` and
        ``Corresponding Author Email``. Missing values are empty strings;
        multi-valued fields are joined with ``"; "`` in document order.
    """
    title_node = article.find(".//ArticleTitle")
    # itertext() keeps text nested inside inline markup (<i>, <sub>, ...);
    # findtext() would stop at the first child element and truncate the title.
    title = "".join(title_node.itertext()).strip() if title_node is not None else ""

    # Some records carry a free-form MedlineDate instead of a Year element.
    pub_date = (
        article.findtext(".//PubDate/Year")
        or article.findtext(".//PubDate/MedlineDate")
        or ""
    ).strip()

    corporate_authors: List[str] = []
    # A dict is used as an ordered set: duplicates collapse, but the output
    # order is stable across runs (a plain set of str is not).
    company_names: Dict[str, None] = {}
    contact_email = ""

    for person in article.findall(".//Author"):
        last = (person.findtext("LastName") or "").strip()
        first = (person.findtext("ForeName") or "").strip()
        name = ", ".join(part for part in (last, first) if part)
        if not name:
            # Group authors have a CollectiveName instead of personal names.
            name = (person.findtext("CollectiveName") or "").strip()

        has_company = False
        for node in person.findall("AffiliationInfo/Affiliation"):
            affiliation = "".join(node.itertext()).strip()
            if not affiliation:
                continue

            email_found = re.search(r"[\w\.-]+@[\w\.-]+", affiliation)
            if email_found:
                # The pattern also swallows a sentence-ending period; drop it.
                # As before, the last e-mail seen in the record wins.
                contact_email = email_found.group().rstrip(".")

            if not affiliation_is_academic(affiliation) and affiliation_is_corporate(affiliation):
                has_company = True
                company_names[affiliation] = None

        # Add each author once, however many company affiliations they have.
        if has_company and name:
            corporate_authors.append(name)

    return {
        "PubmedID": article.findtext("MedlineCitation/PMID", default=""),
        "Title": title,
        "Publication Date": pub_date,
        "Non-academic Author(s)": "; ".join(corporate_authors),
        "Company Affiliation(s)": "; ".join(company_names),
        "Corresponding Author Email": contact_email,
    }

def gather_papers(query: str, debug: bool = False, delay: float = 0.3) -> List[Dict[str, str]]:
    """Search PubMed and collect records that list a non-academic author.

    Each ID returned by ``get_pubmed_ids`` is fetched individually. A failure
    on one ID is logged and skipped so the remaining IDs are still processed.

    Args:
        query: PubMed search expression.
        debug: When True, call ``logging.basicConfig(level=logging.DEBUG)``
            so progress messages become visible.
        delay: Seconds to pause between consecutive fetches, to throttle the
            request rate against the NCBI servers.

    Returns:
        The metadata dicts whose ``"Non-academic Author(s)"`` field is
        non-empty, in the order the IDs were returned by the search.
    """
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    ids = get_pubmed_ids(query)
    total = len(ids)
    logger.debug("Total PubMed IDs fetched: %d", total)
    results = []

    for position, pid in enumerate(ids, start=1):
        logger.debug("Processing PubMed ID %s (%d of %d)", pid, position, total)
        try:
            info = retrieve_paper_details(pid)
        except Exception as err:
            # Best-effort batch: report the failure and move on to the next ID.
            logger.error("Failed to process ID %s: %s", pid, err)
        else:
            if info.get("Non-academic Author(s)"):
                results.append(info)
        # No need to wait once the final request has been made.
        if position < total:
            time.sleep(delay)

    return results

def export_to_csv(filepath: str, records: List[Dict[str, str]]) -> None:
    """Write *records* to *filepath* as a UTF-8 CSV file with a header row.

    The header is the union of all record keys in first-seen order, so a
    record with fewer keys gets empty cells rather than raising an error.
    When *records* is empty the file is still created (or truncated) but
    left empty, because there are no column names to write.

    Args:
        filepath: Destination path; an existing file is overwritten.
        records: Row dictionaries mapping column name to cell value.
    """
    rows = list(records)
    # dict.fromkeys de-duplicates while preserving insertion order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    with open(filepath, "w", newline="", encoding="utf-8") as output:
        if not fieldnames:
            return
        writer = csv.DictWriter(output, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(rows)


In [4]:

# Search topic: edit this string to run a different PubMed query.
query = "COVID-19 vaccine"
results = gather_papers(query, debug=True)

# Only write a CSV when at least one matching paper was found.
if not results:
    print("❌ No corporate-affiliated authors found in the query.")
else:
    csv_filename = "pubmed_results.csv"
    export_to_csv(csv_filename, results)
    print(f"✅ CSV generated: {csv_filename} with {len(results)} records")


✅ CSV generated: pubmed_results.csv with 6 records


In [5]:
# NOTE(review): google.colab is presumably importable only inside a Google
# Colab runtime; this cell is not expected to work elsewhere. Confirm.
from google.colab import files
# Same file name as csv_filename in the query cell; presumably hands the
# generated CSV to the browser as a download. Verify against the Colab docs.
files.download("pubmed_results.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>