In [None]:
# Use the %pip magic (not the !pip shell escape) so the package is installed
# into the environment of the running kernel. Pinned to the version this
# notebook was last run with so re-runs are reproducible.
%pip install biopython==1.85

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import pandas as pd
from Bio import Entrez
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

In [None]:
def _format_pub_date(pub_date_tag):
    """Return a publication date as an ISO-style string: YYYY, YYYY-MM or YYYY-MM-DD.

    Calling get_text(strip=True) on a <pub-date> element concatenates its
    <day>, <month> and <year> children without any separator (yielding
    values such as "2842025"), which is ambiguous. This helper reads the
    children individually instead.

    Args:
        pub_date_tag: the BeautifulSoup tag of a <pub-date> element, or None.

    Returns:
        "N/A" when the tag is missing or empty; otherwise the most precise
        date that can be built from the <year>/<month>/<day> children, or the
        space-separated text of the tag when it has no <year> child.
    """
    if pub_date_tag is None:
        return "N/A"

    parts = []
    for name in ("year", "month", "day"):
        child = pub_date_tag.find(name)
        value = child.get_text(strip=True) if child else ""
        if not value:
            # Stop at the first missing component so the result is always a
            # valid prefix of YYYY-MM-DD.
            break
        # Zero-pad numeric month/day; leave non-numeric values untouched.
        parts.append(value.zfill(2) if value.isdigit() else value)

    if parts:
        return "-".join(parts)
    # No structured components: fall back to the readable text of the tag.
    return pub_date_tag.get_text(separator=" ", strip=True) or "N/A"


# An email address is required to use the Bio.Entrez module
Entrez.email = "maron14f1@gmail.com"

# Pause between requests, in seconds. NCBI's usage guidelines allow at most
# 3 requests per second without an API key, hence slightly more than 1/3 s.
NCBI_REQUEST_DELAY = 0.34

# Search for open access COVID-related articles in PMC that also mention Obesity and Smoking
search_term = 'COVID-19 AND Obesity AND Smoking AND open access[filter]'
# NOTE(review): NCBI documents an upper bound on retmax; confirm whether the
# result set can be truncated and needs paging via retstart.
handle = Entrez.esearch(db="pmc", term=search_term, retmax=100000)
try:
    record = Entrez.read(handle)
finally:
    # Release the HTTP connection even if parsing the response fails.
    handle.close()
pmc_ids = record["IdList"]

print(f"{len(pmc_ids)} open access articles found in PMC dataset.")

results = []

# Iterate over the IDs and download the full text of each article
for pmc_id in tqdm(pmc_ids, desc="Downloading articles"):
    try:
        fetch = Entrez.efetch(db="pmc", id=pmc_id, rettype="full", retmode="xml")
        try:
            xml_data = fetch.read()
        finally:
            # Always close the response, including when read() raises.
            fetch.close()

        soup = BeautifulSoup(xml_data, "lxml-xml")

        # Extract title
        title_tag = soup.find("article-title")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        # Extract abstract
        abstract_tag = soup.find("abstract")
        abstract = abstract_tag.get_text(separator=" ", strip=True) if abstract_tag else ""

        # Extract full text
        body_tag = soup.find("body")
        full_text = body_tag.get_text(separator="\n", strip=True) if body_tag else ""

        # Extract publication date (first <pub-date> element) as YYYY[-MM[-DD]]
        published = _format_pub_date(soup.find("pub-date"))

        # Extract DOI
        doi_tag = soup.find("article-id", attrs={"pub-id-type": "doi"})
        doi = doi_tag.get_text(strip=True) if doi_tag else "N/A"

        # Save only if the full text is available
        if full_text.strip():
            results.append({
                "pmc_id": pmc_id,
                "title": title,
                "abstract": abstract,
                "full_text": full_text,
                "published": published,
                "doi": doi
            })

    except Exception as e:
        # Best effort: report the failing article and move on to the next one.
        tqdm.write(f"[Error] Article {pmc_id}: {e}")

    finally:
        # Delay added to respect NCBI rate limits. It lives in `finally` so it
        # also applies after a failed request; otherwise a burst of errors
        # would fire the following requests back-to-back.
        time.sleep(NCBI_REQUEST_DELAY)

print("\nNumber of articles successfully downloaded:", len(results))

In [None]:
# Turn the list of per-article dicts in `results` into a table and write it
# to disk as CSV without the positional index column.
# NOTE(review): the variable is named "arxiv", but the CSV file name refers
# to PMC; confirm the intended name before reusing it elsewhere.
df_arxiv = pd.DataFrame(data=results)
df_arxiv.to_csv(path_or_buf="pmc_covid_and_keywords.csv", index=False)

In [None]:
# Preview the first five rows of the table as the cell's rich output.
df_arxiv.head(n=5)

Unnamed: 0,pmc_id,title,abstract,full_text,published,doi
0,12085265,Dreaming of motherhood: experiences of women w...,Abstract Objective. to describe the strategies...,Introduction\nInfertility is one of the main r...,2842025,10.17533/udea.iee.v43n1e06
1,12085247,Association Between Body Mass Index and Comorb...,Objective: There is limited evidence regarding...,1. Introduction\nMajor depressive disorder (MD...,2025,10.1155/da/6648190
2,12085188,Gender differences in correlation of biochemic...,Background The COVID-19 pandemic caused global...,Introduction\nCOVID-19 is a systemic infectiou...,2132025,10.5937/jomb0-49377
3,12085046,The impact of social determinants of health on...,"Background Rates of preterm birth, low birth w...",Background\nBirthing individuals and infants b...,1652025,10.1186/s12884-025-07693-y
4,12084870,Perfluorinated compounds exposure and atheroge...,Abstract Perfluorinated compounds (PFCs) are a...,Significance Statement\nThe pathogenesis and p...,52025,10.1093/pnasnexus/pgaf153
