In [1]:
import os
import re
import torch
import nltk
import numpy as np 
import pandas as pd
from Bio import Entrez
from typing import List
from dotenv import load_dotenv
from nltk.corpus import stopwords
from transformers import pipeline
from datasets import load_dataset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
email = os.getenv("ENTREZ_EMAIL")
api_key = os.getenv("ENTREZ_API_KEY")

Entrez.email = email
Entrez.api_key = api_key

In [3]:
def fetch_abstract_ids(disease: str, max_articles: int) -> List[str]:
    try:
        with Entrez.esearch(db="pubmed", term=disease, retmax=max_articles) as handle:
            record = Entrez.read(handle)
        ids = record.get("IdList", [])
        return ids
    except Exception as e:
        raise e

In [4]:
def fetch_abstract_by_id(pmid: str) -> str:
    try:
        with Entrez.efetch(db="pubmed", id=pmid, rettype=["abstract"], retmode="text") as handle:
            return handle.read()
    except Exception as e:
        raise e

In [5]:
ids = fetch_abstract_ids(disease="cancer", max_articles=1)
abstracts = [fetch_abstract_by_id(pmid=id) for id in ids]
print(abstracts[0])

1. Thorac Cancer. 2025 Jun;16(12):e70116. doi: 10.1111/1759-7714.70116.

Repeated Anatomical Pulmonary Resection for Second Primary Nonsmall-Cell Lung 
Cancer: Safety and Short-Term Outcomes.

Forster C(1), Chriqui LE(2), Abdelnour-Berchtold E(2), Zellweger M(2), Perentes 
JY(2)(3), Krueger T(2)(3), Gonzalez M(2)(3).

Author information:
(1)Department of Thoracic Surgery, Centre Hospitalier du Valais Romand (CHVR), 
Sion, Switzerland.
(2)Department of Thoracic Surgery, Lausanne University Hospital (CHUV), 
Lausanne, Switzerland.
(3)Faculty of Biology and Medicine, University of Lausanne (UNIL), Lausanne, 
Switzerland.

BACKGROUND: Repeated anatomical pulmonary resections in second primary 
nonsmall-cell lung cancer (NSCLC) pose significant challenges due to prior 
surgery. This study evaluates the feasibility and short-term outcomes of 
repeated anatomical pulmonary resections for second primary NSCLC.
METHOD: We retrospectively reviewed all consecutive cases of repeated anatomical 
pu

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
def abstract_tokenizor(abstract) -> list[str]:
    clean = re.sub(r"Author information:.*", "", abstract)
    clean = re.sub(r"(DOI|PMID|Erratum).*", "", clean)
    clean = re.sub(r"\(\d+\)", "", clean)
    clean = re.sub(r'^\d+\.\s+.*?doi:.*?\n+', '', clean, flags=re.IGNORECASE | re.DOTALL)
    # Remove all-uppercase title line (common in PubMed export)
    clean = re.sub(r'^[A-Z][A-Z\s\-]+\.?\n+', '', clean, flags=re.MULTILINE)
    # Remove author list (comma-separated names)
    clean = re.sub(r'^[A-Z][a-z]+.*?,.*?\n+', '', clean, flags=re.MULTILINE)
    # Remove affiliations (lines that mention university, hospital, dept)
    clean = re.sub(r'^(Department|Faculty|Hospital|University|Centre).*?\n+', '', clean, flags=re.MULTILINE)
    # Remove copyright / publisher lines
    clean = re.sub(r'©.*?\n+', '', clean)
    # Remove extra blank lines
    clean = re.sub(r'\n{2,}', '\n\n', clean)
    print(f"abstract: {clean}")
    lemmatizer = WordNetLemmatizer()
    lem_text = lemmatizer.lemmatize(clean)
    print(f"lem_text: {lem_text}")
    token_text = word_tokenize(lem_text)
    print(f"token_text: {token_text}")
    stop_words = set(stopwords.words('english'))
    print(f"stop_words: {stop_words}")
    token_text = " ".join([w for w in token_text if w.lower() not in stop_words])
    return token_text

In [8]:
text = abstract_tokenizor(abstract=abstracts[0])
print(f"text: {text}")

abstract: Repeated Anatomical Pulmonary Resection for Second Primary Nonsmall-Cell Lung 
Cancer: Safety and Short-Term Outcomes.

JY, Krueger T, Gonzalez M.

Switzerland.

BACKGROUND: Repeated anatomical pulmonary resections in second primary 
nonsmall-cell lung cancer (NSCLC) pose significant challenges due to prior 
surgery. This study evaluates the feasibility and short-term outcomes of 
repeated anatomical pulmonary resections for second primary NSCLC.
METHOD: We retrospectively reviewed all consecutive cases of repeated anatomical 
pulmonary resections for second primary NSCLC performed in our institution from 
January 2014 to December 2023.
RESULTS: A total of 55 patients (median age 68 years; interquartile range [IQR]: 
61.5-72) underwent repeated anatomical pulmonary resections for second primary 
NSCLC. Adenocarcinoma predominated in both primary (78.2%) and secondary (76.4%) 
cases. Video-assisted thoracoscopy (VATS) approach was used in 94.5% and 96.4% 
for first and repeate

In [9]:
device = (
    "mps" if torch.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

ner = pipeline(
    "ner",
    model="NeuML/pubmedbert-base-embeddings",
    tokenizer="NeuML/pubmedbert-base-embeddings",
    aggregation_strategy="simple",
    device=device  # uses the first GPU
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps


In [10]:
entities = ner(text)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
for ent in entities:
    print(f"{ent['word']}: {ent['entity_group']} (score: {ent['score']:.2f})")

repeated anatomical pulmonary resection second primary nonsmall: LABEL_0 (score: 0.61)
- cell: LABEL_1 (score: 0.53)
lung cancer : safety short - term outcomes: LABEL_0 (score: 0.59)
.: LABEL_1 (score: 0.53)
jy, krueger, gonzalez m: LABEL_0 (score: 0.63)
.: LABEL_1 (score: 0.51)
switzerland.: LABEL_0 (score: 0.62)
background: LABEL_1 (score: 0.51)
: repeated anatomical pulmonary resections second primary nonsmall: LABEL_0 (score: 0.60)
-: LABEL_1 (score: 0.52)
cell lung cancer: LABEL_0 (score: 0.55)
(: LABEL_1 (score: 0.51)
nsclc ) pose significant challenges due prior surgery. study evaluates feasibility short - term outcomes repeated anatomical pulmonary resections second primary nsclc. method : retrospectively reviewed consecutive cases repeated anatomical pulmonary resections second primary nsclc performed: LABEL_0 (score: 0.60)
institution: LABEL_1 (score: 0.51)
january 2014 december 2023. results : total: LABEL_0 (score: 0.63)
55 patients ( median: LABEL_1 (score: 0.54)
age 68: L

In [12]:
entities

[{'entity_group': 'LABEL_0',
  'score': np.float32(0.613693),
  'word': 'repeated anatomical pulmonary resection second primary nonsmall',
  'start': 0,
  'end': 63},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.52558565),
  'word': '- cell',
  'start': 63,
  'end': 68},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.59195745),
  'word': 'lung cancer : safety short - term outcomes',
  'start': 69,
  'end': 109},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.5250773),
  'word': '.',
  'start': 110,
  'end': 111},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.634433),
  'word': 'jy, krueger, gonzalez m',
  'start': 112,
  'end': 137},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.50840753),
  'word': '.',
  'start': 137,
  'end': 138},
 {'entity_group': 'LABEL_0',
  'score': np.float32(0.6200783),
  'word': 'switzerland.',
  'start': 139,
  'end': 152},
 {'entity_group': 'LABEL_1',
  'score': np.float32(0.51186335),
  'word': 'background',
  'start': 15