In [1]:
import os
import torch
import pandas as pd
from Bio import Entrez
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from typing import List
from src.utils.logger import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the NCBI Entrez credentials from the environment (after loading any
# .env file) and register them on the Entrez module used for later requests.
load_dotenv()

email = os.environ.get("ENTREZ_EMAIL")
api_key = os.environ.get("ENTREZ_API_KEY")
Entrez.email, Entrez.api_key = email, api_key

In [3]:
def fetch_abstract_ids(disease: str, max_articles: int) -> List[str]:
    """Search PubMed for ``disease`` and return the matching article IDs.

    Args:
        disease: Search term sent to Entrez ``esearch`` as ``term``.
        max_articles: Upper bound on the number of IDs requested (``retmax``).

    Returns:
        The entries of the response's ``IdList`` as plain strings. An empty
        list is returned when ``max_articles`` is not positive or when the
        response carries no ``IdList`` key.

    Raises:
        Exception: Whatever the Entrez request or parser raises propagates
            to the caller unchanged.
    """
    if max_articles <= 0:
        # Nothing was requested, so skip the network round-trip entirely.
        return []

    with Entrez.esearch(db="pubmed", term=disease, retmax=max_articles) as handle:
        record = Entrez.read(handle)

    # Coerce every entry to a plain ``str`` so the result is exactly the
    # annotated ``List[str]`` regardless of the parser's element type.
    return [str(pmid) for pmid in record.get("IdList", [])]

In [4]:
def fetch_abstract_by_id(pmid: str) -> str:
    """Fetch the plain-text abstract record for a single PubMed article.

    Args:
        pmid: PubMed identifier sent to Entrez ``efetch`` as ``id``.

    Returns:
        The raw text read from the response handle.

    Raises:
        Exception: Whatever the Entrez request raises propagates to the
            caller unchanged.
    """
    # ``rettype`` is a single scalar option, so pass it as a string rather
    # than wrapping it in a one-element list.
    with Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text") as handle:
        return handle.read()

In [5]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

model_name = "dmis-lab/biobert-base-cased-v1.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: loading this checkpoint reports that `classifier.weight` and
# `classifier.bias` are newly initialised, i.e. the token-classification head
# is untrained. Predictions are not meaningful until the model is fine-tuned.
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.to(device)

# Pass the device explicitly so the pipeline runs where the model was placed
# instead of depending on the library's implicit device selection.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=device,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


In [6]:
# Look up a handful of PubMed IDs for the query, then download the text
# record for each of them and show the resulting list.
ids = fetch_abstract_ids(disease="cancer", max_articles=5)
abstract = [fetch_abstract_by_id(pmid=pmid) for pmid in ids]
print(abstract)

["1. Genet Med. 2025 Jun 21:101507. doi: 10.1016/j.gim.2025.101507. Online ahead of\n print.\n\nEUGENICS AND POLYGENIC EMBRYO SCREENING: PUBLIC, CLINICIAN, AND PATIENT \nPERCEPTIONS OF CONDITIONS VERSUS TRAITS.\n\nBarlevy D(1), Furrer RA(2), Kalapatapu A(3), Martinez A(3), Lencz T(4), Carmi \nS(5), Lázaro-Muñoz G(6), Pereira S(3).\n\nAuthor information:\n(1)Center for Medical Ethics and Health Policy, Baylor College of Medicine, \nHouston, TX, USA. Electronic address: dorit.barlevy@bcm.edu.\n(2)Department of Neurosurgery, Massachusetts General Hospital, Boston, MA, USA; \nHarvard Medical School, Boston, MA, USA.\n(3)Center for Medical Ethics and Health Policy, Baylor College of Medicine, \nHouston, TX, USA.\n(4)Institute of Behavioral Science, The Feinstein Institutes for Medical \nResearch, Northwell Health, Manhasset, NY, USA; Departments of Psychiatry and \nMolecular Medicine, Zucker School of Medicine at Hofstra/Northwell, Hempstead, \nNY, USA; Division of Research, Department of P

In [7]:
# Print the first fetched record on its own so its line breaks render
# (printing the whole list shows them escaped as \n).
print(abstract[0])

1. Genet Med. 2025 Jun 21:101507. doi: 10.1016/j.gim.2025.101507. Online ahead of
 print.

EUGENICS AND POLYGENIC EMBRYO SCREENING: PUBLIC, CLINICIAN, AND PATIENT 
PERCEPTIONS OF CONDITIONS VERSUS TRAITS.

Barlevy D(1), Furrer RA(2), Kalapatapu A(3), Martinez A(3), Lencz T(4), Carmi 
S(5), Lázaro-Muñoz G(6), Pereira S(3).

Author information:
(1)Center for Medical Ethics and Health Policy, Baylor College of Medicine, 
Houston, TX, USA. Electronic address: dorit.barlevy@bcm.edu.
(2)Department of Neurosurgery, Massachusetts General Hospital, Boston, MA, USA; 
Harvard Medical School, Boston, MA, USA.
(3)Center for Medical Ethics and Health Policy, Baylor College of Medicine, 
Houston, TX, USA.
(4)Institute of Behavioral Science, The Feinstein Institutes for Medical 
Research, Northwell Health, Manhasset, NY, USA; Departments of Psychiatry and 
Molecular Medicine, Zucker School of Medicine at Hofstra/Northwell, Hempstead, 
NY, USA; Division of Research, Department of Psychiatry, The Zucker

In [8]:
from datasets import load_dataset

In [16]:
# Load the "ncbi/ncbi_disease" dataset by its identifier; the result exposes
# train, validation and test splits.
dataset = load_dataset("ncbi/ncbi_disease")

Downloading data: 1.14MB [00:00, 6.75MB/s]                  
Downloading data: 200kB [00:00, 687kB/s]                     
Downloading data: 206kB [00:00, 3.16MB/s]                    
Generating train split: 100%|██████████| 5433/5433 [00:00<00:00, 27855.83 examples/s]
Generating validation split: 100%|██████████| 924/924 [00:00<00:00, 27562.31 examples/s]
Generating test split: 100%|██████████| 941/941 [00:00<00:00, 27303.76 examples/s]


In [28]:
# Inspect one training example: an 'id', a list of 'tokens', and a parallel
# list of integer 'ner_tags' (one tag per token).
dataset["train"][456]

{'id': '456',
 'tokens': ['beta',
  '-',
  'Glucuronidase',
  'activity',
  'was',
  'undetectable',
  'in',
  'affected',
  'cat',
  'fibroblasts',
  'and',
  'restored',
  'by',
  'retroviral',
  'gene',
  'transfer',
  'of',
  'rat',
  'beta',
  '-',
  'glucuronidase',
  'cDNA',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})