In [1]:
from Bio import Entrez
from dotenv import load_dotenv
import os
from concurrent.futures import ThreadPoolExecutor
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources needed for tokenizing, stop-word filtering and
# lemmatizing are present locally (downloaded in this order).
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Populate the process environment before any os.getenv lookups below.
load_dotenv()

# Entrez credentials and the fetch limit are read from environment variables;
# the limit falls back to 10 when PUBMED_FETCH_LIMIT is unset.
Entrez.email = os.getenv("ENTREZ_EMAIL")
Entrez.api_key = os.getenv("ENTREZ_API_KEY")
FETCH_LIMIT = int(os.getenv("PUBMED_FETCH_LIMIT", 10))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/venkateshmunaga/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:

def fetch_abstracts(disease, max_articles=400) -> list:
    """Fetch PubMed abstract records matching a search term.

    Runs an Entrez ``esearch`` against the ``pubmed`` database and then issues
    one ``efetch`` per returned PMID, collecting each plain-text record.

    Args:
        disease: Search term passed to ``Entrez.esearch``.
        max_articles: Upper bound on the number of PMIDs requested (``retmax``).

    Returns:
        A list with one text record per PMID, in the order ``esearch``
        returned the IDs. Empty when the search yields no IDs.
    """
    search_handle = Entrez.esearch(db="pubmed", term=disease, retmax=max_articles)
    try:
        record = Entrez.read(search_handle)
    finally:
        # Release the connection even if parsing the response fails.
        search_handle.close()

    abstracts = []
    for pmid in record["IdList"]:
        fetch_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
        try:
            abstracts.append(fetch_handle.read())
        finally:
            # Close every response so a long ID list does not leak one open
            # connection per article.
            fetch_handle.close()
    return abstracts

In [3]:
# Retrieve PubMed abstract records for the search term "cancer", relying on
# fetch_abstracts' default article cap.
text_list = fetch_abstracts(disease="cancer")

In [None]:


def preprocess_single_token(abstract):
    """Tokenize one abstract, drop English stop words, and lemmatize the rest.

    Args:
        abstract: Raw abstract text.

    Returns:
        A list of lemmatized tokens in their original order, excluding any
        token whose lowercase form is an NLTK English stop word.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))  # set for fast membership tests
    # Tokenize first: WordNetLemmatizer.lemmatize operates on a single word, so
    # handing it the whole abstract would not lemmatize anything.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(abstract)
        if token.lower() not in stop_words
    ]

# Main entry point: preprocess all abstracts concurrently on a thread pool
def preprocess_abstracts_token(abstracts):
    """Run ``preprocess_single_token`` over every abstract using worker threads.

    Args:
        abstracts: Iterable of raw abstract strings.

    Returns:
        A list of token lists, one per input abstract, in input order.
    """
    with ThreadPoolExecutor() as pool:
        # Materialize the results: Executor.map returns a single-use iterator,
        # which would appear empty to any caller that iterates it a second time.
        return list(pool.map(preprocess_single_token, abstracts))

In [None]:
# Run the per-abstract preprocessing over every fetched record.
abstracts = preprocess_abstracts_token(text_list)

In [None]:
def label_tokens(tokens, entity_dict, max_ngram=5):
    """Assign BIO labels to tokens by greedy longest-match dictionary lookup.

    At each position the longest n-gram (up to ``max_ngram`` tokens) whose
    lowercased, space-joined text is a key of ``entity_dict`` is labelled
    ``B-<type>`` followed by ``I-<type>`` for the remaining tokens of the
    match. Tokens that start no match are labelled ``"O"``.

    Args:
        tokens: Sequence of token strings.
        entity_dict: Mapping from lowercase phrase to entity type name.
        max_ngram: Largest n-gram size to try at each position.

    Returns:
        A list of labels with exactly one entry per token.
    """
    labels = []
    i = 0
    n_tokens = len(tokens)
    while i < n_tokens:
        # Never try an n-gram longer than the tokens that remain: a slice past
        # the end is silently shortened, which would otherwise emit more labels
        # than there are tokens.
        longest = min(max_ngram, n_tokens - i)
        for length in range(longest, 0, -1):
            phrase = ' '.join(tokens[i:i + length]).lower()
            if phrase in entity_dict:
                entity = entity_dict[phrase]
                labels.append("B-" + entity)
                labels.extend(["I-" + entity] * (length - 1))
                i += length
                break
        else:
            # No n-gram starting at this position is a known entity.
            labels.append("O")
            i += 1
    return labels

In [None]:
def to_conll_format(sentences, labels_list, save_path="ner_dataset.conll", encoding="utf-8"):
    """Write token/label pairs to a CoNLL-style text file.

    Each token is written as ``<token>\\t<label>`` on its own line, and every
    sentence is followed by one blank line.

    Args:
        sentences: Iterable of token sequences.
        labels_list: Iterable of label sequences, paired with ``sentences``
            by position.
        save_path: Destination file path; an existing file is overwritten.
        encoding: Text encoding of the output file. Set explicitly so that
            non-ASCII tokens are written the same way on every platform
            instead of depending on the locale default.
    """
    with open(save_path, "w", encoding=encoding) as f:
        for tokens, labels in zip(sentences, labels_list):
            # zip stops at the shorter sequence, so surplus tokens or labels
            # are not written.
            f.writelines(f"{tok}\t{lab}\n" for tok, lab in zip(tokens, labels))
            f.write("\n")


In [None]:
# NOTE(review): scratch expression (list concatenation) that nothing else in
# the visible notebook uses — looks like a leftover; consider removing.
[1, 2, 3] + [1, 2, 3]