### Part 1: Identify the relevant sections of the text to send to the LLM for MaXO term extraction

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Fetch the NLTK resources used by this notebook (already-present packages
# are reported as up-to-date and not downloaded again):
#   punkt / punkt_tab -> sentence and word tokenizer models; newer NLTK
#                        releases look up "punkt_tab" instead of "punkt",
#                        so both are requested to work across versions
#   wordnet / omw-1.4 -> WordNet data queried for the synonym expansion
for resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /Users/niyone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niyone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/niyone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Starting vocabulary: hand-picked terms considered relevant to MaXO.
# Kept as a list so the expansion step visits the seeds in a fixed order.
seed_keywords = [
    "treatment",
    "diagnosis",
    "therapy",
    "surgery",
    "medication",
]

In [4]:
# Grow the seed list with WordNet synonyms: every lemma of every noun synset
# of every seed word is added, with WordNet's underscores turned into spaces
# (e.g. "operating_room" -> "operating room"). All noun senses are included,
# not only the medical ones.
expanded_keywords = set(seed_keywords) | {
    lemma.name().replace('_', ' ')
    for seed in seed_keywords
    for synset in wn.synsets(seed, pos=wn.NOUN)
    for lemma in synset.lemmas()
}

In [7]:
# Display the expanded keyword set (seed words plus their WordNet noun lemmas)
expanded_keywords

{'OR',
 'diagnosing',
 'diagnosis',
 'discourse',
 'discussion',
 'handling',
 'intervention',
 'medicament',
 'medication',
 'medicinal drug',
 'medicine',
 'operating room',
 'operating theater',
 'operating theatre',
 'operation',
 'surgery',
 'surgical operation',
 'surgical procedure',
 'surgical process',
 'therapy',
 'treatment'}

In [11]:
# Example input: a paragraph standing in for text from a PubMed article.
# The triple-quoted literal starts with a newline, so the first extracted
# sentence carries a leading "\n".
pubmed_paragraph = """
In recent studies, therapy for acute myeloid leukemia (AML) has shown promising results. 
The diagnosis often involves a combination of blood tests, bone marrow examination, and genetic studies. 
Current treatment includes chemotherapy, targeted therapy, and stem cell transplantation. 
Early surgical intervention is not typically indicated; however, supportive medications can be essential.
"""

In [12]:
def segment_text(text):
    """Split ``text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; returns whatever list of
    sentence strings the tokenizer produces for ``text``.
    """
    sentences = sent_tokenize(text)
    return sentences

def _has_token_run(tokens, phrase_tokens):
    """Return True when ``phrase_tokens`` occurs as a contiguous run in ``tokens``."""
    width = len(phrase_tokens)
    if width == 0:
        # An empty / whitespace-only keyword never counts as a match.
        return False
    return any(
        tokens[start:start + width] == phrase_tokens
        for start in range(len(tokens) - width + 1)
    )


def contains_keywords(sentence, keywords):
    """Tell whether ``sentence`` mentions at least one of ``keywords``.

    Matching is done on whole tokens produced by ``word_tokenize``, so a
    keyword only matches complete words ("therapy" does not match
    "chemotherapy") and no stemming is applied ("medication" does not match
    "medications").

    Args:
        sentence: The sentence to inspect.
        keywords: Iterable of keyword strings. A keyword may consist of
            several whitespace-separated words (e.g. "operating room"); it
            then has to appear as consecutive tokens in the sentence.

    Returns:
        True if any keyword is found, otherwise False.

    Matching rules:
        * Keywords are compared case-insensitively against the tokens of the
          lower-cased sentence.
        * All-uppercase keywords (acronyms such as "OR") are compared
          case-sensitively against the tokens of the original sentence, so
          that "OR" does not match the ordinary word "or".
    """
    lowered_tokens = word_tokenize(sentence.lower())
    cased_tokens = None  # tokenised lazily; only acronym keywords need it

    for keyword in keywords:
        if keyword.isupper():
            # Acronym: keep the original casing on both sides of the comparison.
            if cased_tokens is None:
                cased_tokens = word_tokenize(sentence)
            if _has_token_run(cased_tokens, keyword.split()):
                return True
        elif _has_token_run(lowered_tokens, keyword.lower().split()):
            return True
    return False

def select_relevant_sentences(text, keywords):
    """Return the sentences of ``text`` that mention at least one keyword.

    ``text`` is split with ``segment_text`` and each sentence is tested with
    ``contains_keywords``; the sentences that pass are returned as a list, in
    the order they appear in ``text``.
    """
    relevant = []
    for candidate in segment_text(text):
        if contains_keywords(candidate, keywords):
            relevant.append(candidate)
    return relevant

In [13]:
# Run the keyword filter over the sample paragraph and print the sentences
# that mention at least one expanded keyword.
relevant_sentences = select_relevant_sentences(
    text=pubmed_paragraph,
    keywords=expanded_keywords,
)
print(relevant_sentences)

['\nIn recent studies, therapy for acute myeloid leukemia (AML) has shown promising results.', 'The diagnosis often involves a combination of blood tests, bone marrow examination, and genetic studies.', 'Current treatment includes chemotherapy, targeted therapy, and stem cell transplantation.', 'Early surgical intervention is not typically indicated; however, supportive medications can be essential.']
