### Part 1: Identify the relevant sections of the text to send to an LLM for extracting MaXO terms

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Ensure required NLTK downloads are completed.
# Recent NLTK releases load the 'punkt_tab' resource for sent_tokenize/word_tokenize
# instead of the pickled 'punkt' models, so both are fetched to work on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet itself plus the Open Multilingual WordNet data used by the WordNet corpus reader.
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /Users/niyone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niyone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/niyone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Seed vocabulary for spotting MaXO-relevant sentences; each entry is later
# expanded with its WordNet noun synonyms.
seed_keywords = [
    "treatment",
    "diagnosis",
    "therapy",
    "surgery",
    "medication",
]

In [4]:
# Grow the keyword set with WordNet synonyms: start from the seeds themselves,
# then add every lemma name of every noun synset of each seed
# (WordNet joins multi-word lemmas with '_', which is turned back into a space).
expanded_keywords = set(seed_keywords)
expanded_keywords.update(
    lemma.name().replace('_', ' ')
    for keyword in seed_keywords
    for synset in wn.synsets(keyword, pos=wn.NOUN)
    for lemma in synset.lemmas()
)

In [5]:
# Inspect the expanded set. Note that the WordNet expansion also brings in
# multi-word lemmas (e.g. 'operating room'), the abbreviation 'OR', and senses
# unrelated to medical actions (e.g. 'discourse', 'discussion').
expanded_keywords

{'OR',
 'diagnosing',
 'diagnosis',
 'discourse',
 'discussion',
 'handling',
 'intervention',
 'medicament',
 'medication',
 'medicinal drug',
 'medicine',
 'operating room',
 'operating theater',
 'operating theatre',
 'operation',
 'surgery',
 'surgical operation',
 'surgical procedure',
 'surgical process',
 'therapy',
 'treatment'}

In [6]:
# Sample input: the title followed by the structured abstract
# (BACKGROUND / STUDY DESIGN AND METHODS / RESULTS / CONCLUSION) of one PubMed record.
sample_pubmed_paragraph = """
Red blood cell mechanical sensitivity improves in patients with sickle cell disease undergoing chronic transfusion after prolonged, 
subhemolytic shear exposure.BACKGROUND: Sickle cell disease (SCD) is a genetically inherited hemoglobinopathy in which deoxygenated hemoglobin S polymerizes, 
leading to stiff red blood cells (RBCs) and inefficient microcirculatory blood flow. Transfusion therapy acts as primary and secondary prevention of 
ischemic stroke in SCD. Whether blood transfusion alters the mechanical sensitivity (MS) of RBCs to prolonged subhemolytic shear stress (shear) is unknown. 
We hypothesized that individuals with SCD undergoing chronic blood transfusion would have improved sensitivity to shear, compared with patients not 
undergoing transfusion therapy. STUDY DESIGN AND METHODS: Blood suspensions from individuals with SCD not receiving (n = 15) and receiving (n = 15) 
chronic simple transfusion were conditioned to shear (1, 4, 16, 32, and 64 Pa) for various durations (1, 4, 16, 32, and 64 sec), and then deformability of
RBCs was immediately measured. Healthy young controls (n = 15) were included for reference. A surface mesh was interpolated using the data to determine the
effect of blood transfusion on MS of RBCs. RESULTS: There was impaired RBC deformability to prolonged supraphysiologic shear in both SCD groups; however, MS 
improved in transfused patients when exposed to prolonged physiologic shear. Furthermore, in the transfused patients with SCD, the threshold above which
subhemolytic damage occurs was similar to controls. CONCLUSION: We found that chronic transfusion therapy normalizes the MS threshold above which RBC 
subhemolytic damage occurs after prolonged shear exposure in SCD. An important and novel finding in transfused patients with SCD was the improvement in RBC
deformability in response to prolonged shear exposure over the physiologic range.
"""

In [7]:
# Sentence segmentation helper
def segment_text(text):
    """Split `text` into sentences with NLTK's `sent_tokenize` and return them as given by it."""
    sentences = sent_tokenize(text)
    return sentences

# Function to check if a sentence contains any of the keywords
def contains_keywords(sentence, keywords, acronyms_case_sensitive=True):
    """Return True if `sentence` mentions at least one entry of `keywords`.

    Matching is done on NLTK word tokens, not on raw substrings, so "therapy"
    does not match inside "chemotherapy".

    Args:
        sentence: The sentence to inspect (str).
        keywords: Iterable of keyword strings. A keyword may consist of several
            whitespace-separated words (e.g. "operating room"); it then has to
            appear as a contiguous run of tokens.
        acronyms_case_sensitive: When True (default), a keyword written entirely
            in upper case (e.g. "OR") is compared against the original-case
            tokens, so it is not confused with the ordinary word "or". Pass
            False to compare every keyword case-insensitively.

    Returns:
        bool: True on the first keyword found, False if none match (including
        for an empty sentence or an empty keyword collection).
    """
    def _has_phrase(tokens, phrase):
        # True when the word list `phrase` occurs as a contiguous run in `tokens`.
        width = len(phrase)
        if width == 0:
            return False
        if width == 1:
            return phrase[0] in tokens
        return any(tokens[start:start + width] == phrase
                   for start in range(len(tokens) - width + 1))

    # Tokenise the lower-cased sentence for the case-insensitive comparison.
    lowered_tokens = word_tokenize(sentence.lower())
    # Original-case tokens are only needed for acronyms, so build them lazily.
    cased_tokens = None

    for keyword in keywords:
        if acronyms_case_sensitive and keyword.isupper():
            if cased_tokens is None:
                cased_tokens = word_tokenize(sentence)
            if _has_phrase(cased_tokens, keyword.split()):
                return True
        elif _has_phrase(lowered_tokens, keyword.lower().split()):
            return True
    return False

# Keep only the sentences that mention a keyword
def select_relevant_sentences(text, keywords):
    """Segment `text` into sentences and return, in order, those for which
    `contains_keywords(sentence, keywords)` is true."""
    relevant = []
    for candidate in segment_text(text):
        if contains_keywords(candidate, keywords):
            relevant.append(candidate)
    return relevant

In [9]:
# Filter the sample abstract down to the sentences that mention an expanded keyword
relevant_sentences = select_relevant_sentences(
    text=sample_pubmed_paragraph,
    keywords=expanded_keywords,
)

In [10]:
# Display the sentences selected from the sample paragraph
relevant_sentences

['Transfusion therapy acts as primary and secondary prevention of \nischemic stroke in SCD.',
 'We hypothesized that individuals with SCD undergoing chronic blood transfusion would have improved sensitivity to shear, compared with patients not \nundergoing transfusion therapy.',
 'CONCLUSION: We found that chronic transfusion therapy normalizes the MS threshold above which RBC \nsubhemolytic damage occurs after prolonged shear exposure in SCD.']