This notebook demonstrated 3 ways to perform medical named entity recognition from a pdf and save retrieved entities in a file
NER Way 1 :-using spacy model

In [None]:
import fitz
import spacy

nlp = spacy.load('en_core_web_sm')

## Clean the text
def clean_and_tokenize(text):
    doc = nlp(text)
    clean_text = " ".join(token.text for token in doc if not token.is_space)
    return clean_text

def pdf_to_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        return None

def save_text_to_file(text, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Extracted text has been saved to '{output_file}'")
    except Exception as e:
        print(f"Error saving text to file: {str(e)}")

def count_words_in_text(text):
    if text:
        word_count = len(text.split())
        print(f"Total words in the extracted text: {word_count}")
    else:
        print("Cannot count words. Text is empty or extraction failed.")

# Example usage:
pdf_path = '/content/sample_data/patients.pdf'  # Replace with your PDF file path
output_file = '/content/sample_data/extracted_text.txt'

# Extract text from PDF
extracted_text = pdf_to_text(pdf_path)
cleaned_text = clean_and_tokenize(extracted_text)

# Save extracted text to a file
if cleaned_text:
    save_text_to_file(cleaned_text, output_file)

# Perform operations on the extracted text
count_words_in_text(cleaned_text)


Extracted text has been saved to '/content/sample_data/extracted_text.txt'
Total words in the extracted text: 314


In [None]:
Way 2 : Using open source model from Huggingface

In [None]:

def read_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        return text
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except IOError as e:
        print(f"Error reading file: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")

file_path = '/content/sample_data/extracted_text.txt'  # Replace with your file path
file_text = read_file(file_path)

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all", model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
result = pipe(file_text)


grouped_entities = {}

# Group named entities by labels
for entity in result:
    label = entity["entity_group"]
    word = entity["word"]
    if label not in grouped_entities:
        grouped_entities[label] = []
    grouped_entities[label].append(word)

# Print named entities grouped by labels
for label, entities in grouped_entities.items():
    print(label + "_ =", ", ".join(entities))




Severity_ = severe
Sign_symptom_ = di, ##zziness, symptoms, rash, it, ##ching, swelling, symptoms, fever, symptoms, dark, adverse effects, short, symptoms, ins, tremor, anxiety, symptoms, headache, symptoms
Medication_ = ibuprofen, am, at, ##orvastat, met, ##op, met, ##oprol, al, ##pr, ##azolam, top
Time_ = within two hours
Duration_ = three days, the course of a week, three days, a few days, a two - week period, four - week, one week, two weeks
Biological_structure_ = skin, muscle, chest

Way 3 :-Using GLiNER

In [21]:
from gliner import GLiNER

# Initialize GLiNER with the base model
model = GLiNER.from_pretrained("urchade/gliner_large_bio-v0.1")

#Sample text for entity prediction
text = file_text

# Labels for entity prediction
labels = ["Disease","Symptoms","Drug","Medical treatment"] # for v2.1 use capital case for better performance

# Perform entity prediction
entities = model.predict_entities(text, labels, threshold=0.5)

# Initialize dictionaries to store entities by label
predicted_entities = {label: [] for label in labels}

# Group entities by their labels
for entity in entities:
    label = entity["label"]
    predicted_entities[label].append(entity["text"])

# Display predicted entities grouped by labels
for label, entities_list in predicted_entities.items():
    if entities_list:
        print(f"{label}s:")
        for entity in entities_list:
            print(f"- {entity}")
        print()

Diseases:
- cardiac conditions

Symptomss:
- nausea
- vomiting
- headache
- dizziness
- skin rash
- itching
- abdominal pain
- diarrhea
- fever
- muscle cramps
- weakness
- dark urine
- palpitations
- shortness of breath
- chest pain
- insomnia
- agitation
- tremors
- blurred vision
- confusion

Drugs:
- Ibuprofen
- Penicillin
- Amoxicillin
- Atorvastatin
- Metoprolol
- Metoprolol
- Alprazolam
- Topiramate

Medical treatments:
- cholesterol management
- anxiety management
- migraine prophylaxis



Diseases:
- cardiac conditions

Symptomss:
- nausea
- vomiting
- headache
- dizziness
- skin rash
- itching
- abdominal pain
- diarrhea
- fever
- muscle cramps
- weakness
- dark urine
- palpitations
- shortness of breath
- chest pain
- insomnia
- agitation
- tremors
- blurred vision
- confusion