In [None]:
import re
import json
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the preprocessing cells. 'punkt' and
# 'stopwords' are downloaded only when the local lookup fails; 'punkt_tab'
# is requested unconditionally (and stays the last expression of the cell).
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Biomedical abbreviations -> long form substituted by expand_abbreviations()
bio_abbreviations = dict(
    AD="Alzheimer's disease",
    MI="myocardial infarction",
    HTN="hypertension",
    DM="diabetes mellitus",
    CHF="congestive heart failure",
    COPD="chronic obstructive pulmonary disease",
    RA="rheumatoid arthritis",
    MS="multiple sclerosis",
    ASMD="ASM-deficient Niemann-Pick disease",
    # Extend with further abbreviations as needed
)

# Correct drug spelling -> every spelling seen for it (the correct one included)
drug_spelling_corrections = {
    "acetaminophen": [
        "acetaminophen", "acetaminophine", "acetaminofin",
    ],
    "ibuprofen": [
        "ibuprofen", "ibuprofin", "ibuprophen",
    ],
    "amoxicillin": [
        "amoxicillin", "amoxicilin", "amoxicillan",
    ],
    # Extend with further drugs as needed
}

# Inverted lookup used by correct_drug_spelling(): misspelling -> correct name.
# The correct spelling itself is left out so it never maps onto itself.
drug_spelling_map = {
    misspelling: proper_name
    for proper_name, spellings in drug_spelling_corrections.items()
    for misspelling in spellings
    if misspelling != proper_name
}

In [None]:
def fix_encoding_issues(text):
    """
    Fix common encoding issues in text.

    Replaces a fixed set of stray characters (presumably left over from
    Windows-1252 text decoded with the wrong codec) and decodes the three
    HTML entities ``&amp;``, ``&lt;`` and ``&gt;`` exactly once.

    Args:
        text (str): The input text with potential encoding issues

    Returns:
        str: Text with fixed encoding issues
    """
    # Applied in order. '&amp;' MUST come after '&lt;' / '&gt;': decoding it
    # first would turn an escaped entity such as '&amp;lt;' into '&lt;' and
    # then into '<', i.e. decode the text twice.
    replacements = (
        ('\x92', "'"),    # Right single quotation mark
        ('\x93', '"'),    # Left double quotation mark
        ('\x94', '"'),    # Right double quotation mark
        ('\x96', '-'),    # En dash
        ('\x97', '-'),    # Em dash
        ('\xa0', ' '),    # Non-breaking space
        ('&lt;', '<'),    # HTML less than
        ('&gt;', '>'),    # HTML greater than
        ('&amp;', '&'),   # HTML ampersand (last, see note above)
    )

    for old, new in replacements:
        text = text.replace(old, new)

    return text


In [None]:
def standardize_punctuation(text, keep_punctuation=True):
    """
    Standardize punctuation in text.

    With ``keep_punctuation=True`` the text is normalized in four steps:
    runs of dashes collapse to one dash; each of ``, . ; : ! ?`` is padded
    with spaces when it is NOT directly preceded by an ASCII letter or digit;
    parentheses are padded with spaces; whitespace runs collapse to a single
    space. Punctuation that directly follows a letter or digit (e.g. the
    comma in "word, next") is therefore left attached.

    With ``keep_punctuation=False`` every character in ``string.punctuation``
    is deleted and nothing else is changed (whitespace is not collapsed).

    Args:
        text (str): The input text
        keep_punctuation (bool): Whether to keep biomedically relevant punctuation

    Returns:
        str: Text with standardized punctuation, stripped of leading/trailing
        whitespace
    """
    if not keep_punctuation:
        # Drop punctuation entirely (not recommended for biomedical NER)
        without_punct = text.translate(str.maketrans('', '', string.punctuation))
        return without_punct.strip()

    # "--" / "---" -> "-" (single dashes are untouched)
    spaced = re.sub(r'-+', '-', text)

    # Pad the mark only when the character before it is not alphanumeric
    for mark in (',', '.', ';', ':', '!', '?'):
        spaced = re.sub(f'(?<![A-Za-z0-9]){re.escape(mark)}', f' {mark} ', spaced)

    # Parentheses always become separate, space-delimited tokens
    spaced = re.sub(r'\(', ' ( ', spaced)
    spaced = re.sub(r'\)', ' ) ', spaced)

    # Collapse the extra whitespace introduced above
    return re.sub(r'\s+', ' ', spaced).strip()

In [None]:
def normalize_case(text, preserve_case=False):
    """
    Lower-case the text unless the caller asks to keep its case.

    Args:
        text (str): The input text
        preserve_case (bool): When True the text is returned untouched

    Returns:
        str: ``text`` itself if ``preserve_case`` is true, else ``text.lower()``
    """
    return text if preserve_case else text.lower()

In [None]:
def expand_abbreviations(text, abbreviations=None):
    """
    Expand common biomedical abbreviations.

    Every whole word (``\\b\\w+\\b``) whose lower-cased form is a key of the
    abbreviation table is replaced by the table's expansion. Matching is
    case-insensitive, so an ordinary lower-case word that happens to equal an
    abbreviation (e.g. "ms") is expanded as well.

    The text is rewritten in a single pass, which means:
      * text produced by one expansion is never expanded again, and
      * expansions are inserted literally (backslashes in an expansion are
        not interpreted as regex replacement escapes).

    Args:
        text (str): The input text
        abbreviations (dict, optional): Dictionary of abbreviations and their
            expanded forms. Defaults to the module-level ``bio_abbreviations``,
            looked up at call time so that re-running the cell that defines
            the table takes effect without redefining this function.

    Returns:
        str: Text with expanded abbreviations
    """
    if abbreviations is None:
        abbreviations = bio_abbreviations

    # Case-insensitive lookup table (all keys lower-cased)
    abbrev_lower = {k.lower(): v for k, v in abbreviations.items()}

    def _expand(match):
        # Unknown words are returned unchanged
        word = match.group(0)
        return abbrev_lower.get(word.lower(), word)

    return re.sub(r'\b\w+\b', _expand, text)

In [None]:
def _match_case(template, replacement):
    """Re-case ``replacement`` like ``template``: ALL CAPS, Capitalized, or unchanged."""
    if template.isupper():
        return replacement.upper()
    if template[0].isupper():
        return replacement.capitalize()
    return replacement


def correct_drug_spelling(text, spelling_map=None):
    """
    Correct common misspellings of drug names.

    The text is split on whitespace. A token that is itself a known
    misspelling is replaced outright; otherwise each word inside the token is
    checked, so misspellings with attached punctuation ("ibuprofin," or
    "(ibuprofin)") are corrected too while the punctuation is kept. The
    replacement follows the case pattern of the word it replaces.

    Args:
        text (str): The input text
        spelling_map (dict, optional): Dictionary mapping lower-case misspelled
            drugs to their correct spelling. Defaults to the module-level
            ``drug_spelling_map``, looked up at call time.

    Returns:
        str: Text with corrected drug spellings; tokens are re-joined with
        single spaces, so runs of whitespace are collapsed
    """
    if spelling_map is None:
        spelling_map = drug_spelling_map

    def _fix_word(match):
        word = match.group(0)
        correct = spelling_map.get(word.lower())
        return word if correct is None else _match_case(word, correct)

    corrected_tokens = []
    for token in text.split():
        lower_token = token.lower()
        if lower_token in spelling_map:
            # Whole token is a known misspelling (also covers keys that
            # contain non-word characters)
            corrected_tokens.append(_match_case(token, spelling_map[lower_token]))
        else:
            # Look inside the token so surrounding punctuation does not hide
            # a misspelled word
            corrected_tokens.append(re.sub(r'\b\w+\b', _fix_word, token))
    return ' '.join(corrected_tokens)

In [None]:
def remove_stopwords(text, standard_stopwords, custom_stopwords=None):
    """
    Drop stopword tokens from text.

    The text is tokenized with ``word_tokenize``; a token is discarded when
    its lower-cased form is in ``standard_stopwords`` or equals the
    lower-cased form of any entry in ``custom_stopwords``. Surviving tokens
    keep their original case and are joined with single spaces.

    Args:
        text (str): The input text
        standard_stopwords (set): Set of standard stopwords (compared against
            the lower-cased token)
        custom_stopwords (list, optional): List of custom stopwords, matched
            case-insensitively

    Returns:
        str: Text with stopwords removed
    """
    tokens = word_tokenize(text)

    # An absent/empty custom list simply contributes nothing to the filter
    extra_stopwords = frozenset(
        entry.lower() for entry in custom_stopwords
    ) if custom_stopwords else frozenset()

    kept_tokens = []
    for token in tokens:
        lowered = token.lower()
        if lowered in standard_stopwords or lowered in extra_stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)



In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, preserve_case=False, keep_punctuation=True,
                   remove_stops=True, custom_stopwords=None):
    """
    Apply all preprocessing steps to the input text.

    Steps, in order: fix encoding issues, standardize punctuation, expand
    abbreviations, correct drug spellings, normalize case, optionally remove
    stopwords, and finally collapse whitespace.

    Args:
        text (str): The input text
        preserve_case (bool): Whether to preserve the original case
        keep_punctuation (bool): Whether to keep biomedically relevant punctuation
        remove_stops (bool): Whether to remove stopwords
        custom_stopwords (list): Custom stopwords to remove

    Returns:
        str: Fully preprocessed text; "" when ``text`` is empty or not a string
    """
    if not text or not isinstance(text, str):
        return ""

    # Apply preprocessing steps in sequence
    text = fix_encoding_issues(text)
    text = standardize_punctuation(text, keep_punctuation)

    # Abbreviations and spellings are handled before lower-casing so the
    # case-preserving logic in correct_drug_spelling sees the original case
    text = expand_abbreviations(text)
    text = correct_drug_spelling(text)

    text = normalize_case(text, preserve_case)

    # Remove stopwords if specified. The English list is cached because this
    # function is called once per string field when a whole JSON is processed.
    if remove_stops:
        text = remove_stopwords(text, _english_stopwords(), custom_stopwords)

    # Ensure clean whitespace
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
def process_json_file(input_file_path, output_file_path, fields_to_process=None,
                     preserve_case=False, keep_punctuation=True, remove_stops=True):
    """
    Clean the text fields of one JSON file and write the result to a new file.

    When ``fields_to_process`` is given, only those top-level keys are cleaned
    (and only if their value is a string); the untouched value is kept under
    ``"<field>_original"``. Otherwise the whole document is handed to
    ``process_json_object``. The result is written as UTF-8 JSON with a
    4-space indent and a confirmation line is printed.

    NOTE(review): a later cell of this notebook defines another
    ``process_json_file(file_path)``; once that cell has run, this definition
    is shadowed.

    Args:
        input_file_path (str): Path to the input JSON file
        output_file_path (str): Path to save the processed JSON file
        fields_to_process (list): List of specific fields to process
        preserve_case (bool): Whether to preserve the original case
        keep_punctuation (bool): Whether to keep biomedically relevant punctuation
        remove_stops (bool): Whether to remove stopwords
    """
    # Section-heading words removed in addition to the standard stopwords
    custom_stopwords = [
        "Warnings", "Precautions", "Use", "Specific", "Populations",
        "see", "contraindications", "indications", "dosage", "administration",
        "adverse", "reactions", "drug", "interactions", "clinical", "studies"
    ]

    # Same keyword arguments for every preprocessing call below
    options = {
        'preserve_case': preserve_case,
        'keep_punctuation': keep_punctuation,
        'remove_stops': remove_stops,
        'custom_stopwords': custom_stopwords,
    }

    with open(input_file_path, 'r', encoding='utf-8') as source:
        data = json.load(source)

    if not fields_to_process:
        # No field list: clean every string found anywhere in the document
        data = process_json_object(data, **options)
    else:
        for field in fields_to_process:
            if field not in data or not isinstance(data[field], str):
                continue
            raw_value = data[field]
            data[f"{field}_original"] = raw_value
            data[field] = preprocess_text(raw_value, **options)

    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)

    print(f"Processed JSON saved to {output_file_path}")

In [None]:
def process_json_object(obj, preserve_case=False, keep_punctuation=True,
                       remove_stops=True, custom_stopwords=None):
    """
    Recursively walk a JSON value and preprocess the string values of dicts.

    For every dict entry whose value is a string, the result holds the cleaned
    text under the same key and the untouched text under ``"<key>_original"``.
    Nested dicts and lists are walked recursively. Anything else is returned
    unchanged, including strings that are list items or the top-level value.

    Args:
        obj: JSON object (dict, list, or primitive value)
        preserve_case (bool): Whether to preserve the original case
        keep_punctuation (bool): Whether to keep biomedically relevant punctuation
        remove_stops (bool): Whether to remove stopwords
        custom_stopwords (list): Custom stopwords to remove

    Returns:
        The processed JSON object (new dicts/lists; the input is not modified)
    """
    options = dict(
        preserve_case=preserve_case,
        keep_punctuation=keep_punctuation,
        remove_stops=remove_stops,
        custom_stopwords=custom_stopwords,
    )

    if isinstance(obj, list):
        return [process_json_object(item, **options) for item in obj]

    if not isinstance(obj, dict):
        # Primitive value: nothing to do
        return obj

    processed = {}
    for key, value in obj.items():
        if isinstance(value, str):
            processed[f"{key}_original"] = value
            processed[key] = preprocess_text(value, **options)
        else:
            processed[key] = process_json_object(value, **options)
    return processed

In [None]:
# Raw label JSON to clean, and the path the cleaned copy is written to
input_file = "sample_data/0bdf77ae-3639-49c1-b7c7-533f9d073084.json"
output_file = "sample_data/0bdf77ae-3639-49c1-b7c7-533f9d073084_clean.json"

# Only these top-level sections are cleaned; other keys are written back unchanged
fields_to_process = [
    "contraindications",
    "indications",
    "warningsAndPrecautions",
    "adverseReactions",
]

# Clean the file: lower-case the text, keep punctuation, drop stopwords
process_json_file(
    input_file,
    output_file,
    fields_to_process=fields_to_process,
    preserve_case=False, keep_punctuation=True, remove_stops=True,
)

Processed JSON saved to sample_data/0bdf77ae-3639-49c1-b7c7-533f9d073084_clean.json


In [23]:
from owlready2 import *
import json
import re
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

# Make sure the tokenizer and POS-tagger resources are present locally;
# each one is downloaded only if the lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [41]:
# Fetch two more NLTK packages. Unlike the try/except-guarded downloads in the
# imports cell, these calls run unconditionally on every execution.
# NOTE(review): presumably these are the resource names that newer NLTK releases
# look up for the tokenizers and pos_tag — confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [24]:
# Load the ORDO ontology
# The OWL file is addressed by a remote orphadata.com URL, so this cell
# presumably needs network access — TODO confirm whether owlready2 caches it.
# `onto` is used by the next cell to build the disease-term lookup.
print("Loading ORDO ontology...")
onto = get_ontology("https://www.orphadata.com/data/ontologies/ordo/last_version/ORDO_en_4.6.owl").load()
print("Ontology loaded successfully.")

Loading ORDO ontology...
Ontology loaded successfully.


In [25]:
# Lookup table: lower-cased label or synonym -> IRI of the ontology class.
# Built once so later matching is a dict lookup instead of an ontology query.
# A name shared by several classes keeps the IRI of the class visited last.
disease_terms = {}
for cls in onto.classes():
    # Labels first ...
    class_names = list(cls.label) if hasattr(cls, "label") and cls.label else []

    # ... then whichever synonym properties the class exposes
    for prop_name in ("hasExactSynonym", "hasRelatedSynonym", "hasNarrowSynonym"):
        if hasattr(cls, prop_name):
            class_names.extend(getattr(cls, prop_name))

    for class_name in class_names:
        disease_terms[class_name.lower()] = cls.iri

In [26]:
print(f"Loaded {len(disease_terms)} disease terms from ontology")

Loaded 15579 disease terms from ontology


In [27]:
def extract_drug_names(text):
    """
    Collect candidate drug names from text with rule-based regex patterns.

    Candidates are words ending in one of a fixed set of drug-class suffixes
    (matched case-sensitively), CamelCase words, and any Capitalized word.
    Matches of 3 characters or fewer and a short list of common non-drug
    words are dropped.

    Args:
        text (str): The text to scan

    Returns:
        list: Unique matches (case-sensitive), ordered by pattern and then by
        position in the text
    """
    patterns = (
        # Drug-class suffixes
        r'\b\w+(?:mab|ximab|zumab|umab)\b',  # Monoclonal antibodies
        r'\b\w+(?:tinib|pib|nib|fib)\b',  # Kinase inhibitors
        r'\b\w+(?:olol)\b',  # Beta blockers
        r'\b\w+(?:pril|sartan)\b',  # ACE inhibitors and ARBs
        r'\b\w+(?:oxacin|cycline|cillin)\b',  # Antibiotics
        r'\b\w+(?:zepam|azepam|azolam)\b',  # Benzodiazepines
        r'\b\w+(?:statin)\b',  # Statins
        r'\b\w+(?:conazole)\b',  # Antifungals
        r'\b\w+(?:zosin)\b',  # Alpha blockers
        r'\b\w+(?:dipine|pazil)\b',  # Calcium channel blockers
        r'\b\w+(?:barb)\b',  # Barbiturates
        r'\b\w+(?:navir)\b',  # HIV protease inhibitors
        r'\b\w+(?:setron)\b',  # 5-HT3 antagonists
        # Capitalization cues
        r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b',  # CamelCase drug names
        r'\b[A-Z][a-z]+\b',  # Capitalized words (potential brand names)
    )

    # Frequent words that fit a pattern above but are not drugs
    non_drug_words = {
        'section', 'system', 'central', 'nervous', 'treatment',
        'therapy', 'usage', 'patients', 'studies', 'clinical',
        'indication', 'contraindication', 'reaction',
    }

    candidates = [
        hit
        for pattern in patterns
        for hit in re.findall(pattern, text)
        if len(hit) > 3 and hit.lower() not in non_drug_words
    ]

    # dict keys keep first-seen order, which de-duplicates without reordering
    return list(dict.fromkeys(candidates))

In [65]:
def extract_disease_entities(text):
    """
    Extract disease mentions from biomedical text.

    Two sources are combined: (1) case-insensitive regex patterns run over the
    whole text, and (2) per sentence, runs of two or more consecutive
    adjective/noun tokens (POS tags starting with JJ or NN) that contain one
    of a fixed list of disease indicator words.

    Args:
        text (str): The text to scan

    Returns:
        list: Mentions longer than 3 characters, de-duplicated
        case-insensitively, regex hits first, then noun phrases in sentence
        order
    """
    sentences = sent_tokenize(text)

    # Generic patterns for disease entities that handle hyphenated terms
    disease_patterns = [
        r'(?:invasive|severe)\s+[\w-]+\s+infections?',
        r'[\w-]+\s+(?:disease|disorder|syndrome|deficiency)',
        r'(?:acute|chronic)\s+[\w-]+\s+[\w-]+',
        r'[\w-]+(?:-versus-[\w-]+)?\s+disease',
        r'[\w-]+\s+malignancies',
        r'invasive\s+(?:[\w-]+)\s+infections?',
        r'[\w-]+\s+leukemia',
        r'[\w-]+\s+lymphoma',
        r'[\w-]+\s+lysis\s+syndrome',
        r'(?:hyper|hypo)[\w-]+emia'
    ]

    # Words that mark a noun phrase as a disease candidate (substring match)
    disease_indicators = ('disease', 'disorder', 'syndrome', 'infection',
                          'deficiency', 'malignancy', 'cancer', 'leukemia',
                          'lymphoma', 'transplant', 'neutropenia')

    # 1) Pattern hits over the full text
    mentions = []
    for pattern in disease_patterns:
        mentions.extend(re.findall(pattern, text, re.IGNORECASE))

    # 2) Noun-phrase candidates, sentence by sentence
    for sentence in sentences:
        # Shield hyphenated terms from the tokenizer, then restore the hyphen
        shielded = re.sub(r'(\w+)-(\w+)', r'\1_HYPHEN_\2', sentence)
        tokens = [tok.replace('_HYPHEN_', '-') for tok in word_tokenize(shielded)]

        # Collect maximal runs of adjective/noun tokens; keep multi-word runs
        phrases = []
        run = []
        for word, tag in pos_tag(tokens):
            if tag.startswith(('JJ', 'NN')):
                run.append(word)
                continue
            if len(run) > 1:
                phrases.append(' '.join(run))
            run = []
        if len(run) > 1:
            phrases.append(' '.join(run))

        for phrase in phrases:
            lowered_phrase = phrase.lower()
            if any(indicator in lowered_phrase for indicator in disease_indicators):
                mentions.append(phrase)

    # Keep the first spelling of each mention, compared case-insensitively
    unique_diseases = []
    seen_lowered = set()
    for mention in mentions:
        normalized = mention.lower().strip()
        if len(normalized) > 3 and normalized not in seen_lowered:
            unique_diseases.append(mention)
            seen_lowered.add(mention.lower())

    return unique_diseases

In [60]:
def find_disease_in_ontology(disease_name, disease_terms_dict, min_similarity=0.5):
    """
    Find a disease in the ontology using the prebuilt dictionary.

    An exact (case-insensitive, whitespace-trimmed) match wins. Otherwise
    every term that contains the name, or is contained in it, is scored by
    ``len(shorter) / len(longer)`` and the best-scoring term is returned if
    its score is strictly above ``min_similarity``. Ties keep the term that
    comes first in the dictionary.

    Args:
        disease_name (str): Disease mention to look up
        disease_terms_dict (dict): Lower-cased term -> identifier (IRI)
        min_similarity (float): Score a substring match must exceed to be
            accepted (default 0.5)

    Returns:
        The identifier stored for the matched term, or None if nothing matches
    """
    search_name = disease_name.strip().lower()
    if not search_name:
        # A blank name is a substring of every term; never treat it as a match
        return None

    # Try exact match first
    if search_name in disease_terms_dict:
        return disease_terms_dict[search_name]

    # Substring matching: track the best score instead of sorting all hits
    best_iri = None
    best_score = 0.0
    for term, iri in disease_terms_dict.items():
        if not term:
            continue
        if search_name in term:
            score = len(search_name) / len(term)
        elif term in search_name:
            score = len(term) / len(search_name)
        else:
            continue
        # Strict '>' keeps the earliest term among equal scores
        if score > best_score:
            best_iri, best_score = iri, score

    if best_iri is not None and best_score > min_similarity:
        return best_iri

    return None

In [61]:
def _text_field(data, key):
    """Return ``data[key]`` if it is a string, else '' (missing, null or non-text values)."""
    value = data.get(key, '')
    return value if isinstance(value, str) else ''


def _link_diseases(diseases, terms_dict):
    """Pair each disease mention with its ontology identifier, skipping unmatched mentions."""
    linked = []
    for disease in diseases:
        disease_id = find_disease_in_ontology(disease, terms_dict)
        if disease_id:
            linked.append((disease, disease_id))
    return linked


def process_json_file(file_path):
    """
    Process a single JSON file to extract drugs and diseases.

    Reads the "name", "indications" and "contraindications" entries of the
    file, extracts drug-name and disease candidates from the two text
    sections, and links the diseases to the module-level ``disease_terms``
    lookup. Missing, null or non-string entries are treated as empty text.

    NOTE(review): this reuses the name of the cleaning function
    ``process_json_file(input_file_path, output_file_path, ...)`` defined in an
    earlier cell and shadows it once this cell has run.

    Args:
        file_path (str): Path of the JSON file (read as UTF-8, matching how
            the cleaning step writes its output)

    Returns:
        dict: Keys 'file', 'main_drug', 'indication_drugs',
        'contraindication_drugs', 'indication_diseases',
        'indication_disease_ids', 'contraindication_diseases' and
        'contraindication_disease_ids'; the ``*_ids`` values are lists of
        (mention, identifier) tuples
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract sections of interest
    indications = _text_field(data, "indications")
    contraindications = _text_field(data, "contraindications")

    # Extract the main drug name
    main_drug = _text_field(data, 'name').strip()

    # Extract disease mentions once; they are both reported and linked
    indications_diseases = extract_disease_entities(indications)
    contraindications_diseases = extract_disease_entities(contraindications)

    return {
        'file': os.path.basename(file_path),
        'main_drug': main_drug,
        'indication_drugs': extract_drug_names(indications),
        'contraindication_drugs': extract_drug_names(contraindications),
        'indication_diseases': indications_diseases,
        'indication_disease_ids': _link_diseases(indications_diseases, disease_terms),
        'contraindication_diseases': contraindications_diseases,
        'contraindication_disease_ids': _link_diseases(contraindications_diseases, disease_terms)
    }

In [None]:
def process_directory(directory_path):
    """
    Process all JSON files in a directory.

    Every entry of ``directory_path`` whose name ends in '.json' is passed to
    ``process_json_file``. Sub-directories are not searched.

    Args:
        directory_path (str): Directory containing the JSON files

    Returns:
        list: One result per JSON file, ordered by file name
    """
    results = []
    # os.listdir() returns names in arbitrary order; sorting makes the result
    # order reproducible across runs and machines
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            results.append(process_json_file(file_path))
    return results

In [66]:
input_file = "sample_data/0cf064d0-cf65-4112-8817-ed864f16233e_clean.json"
result = process_json_file(input_file)

In [67]:
print(f"Main drug: {result['main_drug']}")

Main drug: XENPOZYME


In [47]:
print("\nIndication drugs:")
for drug in result['indication_drugs']:
    print(f"- {drug}")


Indication drugs:


In [48]:
print("\nContraindication drugs:")
for drug in result['contraindication_drugs']:
    print(f"- {drug}")


Contraindication drugs:


In [68]:
# Print the four disease lists of `result`. The "with IDs" lists hold
# (mention, ontology identifier) pairs; the others hold plain mentions.
report_sections = (
    ("\nIndication diseases:", 'indication_diseases', False),
    ("\nIndication diseases with IDs:", 'indication_disease_ids', True),
    ("\nContraindication diseases:", 'contraindication_diseases', False),
    ("\nContraindication diseases with IDs:", 'contraindication_disease_ids', True),
)
for heading, result_key, has_ids in report_sections:
    print(heading)
    for entry in result[result_key]:
        if has_ids:
            disease, disease_id = entry
            print(f"- {disease}: {disease_id}")
        else:
            print(f"- {entry}")

# To process all files in a directory:
# results = process_directory("/path/to/your/json/files")


Indication diseases:
- sphingomyelinase deficiency
- niemann-pick disease
- asm-deficient niemann-pick disease

Indication diseases with IDs:
- sphingomyelinase deficiency: http://www.orpha.net/ORDO/Orphanet_618899
- niemann-pick disease: http://www.orpha.net/ORDO/Orphanet_646

Contraindication diseases:

Contraindication diseases with IDs:


In [69]:
input_file2 = "sample_data/0cf064d0-cf65-4112-8817-ed864f16233e.json"
result = process_json_file(input_file2)

In [70]:
print(f"Main drug: {result['main_drug']}")

Main drug: XENPOZYME


In [71]:
# Same report as for the cleaned file, now for the raw file's `result`.
# The "with IDs" lists hold (mention, ontology identifier) pairs.
report_sections = (
    ("\nIndication diseases:", 'indication_diseases', False),
    ("\nIndication diseases with IDs:", 'indication_disease_ids', True),
    ("\nContraindication diseases:", 'contraindication_diseases', False),
    ("\nContraindication diseases with IDs:", 'contraindication_disease_ids', True),
)
for heading, result_key, has_ids in report_sections:
    print(heading)
    for entry in result[result_key]:
        if has_ids:
            disease, disease_id = entry
            print(f"- {disease}: {disease_id}")
        else:
            print(f"- {entry}")


Indication diseases:
- sphingomyelinase deficiency
- acid sphingomyelinase deficiency

Indication diseases with IDs:
- sphingomyelinase deficiency: http://www.orpha.net/ORDO/Orphanet_618899
- acid sphingomyelinase deficiency: http://www.orpha.net/ORDO/Orphanet_618899

Contraindication diseases:

Contraindication diseases with IDs:
