# 3. Symptom Vocabulary & NLP Pipeline

This notebook implements the NLP pipeline for processing user-reported symptoms.

## Objectives
1. Build canonical symptom vocabulary
2. Implement text preprocessing
3. Create spell correction system
4. Implement symptom extraction
5. Save vocabulary and pipeline
6. Validate the pipeline on sample inputs

In [None]:
# Install required packages
!pip install spacy symspellpy rapidfuzz sentence-transformers pandas numpy
!python -m spacy download en_core_web_md

In [None]:
import pandas as pd
import numpy as np
import spacy
import json
from pathlib import Path
from symspellpy import SymSpell, Verbosity
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Sentence-embedding model (its encode() output drives semantic matching below)
model = SentenceTransformer('all-MiniLM-L6-v2')

# spaCy English pipeline (used below for tokenisation and lemmatisation)
nlp = spacy.load('en_core_web_md')

## 1. Build Symptom Vocabulary

In [None]:
def build_symptom_vocabulary(df):
    """Create canonical symptom vocabulary from dataset columns.

    Every column after the first one is treated as a symptom column; the
    first column is assumed to hold the 'disease' label and is skipped.

    Args:
        df (pd.DataFrame): Processed dataset

    Returns:
        dict: Maps each original symptom column name to a dict with keys
            'clean_name' (lower-cased name, underscores replaced by spaces),
            'embedding' (sentence embedding of the clean name, as a list),
            'frequency' (column sum converted to int) and
            'alternatives' (empty list, placeholder for common variations).
    """
    # Get symptom columns (all except the leading 'disease' column)
    symptom_cols = df.columns[1:].tolist()
    clean_names = [symptom.lower().replace('_', ' ') for symptom in symptom_cols]

    # Encode every name in a single batched call instead of invoking the
    # model once per symptom; skip the model entirely when there is nothing
    # to encode.
    embeddings = model.encode(clean_names) if clean_names else []

    return {
        symptom: {
            'clean_name': clean_name,
            'embedding': embedding.tolist(),
            'frequency': int(df[symptom].sum()),
            'alternatives': [],  # Will be filled with common variations
        }
        for symptom, clean_name, embedding in zip(symptom_cols, clean_names, embeddings)
    }

# Read the processed dataset from disk and derive the symptom vocabulary from it
df = pd.read_csv('../data/processed/processed_data.csv')
symptom_vocab = build_symptom_vocabulary(df)

print("Built vocabulary with", len(symptom_vocab), "symptoms")

## 2. Text Preprocessing

In [None]:
def preprocess_text(text):
    """Preprocess user input text.

    Lower-cases the input, runs it through the spaCy pipeline, discards stop
    words, punctuation and whitespace tokens, and joins the lemmas of the
    remaining tokens with single spaces.

    Args:
        text (str): Raw user input

    Returns:
        str: Space-separated lemmas (empty string if no token survives)
    """
    # Lower-case first, then tokenize with spaCy
    doc = nlp(text.lower())

    # Whitespace tokens (created by repeated spaces or newlines) are neither
    # stop words nor punctuation, so they have to be filtered explicitly;
    # otherwise they show up as stray blanks in the joined result.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return ' '.join(lemmas)

# Quick demonstration of the preprocessing step on a sample sentence
test_text = "I've been having severe headaches and feeling nauseous for the past few days"
print("Original:", test_text)
print("Preprocessed:", preprocess_text(test_text))

## 3. Spell Correction

In [None]:
def initialize_spell_checker():
    """Build a SymSpell dictionary from the symptom vocabulary.

    For every vocabulary entry, each word of its clean name and then the
    complete clean name are registered with a count of 1.

    Returns:
        SymSpell: Spell checker created with max_dictionary_edit_distance=2
    """
    checker = SymSpell(max_dictionary_edit_distance=2)

    for entry in symptom_vocab.values():
        phrase = entry['clean_name']
        # Individual words first, followed by the full phrase
        for term in (*phrase.split(), phrase):
            checker.create_dictionary_entry(term, 1)

    return checker

def correct_spelling(text, spell_checker, max_edit_distance=2):
    """Correct spelling in text, one whitespace-separated word at a time.

    The tolerated edit distance grows with word length (one edit per four
    characters, capped at ``max_edit_distance``). This is a heuristic: when
    the dictionary only holds a small domain vocabulary, allowing two edits
    on a short word rewrites ordinary words into unrelated dictionary terms
    (e.g. a four-letter word is two edits away from many five-letter terms).
    Words shorter than four characters are therefore never modified.

    Args:
        text (str): Input text
        spell_checker: SymSpell instance
        max_edit_distance (int): Upper bound on the edit distance used for
            lookups; should not exceed the distance the SymSpell dictionary
            was built with.

    Returns:
        str: Text with corrected spelling, words joined by single spaces
    """
    corrected = []

    for word in text.split():
        allowed_distance = min(max_edit_distance, len(word) // 4)

        # With no edits allowed a lookup could only return the word itself,
        # so keep the word unchanged and skip the lookup.
        if allowed_distance <= 0:
            corrected.append(word)
            continue

        suggestions = spell_checker.lookup(
            word, Verbosity.CLOSEST, max_edit_distance=allowed_distance
        )
        # Keep the original word when the dictionary offers nothing close enough
        corrected.append(suggestions[0].term if suggestions else word)

    return ' '.join(corrected)

# Build the spell checker once; the cells below reuse this instance
spell_checker = initialize_spell_checker()

# Demonstrate correction on a sentence containing deliberate typos
test_text = "i have hedache and fver"
corrected = correct_spelling(test_text, spell_checker)
print("Original:", test_text)
print("Corrected:", corrected)

## 4. Symptom Extraction

In [None]:
def extract_symptoms(text, threshold_fuzzy=85, threshold_semantic=0.72):
    """Extract symptoms from text using fuzzy and semantic matching.

    Each vocabulary entry is first compared to the text with
    ``fuzz.partial_ratio``; only if that score is below ``threshold_fuzzy``
    is the dot product between the text embedding and the stored symptom
    embedding checked against ``threshold_semantic``.

    Args:
        text (str): Preprocessed input text
        threshold_fuzzy (int): Threshold for fuzzy matching
        threshold_semantic (float): Threshold for semantic similarity

    Returns:
        tuple: (matched symptoms, binary vector) where matched symptoms is a
            list of (symptom, 'fuzzy' | 'semantic', score) tuples and the
            binary vector has one entry per vocabulary symptom, in
            vocabulary order, set to 1 for every match.
    """
    matched_symptoms = []

    # Create binary vector
    binary_vector = np.zeros(len(symptom_vocab))

    # Blank text (e.g. preprocessing removed every token) carries no symptom
    # information; embedding and scoring it could only yield spurious matches.
    if not text or not text.strip():
        return matched_symptoms, binary_vector

    text_embedding = model.encode([text])[0]

    for idx, (symptom, data) in enumerate(symptom_vocab.items()):
        clean_name = data['clean_name']

        # Try fuzzy matching first
        fuzzy_score = fuzz.partial_ratio(clean_name, text)
        if fuzzy_score >= threshold_fuzzy:
            matched_symptoms.append((symptom, 'fuzzy', fuzzy_score))
            binary_vector[idx] = 1
            continue

        # Fall back to semantic matching.
        # NOTE(review): a plain dot product equals cosine similarity only if
        # the embeddings are unit-normalised — confirm for the model in use.
        semantic_score = np.dot(text_embedding, data['embedding'])
        if semantic_score >= threshold_semantic:
            matched_symptoms.append((symptom, 'semantic', semantic_score))
            binary_vector[idx] = 1

    return matched_symptoms, binary_vector

# End-to-end demonstration: preprocess -> spell-correct -> extract
test_input = "I have a severe headache and high fever, feeling very weak"

# Clean the raw sentence, then fix typos
processed = preprocess_text(test_input)
corrected = correct_spelling(processed, spell_checker)

# Match the cleaned text against the symptom vocabulary
matched, vector = extract_symptoms(corrected)

print(f"Input: {test_input}")
print()
print("Matched Symptoms:")
for symptom, method, score in matched:
    print("- {} ({}, score: {:.2f})".format(symptom, method, score))

## 5. Save Vocabulary and Pipeline

In [None]:
def save_nlp_pipeline():
    """Persist the symptom vocabulary and its embeddings to disk.

    Writes the vocabulary metadata (every field except the embeddings) to
    ``../data/processed/symptom_vocab.json`` and the embeddings, stacked in
    vocabulary order, to ``../data/processed/symptom_embeddings.npy``.
    """
    # Embeddings are left out of the JSON file and stored as a numpy array instead
    metadata_keys = ('clean_name', 'frequency', 'alternatives')
    vocab_json = {
        name: {key: entry[key] for key in metadata_keys}
        for name, entry in symptom_vocab.items()
    }

    with open('../data/processed/symptom_vocab.json', 'w') as f:
        json.dump(vocab_json, f, indent=2)

    # One row per symptom, in the same order as the vocabulary
    embedding_rows = [entry['embedding'] for entry in symptom_vocab.values()]
    np.save('../data/processed/symptom_embeddings.npy', np.array(embedding_rows))

    print(
        "Saved NLP pipeline components:",
        "- Symptom vocabulary: symptom_vocab.json",
        "- Symptom embeddings: symptom_embeddings.npy",
        sep="\n",
    )

# Write the vocabulary JSON and the embeddings array to disk
save_nlp_pipeline()

## 6. Pipeline Validation

In [None]:
def validate_pipeline():
    """Run the full pipeline over a fixed set of sample inputs.

    Each sample is preprocessed, spell-corrected and passed to symptom
    extraction; the corrected text and every match are printed. An empty
    sample is reported as having no symptoms and skipped.
    """
    samples = (
        "I have a terrible headache and fever",
        "feeling dizzy and nautious",  # contains a deliberate misspelling
        "my throat is sore and i'm coughing a lot",
        "experiencing chest pain and shortness of breath",
        "",  # empty input
    )

    print("Testing NLP Pipeline:\n")

    for sample in samples:
        print(f"Input: {sample}")

        if sample:
            # Preprocess, then spell-correct, then extract
            cleaned = correct_spelling(preprocess_text(sample), spell_checker)
            hits, _ = extract_symptoms(cleaned)

            print(f"↳ Corrected: {cleaned}")
            print("↳ Matched Symptoms:")
            for name, how, score in hits:
                print(f"  - {name} ({how}, {score:.2f})")
            print()
        else:
            print("↳ No symptoms provided\n")

# Run the pipeline on the built-in sample inputs and print the results
validate_pipeline()