# Step 3: Medical Entity Recognition and UMLS Linking
This step uses scispaCy to extract medical entities from the notes and link them to UMLS codes.

In [ ]:
import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker
import pandas as pd

# Load dataset
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv('../data/ehr_raw.csv')

# Load scispaCy model
# The 'en_core_sci_md' package must already be installed for spacy.load to find it.
nlp = spacy.load('en_core_sci_md')

# Add UMLS entity linker
# The linker is attached to the shared pipeline, so every nlp(...) call below
# runs it as well; the helper functions read ent._.umls_ents, which is
# presumably populated by this component.
# NOTE(review): passing a component instance to add_pipe is the spaCy v2
# calling convention; spaCy v3 expects a registered component name instead --
# confirm against the pinned spaCy/scispaCy versions.
linker = UmlsEntityLinker()
nlp.add_pipe(linker)

def extract_entities(text):
    """Return the entities found in *text* together with their UMLS candidates.

    Args:
        text: The note to analyse. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for missing
            cells) is treated as "no text".

    Returns:
        A list with one ``(entity_text, umls_ents)`` tuple per entity in
        ``doc.ents``, in document order. ``umls_ents`` is whatever the
        linker stored on ``ent._.umls_ents`` (presumably a list of
        candidate matches -- verify against the linker version in use).
        An empty list is returned for non-string input.
    """
    # Missing values would otherwise be handed to the pipeline, which only
    # accepts text; report them as "no entities" instead of failing.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    return [(ent.text, ent._.umls_ents) for ent in doc.ents]

def get_umls_codes(text):
    """Return the top UMLS code for each linked entity found in *text*.

    Args:
        text: The note to analyse. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for missing
            cells) is treated as "no text".

    Returns:
        A list of codes in document order, one per entity whose
        ``ent._.umls_ents`` is non-empty. Entities without any candidate
        are skipped, so the list may be shorter than ``doc.ents``. An
        empty list is returned for non-string input.
    """
    # Missing values would otherwise be handed to the pipeline, which only
    # accepts text; report them as "no codes" instead of failing.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    # The first candidate is taken as the best match; its first element is
    # the code itself (presumably candidates are ordered by score -- confirm
    # against the linker documentation).
    return [ent._.umls_ents[0][0] for ent in doc.ents if ent._.umls_ents]

# Example usage: annotate every diagnosis note with its entities and codes.
# Each helper runs the full pipeline itself, so every note is processed twice.
notes = df['diagnosis_notes']
df['entities'] = notes.apply(extract_entities)
df['umls_codes'] = notes.apply(get_umls_codes)

# Preview the first rows: raw note next to the derived columns.
preview_columns = ['diagnosis_notes', 'entities', 'umls_codes']
df[preview_columns].head()