# spaCy Intro: use your own datasets

In [None]:
### https://github.com/wjbmattingly/freecodecamp_spacy
import numpy as np 
import datasets

In [None]:
# !python -m spacy download en_core_web_md

### Exercise 1: Named Entity Recognition (NER) with spaCy
- Which entities did spaCy detect?
- Which important terms were missed?

In [None]:
import spacy

# Load the medium ("md") English pipeline; install it first with
# `python -m spacy download en_core_web_md`
nlp = spacy.load("en_core_web_md")

# Example medical text
text = "The patient was prescribed 5 mg of Prednisone in Zurich."

# Run the loaded pipeline on the text
doc = nlp(text)

# Print detected entities as "<surface text> <label>"
for ent in doc.ents:
    print(ent.text, ent.label_)



### Exercise 2: Custom Rule-Based Matcher
- Describe what the following code is doing.
- What are the problems with coding like that?

In [None]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add EntityRuler properly (v3 syntax).
# It is inserted *before* the statistical "ner" component, so the rule-based
# entities are set first.
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define patterns (each pattern is a list with one dict per token)
dosage_pattern = [{"LIKE_NUM": True}, {"LOWER": {"IN": ["mg", "ml", "g"]}}]
med_suffix_pattern = [{"TEXT": {"REGEX": "(?i).*(ine|ol|pril|sartan|mab)$"}}]
med_list_pattern = [{"LOWER": {"IN": ["prednisone", "ibuprofen", "paracetamol", "metformin"]}}]
route_pattern = [{"LOWER": {"IN": ["po", "iv", "im", "sc"]}}]
freq_pattern = [{"LOWER": {"IN": ["once", "twice"]}}, {"LOWER": {"IN": ["daily", "weekly"]}}]

# Add patterns; note that two different patterns feed the MEDICATION label
ruler.add_patterns([
    {"label": "DOSAGE", "pattern": dosage_pattern},
    {"label": "MEDICATION", "pattern": med_suffix_pattern},
    {"label": "MEDICATION", "pattern": med_list_pattern},
    {"label": "ROUTE", "pattern": route_pattern},
    {"label": "FREQUENCY", "pattern": freq_pattern},
])

# Example text
text = "The patient was prescribed 5 mg of Prednisone in Zurich. Later, they took 10 ml ibuprofen PO twice daily."
doc = nlp(text)

# Print each entity's quoted text (right-aligned to 12 columns) and its label
print("Entities:")
for ent in doc.ents:
    print(f"- {ent.text!r:>12}  ->  {ent.label_}")


### Exercise 3: Train a tiny custom NER model in spaCy
- Run the code and inspect output.
- Try adding 5–6 new entity types (e.g., SYMPTOM, LAB_TEST, DEVICE, ROUTE, DURATION, FREQUENCY) with just 1–2 examples each to see how training reacts.
- Evaluate qualitatively: What does the model get right/wrong?

In [None]:
# Toy NER training set: (text, {"entities": [(start, end, label), ...]}).
# Offsets are character positions with `end` exclusive, so text[start:end]
# must be exactly the entity's surface form. Offsets that do not line up with
# token boundaries cannot be aligned by spaCy and are treated as missing
# annotations, so every span below has been checked against its text.
TRAIN_DATA = [
    ("The patient received 5 mg Prednisone.",
     {"entities": [(21, 25, "DOSAGE"), (26, 36, "DRUG")]}),
    ("Pain in the left knee improved after ibuprofen.",
     {"entities": [(12, 21, "BODY_PART"), (37, 46, "DRUG")]}),
    ("He was given 2 ml epinephrine IM.",
     {"entities": [(13, 17, "DOSAGE"), (18, 29, "DRUG")]}),
    ("CT showed a 2 cm lesion in the liver.",
     {"entities": [(12, 16, "MEASUREMENT"), (31, 36, "BODY_PART")]}),
    ("Administer 10 mg morphine intravenously.",
     {"entities": [(11, 16, "DOSAGE"), (17, 25, "DRUG")]}),
    ("MRI confirmed swelling in the brain.",
     {"entities": [(30, 35, "BODY_PART")]}),
    ("Patient reported headache, treated with aspirin.",
     {"entities": [(17, 25, "BODY_PART"), (40, 47, "DRUG")]}),
    ("She received 250 mg amoxicillin for 5 days.",
     {"entities": [(13, 19, "DOSAGE"), (20, 31, "DRUG")]}),
    ("X-ray revealed fracture in the right arm.",
     {"entities": [(31, 40, "BODY_PART")]}),
    ("The doctor prescribed 20 mg omeprazole daily.",
     {"entities": [(22, 27, "DOSAGE"), (28, 38, "DRUG")]}),
    ("Ultrasound detected a 5 cm cyst in the kidney.",
     {"entities": [(22, 26, "MEASUREMENT"), (39, 45, "BODY_PART")]}),
    ("Patient complained of chest pain, given nitroglycerin.",
     {"entities": [(22, 27, "BODY_PART"), (40, 53, "DRUG")]}),
    ("Treatment started with 8 mg dexamethasone.",
     {"entities": [(23, 27, "DOSAGE"), (28, 41, "DRUG")]}),
    ("Examination revealed tumor in the stomach.",
     {"entities": [(34, 41, "BODY_PART")]}),
    ("She was prescribed 50 mg sertraline at night.",
     {"entities": [(19, 24, "DOSAGE"), (25, 35, "DRUG")]}),
]

In [None]:
import random

import spacy
from spacy.training.example import Example

# from train_data import TRAIN_DATA

# Start from an empty English pipeline and attach a fresh NER component
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register every label that occurs in the annotations
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations.get("entities"):
        ner.add_label(label)

# Pair each tokenized text with its gold annotations
examples = [
    Example.from_dict(nlp.make_doc(text), ann) for text, ann in TRAIN_DATA
]

# Training loop: 20 shuffled passes, one example per update step,
# with the accumulated losses reported after every 5th epoch
optimizer = nlp.initialize()
for epoch in range(20):
    random.shuffle(examples)
    losses = {}
    for ex in examples:
        nlp.update([ex], sgd=optimizer, losses=losses)
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}, Losses: {losses}")

# Try the freshly trained model on an unseen sentence
test_text = "The nurse gave 10 mg morphine for arm pain."
doc = nlp(test_text)
print([(ent.text, ent.label_) for ent in doc.ents])


### Extras with spaCy

In [None]:
### https://spacy.io/api/annotation#pos-tagging
import spacy
from spacy import displacy

# The `doc` left over from Exercise 3 comes from a blank pipeline that only has
# an NER component: it has no sentence boundaries (doc.sents raises) and no
# dependency parse to draw. Re-parse the sentence with a full pipeline first.
nlp = spacy.load("en_core_web_md")
doc = nlp("The nurse gave 10 mg morphine for arm pain.")
sentence1 = list(doc.sents)[0]
displacy.render(doc, style="dep")

In [None]:
import requests
import spacy

def fetch_wikipedia_extract(title: str, lang: str = "de") -> str:
    """Return the plain-text extract of a Wikipedia article.

    Sends a ``query``/``extracts`` request for ``title`` to the MediaWiki API
    of the ``lang`` Wikipedia (redirects enabled) and returns the ``extract``
    field of the first page in the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        ValueError: if the response is not JSON, or if the first page carries
            no ``extract``.
    """
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"
    query = {
        "action": "query",
        "prop": "extracts",
        "explaintext": True,
        "redirects": 1,
        "titles": title,
        "format": "json",
        "formatversion": 2,
    }
    response = requests.get(
        endpoint,
        params=query,
        headers={"User-Agent": "YourAppName/1.0 (contact@example.com)"},
        timeout=15,
    )
    response.raise_for_status()  # raises HTTPError on 4xx/5xx

    # The API is expected to answer with JSON; anything else is reported
    # together with the start of the body to ease debugging.
    content_type = response.headers.get("Content-Type", "")
    if "application/json" not in content_type:
        preview = response.text[:300].replace("\n", " ")
        raise ValueError(f"Unexpected content-type: {content_type}. First bytes: {preview!r}")

    pages = response.json().get("query", {}).get("pages", [])
    if pages and "extract" in pages[0]:
        return pages[0]["extract"]
    raise ValueError(f"No extract found for title={title!r}")

# Fetch an English article so the text matches the English pipeline loaded
# below; previously a German article was analysed with an English model,
# which yields meaningless tags and entities.
text = fetch_wikipedia_extract("Natural language processing", lang="en")

nlp = spacy.load("en_core_web_md")

doc = nlp(text)
print("Tokens:", len(doc))
print("First 200 characters:", text[:200])


In [None]:
# Render the current `doc` with its named entities highlighted
# (`displacy` must already be imported by an earlier cell)
displacy.render(doc, style="ent")

In [None]:
import os 
# Show the current working directory (useful for building the file path below)
os.getcwd() 

In [None]:
### Replace path
import spacy

# Read the file with an explicit encoding: without it, open() falls back to the
# platform default, which differs between systems.
# NOTE(review): assumes the file is UTF-8 — adjust if its XML declaration says otherwise.
with open("/Users/sym3/weine-xml-2024.xml", "r", encoding="utf-8") as f:
    text = f.read()

# Process the raw file content with the pipeline loaded in an earlier cell
doc = nlp(text)
sentence1 = list(doc.sents)[0]

In [None]:
# Display the first sentence of the document
sentence1

In [None]:
# Display the word vector of the first token in that sentence
sentence1[0].vector

In [None]:
import numpy as np

your_word = "cat"

# Look up the word's vector and wrap it in a list so the query is a
# 2-D array with a single row
query = np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]])
ms = nlp.vocab.vectors.most_similar(query, n=10)

# ms[0][0] holds the keys of the nearest entries for the single query row;
# map them back to strings
words = [nlp.vocab.strings[key] for key in ms[0][0]]
distances = ms[2]
print(words)

In [None]:
# Display the raw keys of the most similar entries (before string lookup)
ms[0][0]

In [None]:
# Two short documents to compare
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))

In [None]:
# Similarity of tokens and spans
french_fries = doc1[2:4]  # span covering tokens 2-3 of doc1 ("salty fries")
burgers = doc1[5]  # single token at index 5 ("hamburgers")
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

In [None]:
### spaCy pipeline: inspect the components of the current `nlp` object
nlp.analyze_pipes()

In [None]:
### NER with rules
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
# The matcher shares the pipeline's vocabulary
matcher = Matcher(nlp.vocab)
# One-token pattern: any token that looks like an e-mail address
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])
doc = nlp("This is an email address: wmattingly@aol.com")
# Run the matcher over the doc; the result is inspected in the next cells
matches = matcher(doc)

In [None]:
# Show the raw match results
print (matches)

In [None]:
# Look up the first element of the first match in the vocab to get its text
print (nlp.vocab[matches[0][0]].text)

In [None]:
# Slice the doc with the second and third elements of the first match
# (used here as start and end token indices) to get the matched text
print(doc[matches[0][1]:matches[0][2]])

In [None]:
### RegEx
import re
# Matches "<1-2 digits> <Month>" or "<Month> <1-2 digits>"
pattern = r"(((\d){1,2}( (January|February|March|April|May|June|July|August|September|October|November|December)))|(((January|February|March|April|May|June|July|August|September|October|November|December) )(\d){1,2}))"
text = "This is a date February 2. Another date would be 14 August."
# Because the pattern contains capture groups, findall returns one tuple of
# all group values per match rather than a plain string
matches = re.findall(pattern, text)
print (matches)

In [None]:
text = "This is a date February 2. Another date would be 14 August."
# finditer yields match objects lazily instead of building a list
iter_matches = re.finditer(pattern, text)
print (iter_matches)  # prints the iterator object itself, not the matches
for hit in iter_matches:
    print (hit)

In [None]:
text = "This is a date February 2. Another date would be 14 August."
iter_matches = re.finditer(pattern, text)
# Use each match's character offsets to slice the matched text out of `text`
for hit in iter_matches:
    start = hit.start()
    end = hit.end()
    print (text[start:end])

In [None]:
import re
import spacy
from spacy.tokens import Span

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
# "Paul" followed by a space and a capitalised word
pattern = r"Paul [A-Z]\w+"

# Blank pipeline: tokenizer only, no NER component
nlp = spacy.blank("en")
doc = nlp(text)

# Print the entities of the doc built from `text` (previously this ran before
# `doc` was rebuilt and therefore showed the previous cell's document).
# A blank pipeline sets no entities, so this prints an empty tuple.
print(doc.ents)
original_ents = list(doc.ents)

In [None]:
# Turn every regex hit into a token-aligned span; char_span() gives None when
# the character offsets do not fall on token boundaries, and those are skipped
mwt_ents = [
    (span.start, span.end, span.text)
    for span in (doc.char_span(*m.span()) for m in re.finditer(pattern, doc.text))
    if span is not None
]

### Inject the Spans into the doc.ents
original_ents.extend(
    Span(doc, start, end, label="PERSON") for start, end, _name in mwt_ents
)

In [None]:
# Overwrite the doc's entities with the extended list, then list them
doc.ents = original_ents
for ent in doc.ents:
    print (ent.text, ent.label_)

In [None]:
### Custom component
from spacy.language import Language
@Language.component("paul_ner")
def paul_ner(doc):
    """Pipeline component that tags "Paul <Capitalised word>" as PERSON.

    Every regex hit in ``doc.text`` that maps onto a token span is appended
    to the entities already present on ``doc``. No overlap filtering is done
    here. Returns the same ``doc``.
    """
    pattern = r"Paul [A-Z]\w+"
    # char_span() gives None for hits that do not map onto a token span
    candidates = (
        doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text)
    )
    person_ents = [
        Span(doc, span.start, span.end, label="PERSON")
        for span in candidates
        if span is not None
    ]
    ### Inject the Spans into the doc.ents
    doc.ents = list(doc.ents) + person_ents
    return doc
    

In [None]:
# Blank pipeline with only the custom component
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")
doc2 = nlp2(text)
# Print the entities produced by the component (previously this printed the
# older `doc`, so the component's output was never shown)
print(doc2.ents)

In [None]:
### Custom component with filter for overlaps
from spacy.language import Language
from spacy.util import filter_spans
@Language.component("holly_ner")
def holly_ner(doc):
    """Pipeline component that tags "Hollywood" as CINEMA, resolving overlaps.

    Regex hits that map onto a token span are appended to the entities already
    present on ``doc``; the combined list is passed through ``filter_spans``
    before being assigned back to ``doc.ents``. Returns the same ``doc``.
    """
    pattern = r"Hollywood"
    # char_span() gives None for hits that do not map onto a token span
    candidates = (
        doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text)
    )
    cinema_ents = [
        Span(doc, span.start, span.end, label="CINEMA")
        for span in candidates
        if span is not None
    ]
    ### Inject the Spans into the doc.ents
    doc.ents = filter_spans(list(doc.ents) + cinema_ents)
    return doc

In [None]:
# Pretrained small English pipeline with the custom component appended at the end
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("holly_ner")
doc3 = nlp3(text)
print(doc3.ents)

In [None]:
# List each entity of doc3 with its label
for ent in doc3.ents:
    print (ent.text, ent.label_)