# spaCy

```bash
pip install spacy==3.7.2 --only-binary=:all:
python -m spacy download de_core_news_lg
```

In [1]:
import spacy
# Load the large German spaCy pipeline; the following cells call it through the name `nlp`.
nlp = spacy.load("de_core_news_lg")

### Test if it works

In [2]:
# Smoke test: run the pipeline on one German sentence and print its place entities.
text = "Ich bin letzte Woche nach Zürich und Luzern gereist."
doc = nlp(text)

for ent in doc.ents:
    # Anything that is not labelled as a location is skipped.
    if ent.label_ not in ("LOC", "GPE"):
        continue
    print(ent.text, ent.label_)

Zürich LOC
Luzern LOC


In [5]:
import pandas as pd
from rich import print
from rich.console import Console
from rich.table import Table
from rich.rule import Rule

# Number of rows drawn for the manual inspection below.
number_of_samples = 100

# CSV holding the extracted keywords.
csv_path = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\02_extracted_keywords_data.csv"
df = pd.read_csv(csv_path)

# Keep only rows that have keywords, then draw a reproducible random sample.
rows_with_keywords = df.dropna(subset=["combined_DE_keywords"])
sampled_df = rows_with_keywords.sample(n=number_of_samples, random_state=42)

# Function to extract location phrases
def extract_location_phrases(text, nlp_model=None):
    """Return the location tokens found in ``text``, keeping a leading qualifier.

    Every token whose entity type is ``LOC`` or ``GPE`` yields one entry. When
    the token directly before it is one of the qualifier words (for example
    "Kanton"), the entry is "<qualifier> <token>"; otherwise it is the token
    text alone. The loop works per token, so an entity that spans several
    tokens yields one entry per token.

    Args:
        text: The string to analyse.
        nlp_model: Optional callable that turns ``text`` into an indexable
            sequence of tokens exposing ``.text`` and ``.ent_type_``. When
            omitted, the notebook-level ``nlp`` object is looked up at call
            time.

    Returns:
        list[str]: Location phrases in order of appearance.
    """
    # Lower-cased words that qualify the place name following them.
    # "gebiet" used to be written with a leading space (" gebiet"), which can
    # never equal a lower-cased token text, so that qualifier was dead.
    qualifiers = ("stadt", "kanton", "region", "gebiet", "land", "bezirk")

    analyse = nlp if nlp_model is None else nlp_model
    doc = analyse(text)
    locations = []

    for i, token in enumerate(doc):
        if token.ent_type_ not in ("LOC", "GPE"):
            continue
        # i > 0 guards the look-behind for the very first token.
        if i > 0 and doc[i - 1].text.lower() in qualifiers:
            locations.append(f"{doc[i - 1].text} {token.text}")
        else:
            locations.append(token.text)
    return locations

# Process each sampled line
# Build one record per sampled row: the raw texts plus the locations found in each.
# NOTE(review): str() turns a missing value (NaN) into the literal text "nan",
# which is then analysed and displayed like real content.
results = []
for _, row in sampled_df.iterrows():
    kw_text = str(row["combined_DE_keywords"])
    desc_text = str(row.get("dataset_description_DE_preprocessed", ""))
    record = dict(
        title=str(row.get("dataset_title_DE_preprocessed", "")),
        keyword_text=kw_text,
        keyword_locations=", ".join(extract_location_phrases(kw_text)),
        description_text=desc_text,
        description_locations=", ".join(extract_location_phrases(desc_text)),
    )
    results.append(record)


# Display output nicely using rich table
console = Console(width=400)
table = Table(show_header=True, header_style="bold red", show_lines=True)

# (header, key in the result record, extra column options) in display order.
column_specs = [
    ("Title", "title", {"width": 40}),
    ("Keywords", "keyword_text", {"width": 40}),
    ("Locations (Keywords)", "keyword_locations", {"style": "bold", "width": 30}),
    ("Description", "description_text", {"width": 40}),
    ("Locations (Description)", "description_locations", {"style": "bold", "width": 30}),
]
for header, _, options in column_specs:
    table.add_column(header, **options)

# One table row per processed record.
for item in results:
    table.add_row(*[item[key] for _, key, _ in column_specs])

console.print(table)


  df = pd.read_csv(csv_path)


### Add places to model

In [2]:
#all_locations = []
import csv
import json

input_csv = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\swissnames3d_2024_2056\swissNAMES3D_PLY.csv"
output_jsonl = "gpe_patterns.jsonl"

# Turn every non-empty NAME of the semicolon-separated CSV into one JSON line:
# a "GPE" pattern with one lower-cased LOWER token per whitespace-separated word.
with open(input_csv, newline='', encoding='utf-8') as csvfile, \
        open(output_jsonl, "w", encoding="utf-8") as jsonlfile:
    for row in csv.DictReader(csvfile, delimiter=';'):
        tokens = row["NAME"].split()
        if not tokens:
            # Empty or whitespace-only name: nothing to match.
            continue
        entry = {
            "label": "GPE",
            "pattern": [{"LOWER": token.lower()} for token in tokens],
        }
        jsonlfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"JSONL file created: {output_jsonl}")


In [6]:
# Only certain types of names
import csv
import json
import pandas as pd

# Input/output paths
input_csv = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\swissnames3d_2024_2056\swissNAMES3D_PLY.csv"
output_jsonl = "gpe_patterns.jsonl"

# Read the semicolon-separated CSV and keep only rows whose OBJEKTART is "Ort".
df = pd.read_csv(input_csv, sep=";")
filtered_df = df.loc[df["OBJEKTART"].isin(["Ort"])]
names = filtered_df["NAME"].dropna().unique()

# One JSON line per distinct name; this overwrites any existing pattern file.
with open(output_jsonl, "w", encoding="utf-8") as f:
    for name in names:
        words = name.split()
        if not words:
            # Whitespace-only name: nothing to match.
            continue
        entry = {"label": "GPE", "pattern": [{"LOWER": word.lower()} for word in words]}
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"✅ JSONL file created: {output_jsonl}")


In [7]:
import csv
import json
import pandas as pd
import random
import spacy
from rich import print
from rich.console import Console
from rich.table import Table

# --- Parameters ---
number_of_samples = 100
patterns_file = "gpe_patterns.jsonl"

# --- Load the main dataset and draw a reproducible sample of rows with keywords ---
csv_path = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\02_extracted_keywords_data.csv"
df = pd.read_csv(csv_path)
rows_with_keywords = df.dropna(subset=["combined_DE_keywords"])
sampled_df = rows_with_keywords.sample(n=number_of_samples, random_state=100)

# --- Step 3: Function to extract locations ---
def run_with_or_without_custom_places(use_custom_places=True):
    """Extract locations for every row of the notebook-level ``sampled_df``.

    A fresh ``de_core_news_lg`` pipeline is loaded on each call. With
    ``use_custom_places`` set, an entity ruler fed from the notebook-level
    ``patterns_file`` is inserted before the ``ner`` component.

    Args:
        use_custom_places: Whether to add the custom place patterns.

    Returns:
        list[dict]: One dict per sampled row with the keys ``title``,
        ``keywords``, ``desc``, ``loc_kw`` and ``loc_desc``; the last two are
        lists of location strings.
    """
    nlp_local = spacy.load("de_core_news_lg")

    if use_custom_places:
        ruler = nlp_local.add_pipe("entity_ruler", before="ner")
        with open(patterns_file, "r", encoding="utf-8") as f:
            # One JSON pattern per line.
            patterns = [json.loads(line) for line in f]
        ruler.add_patterns(patterns)

    def extract(text):
        """Return one entry per LOC/GPE token, prefixed by a preceding qualifier word."""
        doc = nlp_local(text)
        found = []
        for idx, tok in enumerate(doc):
            if tok.ent_type_ not in ("LOC", "GPE"):
                continue
            previous = doc[idx - 1].text if idx > 0 else None
            if previous is not None and previous.lower() in ("stadt", "kanton", "region"):
                found.append(f"{previous} {tok.text}")
            else:
                found.append(tok.text)
        return found

    records = []
    for _, row in sampled_df.iterrows():
        keywords = str(row["combined_DE_keywords"])
        desc = str(row.get("dataset_description_DE_preprocessed", ""))
        records.append(
            {
                "title": str(row.get("dataset_title_DE_preprocessed", "")),
                "keywords": keywords,
                "desc": desc,
                "loc_kw": extract(keywords),
                "loc_desc": extract(desc),
            }
        )
    return records

# --- Step 4: Run both versions (with & without custom places) ---
results_with = run_with_or_without_custom_places(use_custom_places=True)
results_without = run_with_or_without_custom_places(use_custom_places=False)

# --- Side-by-side table: keyword locations with vs. without the custom patterns ---
console = Console(width=280)
table = Table(show_header=True, header_style="bold red", show_lines=True)
for header, width in (
    ("Title", 30),
    ("Keywords", 30),
    ("LOCs (with)", 25),
    ("LOCs (without)", 25),
):
    table.add_column(header, width=width)

# Both result lists come from the same sampled rows, so they are paired by position.
for row_with, row_without in zip(results_with, results_without):
    cells = (
        row_with["title"],
        row_with["keywords"],
        ", ".join(row_with["loc_kw"]),
        ", ".join(row_without["loc_kw"]),
    )
    table.add_row(*cells)

console.print(table)

  df = pd.read_csv(csv_path)


In [4]:
# --- Load and sample CSV outside the function ---
import pandas as pd
import json
import spacy

csv_path = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\02_extracted_keywords_data.csv"
df = pd.read_csv(csv_path)
# Rows without keywords are dropped before the fixed-seed sample of 100 is drawn.
rows_with_keywords = df.dropna(subset=["combined_DE_keywords"])
sampled_df = rows_with_keywords.sample(n=100, random_state=42)

# --- Refactored extraction function for DataFrame ---
def _load_location_pipeline(patterns_file=None):
    """Load ``de_core_news_lg``; with a patterns file, add an entity ruler before ``ner``."""
    nlp_local = spacy.load("de_core_news_lg")

    if patterns_file:
        ruler = nlp_local.add_pipe("entity_ruler", before="ner")
        with open(patterns_file, "r", encoding="utf-8") as f:
            # One JSON pattern per line.
            patterns = [json.loads(line) for line in f]
        ruler.add_patterns(patterns)

    return nlp_local


def _locations_in_doc(doc):
    """Return one entry per LOC/GPE token, prefixed by a preceding qualifier word."""
    locations = []
    for i, token in enumerate(doc):
        if token.ent_type_ not in ("LOC", "GPE"):
            continue
        # i > 0 guards the look-behind for the very first token.
        if i > 0 and doc[i - 1].text.lower() in ("stadt", "kanton", "region"):
            locations.append(f"{doc[i - 1].text} {token.text}")
        else:
            locations.append(token.text)
    return locations


def extract_locations_from_df(df, patterns_file=None):
    """Extract locations from the keyword and description text of every row.

    Args:
        df: DataFrame with a ``combined_DE_keywords`` column; the columns
            ``dataset_title_DE_preprocessed`` and
            ``dataset_description_DE_preprocessed`` are read with an empty
            string as fallback.
        patterns_file: Optional path to a JSONL pattern file; when given, its
            patterns are loaded into an entity ruler placed before ``ner``.

    Returns:
        list[dict]: One dict per row with the keys ``title``, ``keywords``,
        ``desc``, ``loc_kw`` and ``loc_desc``.
    """
    # The pipeline is built once and reused for every row.
    nlp_local = _load_location_pipeline(patterns_file)

    results = []
    for _, row in df.iterrows():
        title = str(row.get("dataset_title_DE_preprocessed", ""))
        keywords = str(row["combined_DE_keywords"])
        desc = str(row.get("dataset_description_DE_preprocessed", ""))

        results.append({
            "title": title,
            "keywords": keywords,
            "desc": desc,
            "loc_kw": _locations_in_doc(nlp_local(keywords)),
            "loc_desc": _locations_in_doc(nlp_local(desc)),
        })

    return results

# --- New function to test a single sentence ---
def extract_locations_from_text(text, patterns_file=None):
    """Extract locations from a single string.

    Args:
        text: The text to analyse.
        patterns_file: Optional path to a JSONL pattern file, handled as in
            ``extract_locations_from_df``.

    Returns:
        list[str]: Location strings in order of appearance.
    """
    # A fresh pipeline is loaded on every call, as before.
    nlp_local = _load_location_pipeline(patterns_file)
    return _locations_in_doc(nlp_local(text))

# --- Example usage ---
results_with = extract_locations_from_df(sampled_df, patterns_file="gpe_patterns.jsonl")
results_without = extract_locations_from_df(sampled_df, patterns_file=None)

# --- Single-sentence check using the custom place patterns ---
example_text = "Ich fahre morgen nach Otelfingen, Böckten und Im Löh."
detected_locations = extract_locations_from_text(example_text, patterns_file="gpe_patterns.jsonl")
print("Detected locations:", detected_locations)


  df = pd.read_csv(csv_path)


Detected locations: ['morgen', 'Otelfingen', 'Böckten', 'Löh']


# bert-large-NER

```bash
pip install --upgrade transformers torch
pip install accelerate
```

Install the CPU-only builds of torch and torchvision (no GPU required):
```bash
pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cpu
```

In [6]:
import torchvision
# Show which torchvision installation the kernel actually imports.
print(torchvision.__file__)


In [7]:
#Test if torch and torchvision are installed
import torch
import torchvision

# Print the imported versions and the torchvision install location.
print("Torch:", torch.__version__)
print("Torchvision:", torchvision.__version__)
print("Torchvision path:", torchvision.__file__)

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

# aggregation_strategy="simple" is what the deprecated grouped_entities=True flag
# mapped to; it merges word pieces into whole entities with an "entity_group" key.
# NOTE: this rebinds the name `nlp`, which the spaCy section above also uses.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# example1 = "Silas wohnt in Mettmenstetten."
example2 = "Uzwil - GEP Einzugsgebiete"
# example = "inventar der filterbrunnen im landwirtschaftlichen gebiet  rhoneebene beinhaltet ebenfalls die für gärten genutzt werden landwirtschaft wasser ressourcen ressourcennutzung landwirtschaftliche anlagen und aquakulturanlagen wasserbrunnen wasserbedarf wassergewinnung fassung brunnen geodaten landwirtschaftszone."
ner_results = nlp(example2)

# One line per detected entity: group label, text and confidence score.
for entity in ner_results:
    print(f"{entity['entity_group']}: {entity['word']} ({entity['score']:.2f})")


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


# Compare BERT vs. spaCy

In [1]:
import csv
import json
import pandas as pd
import random
import spacy
from rich import print
from rich.console import Console
from rich.table import Table
from rich.text import Text
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [2]:
# --- Step 0: Load models ---
# spaCy pipeline used by the default (no custom patterns) extraction.
nlp_spacy = spacy.load("de_core_news_lg")

# Token-classification model and tokenizer for the transformers comparison.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True flag
# and groups word pieces into whole entities in the same way.
nlp_bert = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [3]:
# --- Step 1: Load and sample dataset ---
def load_sample(csv_path, number_of_samples, column="dataset_description_DE_preprocessed"):
    """Read a CSV and return a reproducible random sample of rows with text.

    Rows whose ``column`` value is missing are dropped before sampling. The
    sample size is capped by the number of rows that remain after that filter;
    capping by the unfiltered length made ``sample`` raise whenever fewer
    non-null rows than ``number_of_samples`` were left.

    Args:
        csv_path: Path of the CSV file to read.
        number_of_samples: Maximum number of rows to return.
        column: Column that must be non-null, e.g. ``"combined_DE_keywords"``
            to sample on keywords instead of descriptions.

    Returns:
        pandas.DataFrame: The sampled rows with a fresh 0..n-1 index.
    """
    df = pd.read_csv(csv_path)
    candidates = df.dropna(subset=[column])

    sample_size = min(number_of_samples, len(candidates))
    # Fixed seed so repeated runs inspect the same rows.
    sampled = candidates.sample(n=sample_size, random_state=100)

    return sampled.reset_index(drop=True)

In [4]:
# --- Step 2: spaCy without custom places ---
def extract_with_spacy_default(texts):
    """Extract locations from each text with the notebook-level ``nlp_spacy`` pipeline.

    Args:
        texts: Iterable of strings to analyse.

    Returns:
        list[list[str]]: For every input text, one entry per LOC/GPE token; a
        token preceded by "stadt", "kanton" or "region" (case-insensitive) is
        returned together with that preceding word.
    """
    place_labels = ("LOC", "GPE")
    qualifiers = ("stadt", "kanton", "region")

    def extract(text):
        doc = nlp_spacy(text)
        found = []
        for idx, tok in enumerate(doc):
            if tok.ent_type_ not in place_labels:
                continue
            previous = doc[idx - 1].text if idx > 0 else None
            if previous is not None and previous.lower() in qualifiers:
                found.append(f"{previous} {tok.text}")
            else:
                found.append(tok.text)
        return found

    return list(map(extract, texts))

In [36]:
import spacy
import json

def extract_with_spacy_custom(texts, patterns_file):
    """Extract locations using ``de_core_news_lg`` plus custom entity-ruler patterns.

    A fresh pipeline is loaded on every call and an entity ruler holding the
    patterns from ``patterns_file`` is inserted before the ``ner`` component.

    Args:
        texts: Iterable of strings to analyse.
        patterns_file: Path to a JSONL file with one pattern per line.

    Returns:
        list[list[str]]: For every input text, one entry per LOC/GPE token; a
        token preceded by "stadt", "kanton" or "region" (case-insensitive) is
        returned together with that preceding word.
    """
    nlp = spacy.load("de_core_news_lg")
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # Feed the custom place patterns into the ruler.
    with open(patterns_file, "r", encoding="utf-8") as f:
        patterns = [json.loads(line) for line in f]
    ruler.add_patterns(patterns)

    place_labels = ("LOC", "GPE")
    qualifiers = ("stadt", "kanton", "region")

    def extract(text):
        doc = nlp(text)
        found = []
        for idx, tok in enumerate(doc):
            if tok.ent_type_ not in place_labels:
                continue
            previous = doc[idx - 1].text if idx > 0 else None
            if previous is not None and previous.lower() in qualifiers:
                found.append(f"{previous} {tok.text}")
            else:
                found.append(tok.text)
        return found

    return list(map(extract, texts))


In [6]:
# --- Step 4: transformers NER ---
def extract_with_transformers_batch(texts):
    """Run the ``dslim/bert-large-NER`` pipeline over each text.

    The tokenizer, model and pipeline are created inside the function on every
    call.

    Args:
        texts: Iterable of strings to analyse.

    Returns:
        list[list[str]]: For every input text, the ``word`` of each grouped
        entity whose ``entity_group`` is ``LOC``, ``ORG`` or ``PER``.
    """
    tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
    model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
    # aggregation_strategy="simple" replaces the deprecated grouped_entities=True
    # flag and groups word pieces into whole entities in the same way.
    nlp_bert = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    all_results = []
    for text in texts:
        # NOTE(review): str.capitalize() upper-cases the first character and
        # lower-cases all others, so any capitalised names inside the text lose
        # their casing — confirm this is intended for a cased model.
        cleaned_text = text.strip().capitalize()
        results = nlp_bert(cleaned_text)
        # NOTE(review): ORG and PER groups are kept alongside LOC — confirm that
        # non-location entities should count as locations here.
        locations = [ent["word"] for ent in results if ent["entity_group"] in ["LOC", "ORG", "PER"]]
        all_results.append(locations)
    return all_results


In [61]:
import pandas as pd
import re

def simple_match_all(texts, ort_csv=None):
    """Find place names in each text by case-insensitive whole-phrase matching.

    The distinct, lower-cased ``NAME`` values of a semicolon-separated CSV are
    matched against every lower-cased text. A name only counts when it is
    delimited by whitespace or by the start/end of the text.

    Args:
        texts: Iterable of strings to search.
        ort_csv: Path of the names CSV. Defaults to the local swissNAMES3D
            export that used to be hard-coded here.

    Returns:
        list[list[str]]: For every input text, the matching lower-cased names
        in the order they first appear in the CSV.
    """
    if ort_csv is None:
        ort_csv = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\swissnames3d_2024_2056\swissNAMES3D_PLY.csv"

    # The names are loaded once per call, not once per text.
    ort_df = pd.read_csv(ort_csv, sep=";")
    print(f"Ort-names loaded from {ort_csv}")
    filtered_ort_names = ort_df["NAME"].dropna().str.lower().unique().tolist()
    # filtered_ort_names = ort_df[ort_df["OBJEKTART"] == "Ort"]["NAME"].dropna().str.lower().unique().tolist()

    # Whitespace-delimited match: no non-space character directly before or after.
    regex_list = [re.compile(rf"(?<!\S){re.escape(place)}(?!\S)") for place in filtered_ort_names]

    def match(text):
        text_lower = text.lower()
        # The substring test is a cheap necessary condition that skips the regex
        # for the vast majority of names; the regex then enforces the boundaries.
        return [
            place
            for place, pattern in zip(filtered_ort_names, regex_list)
            if place in text_lower and pattern.search(text_lower)
        ]

    return [match(text) for text in texts]


In [7]:
# --- Step 5: Highlight helper ---
def highlight_matches(text, *other_lists):
    """Build a rich ``Text`` in which entries also found elsewhere are bold green.

    Args:
        text: Comma-separated entries; surrounding whitespace is stripped and
            empty entries are dropped.
        *other_lists: Iterables of strings to compare against; the comparison
            is case-insensitive on the stripped strings.

    Returns:
        rich.text.Text: The entries joined by ", ", with every entry that occurs
        in any of ``other_lists`` styled "bold green".
    """
    entries = [part.strip() for part in text.split(",") if part.strip()]
    known = {candidate.strip().lower() for col in other_lists for candidate in col}

    highlighted = Text()
    for position, entry in enumerate(entries):
        # Separator goes before every entry except the first.
        if position:
            highlighted.append(", ")
        if entry.lower() in known:
            highlighted.append(entry, style="bold green")
        else:
            highlighted.append(entry)
    return highlighted

In [17]:

# --- Step 6: Display results ---
def display_table(df, locs_with, locs_without, locs_transformers,simple_search):
    """Print a rich table comparing the locations found by each method.

    Entries of a column are highlighted when another method found them too.

    Args:
        df: DataFrame providing ``dataset_title_DE_preprocessed`` (optional)
            and ``dataset_description_DE_preprocessed`` per row.
        locs_with: Per-row location lists from spaCy with custom patterns.
        locs_without: Per-row location lists from spaCy without custom patterns.
        locs_transformers: Per-row entity lists from the transformers pipeline.
        simple_search: Per-row name lists from the plain text search.

    Returns:
        None. The table is printed to a rich console.
    """
    console = Console(width=300)
    table = Table(show_header=True, header_style="bold red", show_lines=True)
    table.add_column("Title", width=30)
    # This column shows the description text, so it is labelled accordingly.
    table.add_column("Description", width=30)
    table.add_column("LOCs (with)", width=25)
    table.add_column("LOCs (without)", width=25)
    table.add_column("LOCs (transformers)", width=25)
    table.add_column("Simple search", width=25)

    # Every list is indexed below, so the shortest one bounds the row count.
    row_count = min(len(df), len(locs_with), len(locs_without), len(locs_transformers), len(simple_search))
    for i in range(row_count):
        row = df.iloc[i]
        table.add_row(
            str(row.get("dataset_title_DE_preprocessed", "")),
            str(row["dataset_description_DE_preprocessed"]),
            highlight_matches(", ".join(locs_with[i]), locs_without[i], locs_transformers[i]),
            highlight_matches(", ".join(locs_without[i]), locs_with[i], locs_transformers[i]),
            highlight_matches(", ".join(locs_transformers[i]), locs_with[i], locs_without[i]),
            # Compare against the OTHER methods only; comparing the column with
            # itself highlighted every entry.
            highlight_matches(", ".join(simple_search[i]), locs_with[i], locs_without[i], locs_transformers[i])
        )
    console.print(table)

In [37]:
# Run configuration for the comparison.
number_of_samples = 100
patterns_file = "gpe_patterns.jsonl"
# Raw string, so each separator is a single backslash; doubling them inside
# r"..." produced a path with literal double backslashes.
csv_path = r"C:\Users\fabia\OneDrive - FHNW\03_FHNW\06_6_Semester\03_6000_BTh\90_Daten_Zwischenablage\02_extracted_keywords_data.csv"

# Sample the dataset and collect the description texts that every method analyses.
sampled_df = load_sample(csv_path, number_of_samples)
texts = sampled_df["dataset_description_DE_preprocessed"].astype(str).tolist()

  df = pd.read_csv(csv_path)


In [38]:
# spaCy with the custom place patterns from `patterns_file`.
locs_with = extract_with_spacy_custom(texts, patterns_file)

In [51]:
# spaCy without custom patterns.
locs_without = extract_with_spacy_default(texts)

In [40]:
# Transformers NER pipeline on the same texts.
locs_transformers = extract_with_transformers_batch(texts)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [63]:
# Plain text search for the place names from the names CSV.
simple_search = simple_match_all(texts)


In [19]:
# Repeats the custom-pattern extraction and rebinds `locs_with`.
# NOTE(review): the output recorded for this cell is an E007 "'entity_ruler' already
# exists in pipeline" error; the extract_with_spacy_custom shown in this notebook loads
# a fresh pipeline per call, so the error presumably stems from an earlier version — confirm.
locs_with = extract_with_spacy_custom(texts, patterns_file)

ValueError: [E007] 'entity_ruler' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'morphologizer', 'parser', 'lemmatizer', 'senter', 'attribute_ruler', 'entity_ruler', 'ner']

In [64]:
# Render the side-by-side comparison of all four methods.
display_table(sampled_df, locs_with, locs_without, locs_transformers, simple_search)