In [1]:
import json
import random
from datetime import date
from pathlib import Path

import regex as re
from rulechef import Rule, RuleChef, Task, TaskType
from rulechef.core import RuleFormat
from rulechef.executor import RuleExecutor

from clear_anonymization.extractors import factory
from clear_anonymization.extractors.base import BaseExtractor
from clear_anonymization.extractors.ruleextractor import RuleExtractor
from clear_anonymization.models.rulelearner import RuleChefLearner
from clear_anonymization.ner_datasets.ner_dataset import NERData, NERSample

In [2]:
def sample_data(samples, allowed_classes, k=6, seed=12):
    """Split labelled samples into relevant and negative examples and draw up to ``k`` of each.

    Args:
        samples: Iterable of objects exposing ``text`` and ``labels``; each
            label is a dict with the keys ``text``, ``start``, ``end`` and
            ``class``.
        allowed_classes: One class name (str) or a collection of class names
            whose spans count as relevant. Spans of any other class are
            collected as negatives.
        k: Maximum number of examples drawn for each of the two result lists.
        seed: Value passed to ``random.seed`` so the draw is reproducible.

    Returns:
        ``(relevant, negative)``: two lists of ``{"text": ..., "entities": [...]}``
        dicts, each holding at most ``k`` randomly drawn items. A sample that
        carries both relevant and other-class labels can appear in both lists.
    """
    # A bare string would turn `in` into a substring test ("OR" in "ORG" is
    # True), so a single class name is wrapped into a set first.
    if isinstance(allowed_classes, str):
        allowed_classes = {allowed_classes}

    # Seeds the module-level generator, as the original did.
    random.seed(seed)
    relevant_samples = []
    negative_samples = []
    for sample in samples:
        relevant_spans = []
        negative_spans = []

        for label in sample.labels:
            span = {
                "text": label["text"],
                "start": label["start"],
                "end": label["end"],
                "type": label["class"],
            }
            if label["class"] in allowed_classes:
                relevant_spans.append(span)
            else:
                negative_spans.append(span)

        if relevant_spans:
            relevant_samples.append({"text": sample.text, "entities": relevant_spans})
        if negative_spans:
            negative_samples.append({"text": sample.text, "entities": negative_spans})

    # Clamp each draw to the population size: random.sample raises ValueError
    # when asked for more items than are available.
    return (
        random.sample(relevant_samples, min(k, len(relevant_samples))),
        random.sample(negative_samples, min(k, len(negative_samples))),
    )

In [3]:
def print_rule_result(rule):
    """Show a rule's description and its compiled pattern.

    Prints the literal tag ``CONTENT`` followed by ``rule.description``, then
    compiles ``rule.content`` with the ``re`` alias in scope and prints the
    resulting pattern object. A compilation error propagates to the caller
    after the first line has been printed.
    """
    description = rule.description
    print("CONTENT", description)
    print(re.compile(rule.content))

In [4]:
# Data from the LER dataset
input_dir = Path("data/ler/ler_data.json")  # note: this is the JSON file itself, not a directory
# Decode explicitly as UTF-8 so the German text does not depend on the
# platform's default encoding.
data = NERData.from_json(json.loads(input_dir.read_text(encoding="utf-8")))
train_samples = [s for s in data.samples if s.split == "train"]

# The "validation" split is what this notebook calls the test set.
test_samples = [s for s in data.samples if s.split == "validation"]

# Pass the class as a one-element list: with a bare string, the `in` check in
# sample_data would be a substring test rather than a class-name comparison.
sampled, negative = sample_data(train_samples, ["ORG"])

In [5]:
# My Examples
#
# Umlauts and typographic quotes were mis-encoded in these literals, which put
# every entity offset off. They are restored so that text[start:end] equals
# the entity text.

# Positive examples: sentences with the ORG span that should be extracted.
positive_examples = [
    {
        "text": "Frau Müller arbeitet beim Bundesverfassungsgericht in Wien.",
        "entities": [
            {"text": "Bundesverfassungsgericht", "start": 26, "end": 50, "type": "ORG"}
        ],
    },
    {
        "text": "Nadia studiert an der Technische Universität Wien.",
        "entities": [
            {
                "text": "Technische Universität Wien",
                "start": 22,
                "end": 49,
                "type": "ORG",
            }
        ],
    },
    {
        "text": "Im Juli 2009 gründete der Angeklagte den Verein „ Neudeutschland “ mit ihm als „ unabwählbarem “ ersten Vorstand.",
        "entities": [
            {"text": "Verein „ Neudeutschland “", "start": 41, "end": 66, "type": "ORG"}
        ],
    },
]

# Negative examples: spans of other classes that must not be extracted as ORG.
# NOTE(review): the class set printed by the learner contains 'PERS', while
# this example uses 'PER' -- confirm which label the learner expects.
negative_examples = [
    {
        "text": "Herr Rules kommt heute besuchen in Berlin.",
        "entities": [
            {"text": "Herr Rules", "start": 0, "end": 10, "type": "PER"},
            {"text": "Berlin", "start": 35, "end": 41, "type": "LOC"},
        ],
    },
]

#### Manual rules

In [None]:
# Hand-written baseline rules for ORG, stored in the same JSON layout that
# Rule.to_dict() produces.
rules_manual = [
    {
        "id": "m1",
        "name": "german_corporate_suffix",
        "description": "matches german corporate suffix",
        "format": RuleFormat.REGEX,
        # "GbR" is the usual spelling of the legal form; "Gbr" is kept so that
        # everything the previous pattern matched still matches.
        "content": "\\b([A-Z][a-z0-9]+(?:\\s+[A-Z][a-z]+)*)\\s+(AG|GmbH|KG|UG|OHG|Gbr|GbR)\\b",
        "priority": 5,
        "confidence": 0.5,
        "times_applied": 0,
        "successes": 0,
        "failures": 0,
        "created_at": date.fromisoformat("2026-01-29"),
    },
    {
        "id": "m2",
        "name": "university_clubs",
        "description": "matches university names",
        "format": RuleFormat.REGEX,
        # Umlauts in the character classes and in "Universität" were
        # mis-encoded; restored so the pattern can match real German text.
        "content": "\\b(?:[A-ZÄÖÜa-zäöü0-9]+\\s+)?(Universität|Fachhochschule|Uni|Verein|Partei)(?:\\s+[A-ZÄÖÜ][a-zäöüA-ZÄÖÜ]+)*\\b",
        "priority": 5,
        "confidence": 0.5,
        "times_applied": 0,
        "successes": 0,
        "failures": 0,
        "created_at": date.fromisoformat("2026-01-29"),
    },
    {
        # Was "m2", which duplicated the id of the rule above.
        "id": "m3",
        "name": "gerichte",
        "description": "court_names",
        "format": RuleFormat.REGEX,
        "content": "\\b[A-ZÄÖÜ][a-zäöüA-ZÄÖÜ]+gericht(?:\\s+[A-ZÄÖÜ][a-zäöüA-ZÄÖÜ]+)*\\b",
        "priority": 5,
        "confidence": 0.5,
        "times_applied": 0,
        "successes": 0,
        "failures": 0,
        "created_at": date.fromisoformat("2026-01-29"),
    },
]

manual_rules_path = Path("./rulechef_data/ORG_manual_rules.json")
# Create the output directory on a fresh checkout instead of failing in open().
manual_rules_path.parent.mkdir(parents=True, exist_ok=True)
with manual_rules_path.open("w", encoding="utf-8") as f:
    json.dump(
        [Rule(**r).to_dict() for r in rules_manual],
        f,
        indent=2,
        ensure_ascii=False,
        # default=str serialises values json cannot encode natively
        # (e.g. the date objects, if to_dict() keeps them).
        default=str,
    )

#### Added negative examples, plus instructions not to follow those examples.

In [18]:
# Run 1: fit an ORG rule learner on the hand-written positive and negative
# examples; `rule_file` names this run's rule set.
rule_learner = RuleChefLearner(
    rule_file="2025-02-02_ORG_ler_myexamp",
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    lang="de",
    allowed_classes="ORG",
)
rule_learner.fit(positive_examples, negative_examples)

{'NRM', 'LIT', 'PERS', 'REG', 'ORG', 'LOC', 'RS'}
‚úì Added example (buffer: 1 new, 1 total)
‚úì Added example (buffer: 2 new, 2 total)
‚úì Added example (buffer: 3 new, 3 total)
‚úó Added negative example (buffer: 4 new, 4 total)

üì• Converting 4 buffered examples to dataset...
   (0 corrections, 0 LLM, 4 human)
‚úì Converted to dataset: 0 corrections, 4 examples

Learning rules from 4 training items
  Corrections: 0 (high value)
  Examples: 4
  Mode: Synthesis + Refinement (max 3 iterations)

de
NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from German text 
Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (3 shown):
- Learn rules from these examples:

Input: {"text": "Frau M\u00fcller arbeitet bei

In [16]:
# Load the rule file named after run 1's `rule_file` and print every rule.
rules_file = Path("./rulechef_data/2025-02-02_ORG_ler_myexamp.json")
# JSON text is UTF-8; decode explicitly so umlauts in the patterns do not
# depend on the platform's default encoding.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# `or []` keeps the cell from iterating over None when "rules" is absent or null.
rules = [Rule.from_dict(r) for r in rules_data.get("rules") or []]
for rule in rules:
    print_rule_result(rule)
    print("###########")

CONTENT Match companies that use standard corporate suffixes (GmbH, AG, KG, UG, OHG, Gbr, e.V., GmbH & Co. KG, etc.). High precision to avoid person-name false positives.
regex.Regex('\\b[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü-]*(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü-]*)*\\s+(?:GmbH(?:\\s*&\\s*Co\\.\\s*KG)?|AG|KG|UG|OHG|Gbr|e\\.V\\.?|eV)\\b', flags=regex.V0)
###########
CONTENT Match named courts and judicial bodies, including concatenated German compounds like Bundesverfassungsgericht.
regex.Regex('\\b(?:Bundesverfassungsgericht|Verfassungsgerichtshof|Bundesgerichtshof|Bundesgericht|Landesgericht|Amtsgericht|Verwaltungsgericht|Finanzgericht|Arbeitsgericht|Sozialgericht|Gericht)\\b', flags=regex.V0)
###########
CONTENT Match universities and related institutional names containing 'Universit√§t' or 'University' with up to a few surrounding capitalized tokens.
regex.Regex('\\b(?:[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü-]*(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü-]*){0,3}\\s+)?Universit(?:√§t|y)(?:\\s+[A-Z√Ñ√ñ√ú][\\

In [10]:
def predict(rules, text):
    """Apply ``rules`` to ``text`` and print the extractor's prediction.

    A "rulechef" extractor for the "ler" dataset is created through
    ``factory.make_extractor`` on every call; the text is handed over as
    ``{"text": text}``. The prediction is printed, nothing is returned.
    """
    extractor = factory.make_extractor("rulechef", dataset="ler", rules=rules)
    prediction = extractor.predict({"text": text})
    print(prediction)

In [32]:
# Probe: a court name followed by an abbreviation, plus a university name.
# Umlauts restored from mis-encoded text so offsets match the recorded output.
predict(
    rules,
    "Frau Müller kommt jetzt. Sie geht heute zum Bundesgericht Öst. Nadia studiert an der Technische Universität Wien.",
)

{'entities': [{'text': 'Bundesgericht', 'start': 44, 'end': 57, 'type': 'ORG'}, {'text': 'Technische Universit√§t Wien', 'start': 85, 'end': 112, 'type': 'ORG'}]}


In [33]:
# Probe: place names ("UNO City", "Donauzentrum"); "Nähe" restored from
# mis-encoded text.
predict(rules, "UNO City befindet sich in der Nähe von Donauzentrum.")

{'entities': [{'text': 'UNO', 'start': 0, 'end': 3, 'type': 'ORG'}]}


In [38]:
# Probe: two-word organisation name ending in the keyword "Organisation".
predict(rules, text="Max M ist Teil dieser Tierschutz Organisation.")

{'entities': [{'text': 'Tierschutz Organisation', 'start': 22, 'end': 45, 'type': 'ORG'}]}


In [42]:
# Probe: university name without a leading adjective ("Universität" restored
# from mis-encoded text so offsets match the recorded output).
predict(rules, "Anna studiert an der Universität Wien.")

{'entities': [{'text': 'Universit√§t Wien', 'start': 21, 'end': 37, 'type': 'ORG'}]}


In [45]:
# Probe: short company name with a legal suffix ("für" restored from
# mis-encoded text).
predict(rules, "Mn GmbH hat eine grosse Spende für Kinder in Not geleistet.")

{'entities': [{'text': 'Mn GmbH', 'start': 0, 'end': 7, 'type': 'ORG'}]}


In [46]:
# Probe: all-caps company name with a legal suffix. In the recorded run "XX"
# is additionally returned as its own ORG entity, overlapping "XX GmbH".
predict(
    rules, "XX GmbH hat eine grosse Spende für Kinder in Not geleistet."
)

{'entities': [{'text': 'XX GmbH', 'start': 0, 'end': 7, 'type': 'ORG'}, {'text': 'XX', 'start': 0, 'end': 2, 'type': 'ORG'}]}


#### Added XML tags such as `<negative>` and `<positive>` for clearer instructions

In [8]:
# Run 2: same examples and settings as before; only the rule set name changes.
rule_learner = RuleChefLearner(
    rule_file="2025-02-02_ORG_ler_myexamp_v2",
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    lang="de",
    allowed_classes="ORG",
)
rule_learner.fit(positive_examples, negative_examples)

{'REG', 'ORG', 'LOC', 'RS', 'PERS', 'NRM', 'LIT'}
‚úì Added example (buffer: 1 new, 1 total)
‚úì Added example (buffer: 2 new, 2 total)
‚úì Added example (buffer: 3 new, 3 total)
‚úó Added negative example (buffer: 4 new, 4 total)

üì• Converting 4 buffered examples to dataset...
   (0 corrections, 0 LLM, 4 human)
‚úì Converted to dataset: 0 corrections, 4 examples

Learning rules from 4 training items
  Corrections: 0 (high value)
  Examples: 4
  Mode: Synthesis + Refinement (max 3 iterations)

de
NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from German text 
Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (3 shown):
- Learn rules from the <positive>...</positive> examples:
<positive>

Input: {"tex

In [24]:
# Load the rule file named after run 2's `rule_file` and print every rule.
rules_file = Path("./rulechef_data/2025-02-02_ORG_ler_myexamp_v2.json")
# JSON text is UTF-8; decode explicitly so umlauts in the patterns do not
# depend on the platform's default encoding.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# `or []` keeps the cell from iterating over None when "rules" is absent or null.
rules = [Rule.from_dict(r) for r in rules_data.get("rules") or []]
for rule in rules:
    print_rule_result(rule)
    print("############")

CONTENT Match organization labels (Verein, Partei, Club, Verband, Initiative, etc.) followed by a quoted name. Accepts many Unicode quote characters and allows spaces inside quotes so full quoted name is captured (fixes the truncated-quote failure).
regex.Regex('\\b(?:Verein|Partei|Club|Organisation|Verband|Initiative|B√ºndnis|Freundeskreis|Gruppe)\\b\\s*(?:["\'\\u201c\\u201d\\u201e\\u201a\\u201b\\u2018\\u2019\\u00ab\\u00bb])\\s*[A-Z√Ñ√ñ√ú0-9][^"\'\\u201c\\u201d\\u201e\\u201a\\u201b\\u2018\\u2019\\u00ab\\u00bb]+?\\s*(?:["\'\\u201c\\u201d\\u201e\\u201a\\u201b\\u2018\\u2019\\u00ab\\u00bb])', flags=regex.V0)
############
CONTENT Match multi-word organization names that end with common corporate/legal suffixes (GmbH, AG, KG, e.V., KGaA, etc.).
regex.Regex('\\b([A-Z√Ñ√ñ√ú][A-Za-z√Ñ√ñ√ú√§√∂√º√ü0-9\\-&]*(?:\\s+[A-Z√Ñ√ñ√ú][A-Za-z√Ñ√ñ√ú√§√∂√º√ü0-9\\-&]*)*)\\s+(?:AG|GmbH|KG|UG|OHG|Gbr|GbR|e\\.V\\.|eV|SE|KGaA|&\\s*Co\\.\\s*KG|Co\\.\\s*KG)\\b', flags=regex.V0)
############
CONTENT Match institutio

In [18]:
# Probe: a court name followed by an abbreviation, plus a university name.
# Umlauts restored from mis-encoded text so offsets match the recorded output.
predict(
    rules,
    "Frau Müller kommt jetzt. Sie geht heute zum Bundesgericht Öst. Nadia studiert an der Technische Universität Wien.",
)

{'entities': [{'text': 'Technische Universit√§t Wien', 'start': 85, 'end': 112, 'type': 'ORG'}, {'text': 'Bundesgericht', 'start': 44, 'end': 57, 'type': 'ORG'}]}


In [19]:
# Probe: place names ("UNO City", "Donauzentrum"); "Nähe" restored from
# mis-encoded text.
predict(rules, "UNO City befindet sich in der Nähe von Donauzentrum.")

{}


In [36]:
# Probe: two-word organisation name ending in the keyword "Verband".
predict(rules, text="Max M ist Teil dieser Tierschutz Verband.")

{'entities': [{'text': 'Tierschutz Verband', 'start': 22, 'end': 40, 'type': 'ORG'}]}


In [37]:
# Probe: quoted name placed BEFORE the keyword "Partei". The recorded run finds
# nothing: the learned quoted-name pattern expects the keyword first and the
# quoted name after it.
predict(
    rules, "Max M ist Teil 'der Grüne' Partei"
)

{}


In [42]:
# Probe: keyword "Partei" followed by a quoted name ("Grüne" restored from
# mis-encoded text so offsets match the recorded output).
predict(rules, "Max M ist Teil der Partei 'Der Grüne'")

{'entities': [{'text': "Partei 'Der Gr√ºne'", 'start': 19, 'end': 37, 'type': 'ORG'}]}


In [43]:
# Probe: university name without a leading adjective. The recorded run finds
# nothing. NOTE(review): presumably this rule set requires a capitalised word
# before "Universität"; the rule text is truncated in the output above -- verify.
predict(
    rules, "Anna studiert an der Universität Wien."
)

{}


In [48]:
# Probe: all-caps company name with a legal suffix ("für" restored from
# mis-encoded text).
predict(rules, "MM GmbH hat eine grosse Spende für Kinder in Not geleistet.")

{'entities': [{'text': 'MM GmbH', 'start': 0, 'end': 7, 'type': 'ORG'}]}


#### Removed the XML tags and added an extra instruction to the prompt
Added instruction: "Every rule must be justified by POSITIVE examples AND must exclude NEGATIVE examples."


In [7]:
# Run 3: same examples and settings as before; only the rule set name changes.
rule_learner = RuleChefLearner(
    rule_file="2025-02-02_ORG_ler_myexamp_v3",
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    lang="de",
    allowed_classes="ORG",
)
rule_learner.fit(positive_examples, negative_examples)

{'PERS', 'LOC', 'NRM', 'RS', 'ORG', 'LIT', 'REG'}
‚úì Added example (buffer: 1 new, 1 total)
‚úì Added example (buffer: 2 new, 2 total)
‚úì Added example (buffer: 3 new, 3 total)
‚úó Added negative example (buffer: 4 new, 4 total)

üì• Converting 4 buffered examples to dataset...
   (0 corrections, 0 LLM, 4 human)
‚úì Converted to dataset: 0 corrections, 4 examples

Learning rules from 4 training items
  Corrections: 0 (high value)
  Examples: 4
  Mode: Synthesis + Refinement (max 3 iterations)

de
NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from German text 
Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (3 shown):
Learn rules from the these examples:

Input: {"text": "Frau M\u00fcller arbeitet b

In [8]:
# Load the rule file named after run 3's `rule_file` and print every rule.
rules_file = Path("./rulechef_data/2025-02-02_ORG_ler_myexamp_v3.json")
# JSON text is UTF-8; decode explicitly so umlauts in the patterns do not
# depend on the platform's default encoding.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# `or []` keeps the cell from iterating over None when "rules" is absent or null.
rules = [Rule.from_dict(r) for r in rules_data.get("rules") or []]
for rule in rules:
    print_rule_result(rule)
    print("####################")

CONTENT Match the exact word Bundesverfassungsgericht (federal constitutional court).
regex.Regex('\\bBundesverfassungsgericht\\b', flags=regex.V0)
####################
CONTENT Match compound names ending with 'Verfassungsgericht', e.g. Staatsverfassungsgericht.
regex.Regex('\\b[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü\\-]*Verfassungsgericht\\b', flags=regex.V0)
####################
CONTENT Match university names (single or two-word variants around 'Universit*').
regex.Regex('\\b(?:[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]+\\s+)?Universit\\w+(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]+)?\\b', flags=regex.V0)
####################
CONTENT Match corporate names with common German legal suffixes (AG, GmbH, KG, UG, OHG, Gbr, GmbH & Co. KG variants).
regex.Regex('\\b[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü\\-]*(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü\\-]*)*\\s+(?:AG|GmbH|KG|UG|OHG|Gbr|GmbH\\s*&\\s*Co\\.\\s*KG)\\b', flags=regex.V0)
####################
CONTENT Match 'Verein' followed by a quoted name using various quote characters (han

In [11]:
# Probe: a court name followed by an abbreviation, plus a university name.
# Umlauts restored from mis-encoded text so offsets match the recorded output.
# WRONG (author's verdict on the recorded result).
# NOTE(review): the reason is not recorded -- state which entity was expected.
predict(
    rules,
    "Frau Müller kommt jetzt. Sie geht heute zum Bundesgericht Öst. Nadia studiert an der Technische Universität Wien.",
)

{'entities': [{'text': 'Technische Universit√§t Wien', 'start': 85, 'end': 112, 'type': 'ORG'}, {'text': 'Bundesgericht', 'start': 44, 'end': 57, 'type': 'ORG'}]}


#### Removed this line from the prompt
"Every rule must be justified by POSITIVE examples AND must exclude NEGATIVE examples."


In [7]:
# Run 4: same examples and settings as before; only the rule set name changes.
rule_learner = RuleChefLearner(
    rule_file="2025-02-02_ORG_ler_myexamp_v4",
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    lang="de",
    allowed_classes="ORG",
)
rule_learner.fit(positive_examples, negative_examples)

{'LOC', 'NRM', 'PERS', 'LIT', 'RS', 'REG', 'ORG'}
‚úì Added example (buffer: 1 new, 1 total)
‚úì Added example (buffer: 2 new, 2 total)
‚úì Added example (buffer: 3 new, 3 total)
‚úó Added negative example (buffer: 4 new, 4 total)

üì• Converting 4 buffered examples to dataset...
   (0 corrections, 0 LLM, 4 human)
‚úì Converted to dataset: 0 corrections, 4 examples

Learning rules from 4 training items
  Corrections: 0 (high value)
  Examples: 4
  Mode: Synthesis + Refinement (max 3 iterations)

de
NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from German text 
Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (3 shown):
Learn rules from the these examples:

Input: {"text": "Frau M\u00fcller arbeitet b

In [8]:
# Load the rule file named after run 4's `rule_file` and print every rule.
rules_file = Path("./rulechef_data/2025-02-02_ORG_ler_myexamp_v4.json")
# JSON text is UTF-8; decode explicitly so umlauts in the patterns do not
# depend on the platform's default encoding.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# `or []` keeps the cell from iterating over None when "rules" is absent or null.
rules = [Rule.from_dict(r) for r in rules_data.get("rules") or []]
for rule in rules:
    print_rule_result(rule)
    print("####################")

CONTENT Match company names followed by common corporate legal suffixes, including variants like 'GmbH & Co. KG'.
regex.Regex('\\b([A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]*(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]*){0,4})\\s+(?:AG|GmbH(?:\\s*&\\s*Co\\.\\s*KG|&Co\\.KG|&Co\\. KG|\\.?\\s*Co\\.?\\s*KG)?|KG|UG|OHG|Gbr|SE|GmbH&Co\\.KG)\\b', flags=regex.V0)
####################
CONTENT Match universities, technical universities, Hochschulen and Akademien with following capitalized location or name parts (e.g. 'Technische Universit√§t Wien').
regex.Regex('\\b(?:Technische|Hochschule|Universit(?:√§t|y)|Akademie)(?:\\s+(?:[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü-]{1,60})){0,4}\\b', flags=regex.V0)
####################
CONTENT Match German court names that end with 'gericht' (case-insensitive endings with proper capitalization at start).
regex.Regex('\\b[A-Z√Ñ√ñ√ú][A-Za-z√§√∂√º√Ñ√ñ√ú√ü-]*gericht\\b', flags=regex.V0)
####################
CONTENT Match institution keywords (Partei, Stiftung, Institut, Beh√∂rde, Ministerium

In [20]:
# Probe: a court name not among the training examples ("Finanzgericht"), plus
# a university name. Umlauts restored from mis-encoded text so offsets match
# the recorded output.
predict(
    rules,
    "Frau Müller kommt jetzt. Sie geht heute zum Finanzgericht Öst. Nadia studiert an der Technische Universität Wien.",
)

{'entities': [{'text': 'Technische Universit√§t Wien', 'start': 85, 'end': 112, 'type': 'ORG'}, {'text': 'Finanzgericht', 'start': 44, 'end': 57, 'type': 'ORG'}]}


In [19]:
# Probe: place names plus a one-letter company name with a legal suffix
# ("Nähe" restored from mis-encoded text so offsets match the recorded output).
predict(
    rules,
    "UNO City befindet sich in der Nähe von Donauzentrum. Dort befindet sich auch A GmbH.",
)

{'entities': [{'text': 'A GmbH', 'start': 77, 'end': 83, 'type': 'ORG'}]}


In [14]:
# Probe: two-word organisation name ending in the keyword "Verband".
predict(rules, text="Max M ist Teil dieser Tierschutz Verband.")

{}


In [15]:
# Probe: quoted name placed BEFORE the keyword "Partei". The recorded run finds
# nothing. NOTE(review): presumably the learned pattern expects the keyword
# first and the quoted name after it; the rule text is truncated above -- verify.
predict(
    rules, "Max M ist Teil 'der Grüne' Partei"
)

{}


In [16]:
# Probe: keyword "Partei" followed by a quoted name ("Grüne" restored from
# mis-encoded text so offsets match the recorded output).
predict(rules, "Max M ist Teil der Partei 'Der Grüne'")

{'entities': [{'text': "Partei 'Der Gr√ºne'", 'start': 19, 'end': 37, 'type': 'ORG'}]}


In [17]:
# Probe: university name without a leading adjective. In the recorded run this
# rule set extracts "Universität Wien", so the earlier remark that a leading
# word is required does not apply here.
predict(
    rules, "Anna studiert an der Universität Wien."
)

{'entities': [{'text': 'Universit√§t Wien', 'start': 21, 'end': 37, 'type': 'ORG'}]}


In [21]:
# Probe: all-caps company name with the legal suffix "KG" ("für" restored from
# mis-encoded text).
predict(rules, "MM KG hat eine grosse Spende für Kinder in Not geleistet.")

{'entities': [{'text': 'MM KG', 'start': 0, 'end': 5, 'type': 'ORG'}]}




### Changed my negative example: it now reuses the first positive sentence and labels its non-ORG spans (PER, LOC)

In [45]:
# My Examples
#
# Umlauts and typographic quotes were mis-encoded in these literals, which put
# every entity offset off. They are restored so that text[start:end] equals
# the entity text.

# Positive examples: sentences with the ORG span that should be extracted.
positive_examples = [
    {
        "text": "Frau Müller arbeitet beim Bundesverfassungsgericht in Wien.",
        "entities": [
            {"text": "Bundesverfassungsgericht", "start": 26, "end": 50, "type": "ORG"}
        ],
    },
    {
        "text": "Nadia studiert an der Technische Universität Wien.",
        "entities": [
            {
                "text": "Technische Universität Wien",
                "start": 22,
                "end": 49,
                "type": "ORG",
            }
        ],
    },
    {
        "text": "Im Juli 2009 gründete der Angeklagte den Verein „ Neudeutschland “ mit ihm als „ unabwählbarem “ ersten Vorstand.",
        "entities": [
            {"text": "Verein „ Neudeutschland “", "start": 41, "end": 66, "type": "ORG"}
        ],
    },
]

# Negative example: the first positive sentence again, this time with its
# person and location spans labelled as things that are not ORG.
# NOTE(review): the class set printed by the learner contains 'PERS', while
# this example uses 'PER' -- confirm which label the learner expects.
negative_examples = [
    {
        "text": "Frau Müller arbeitet beim Bundesverfassungsgericht in Wien.",
        "entities": [
            {"text": "Frau Müller", "start": 0, "end": 11, "type": "PER"},
            {"text": "Wien", "start": 54, "end": 58, "type": "LOC"},
        ],
    },
]

In [46]:
# Run 5: same learner settings; fitted on the examples defined just above,
# including the changed negative example.
rule_learner = RuleChefLearner(
    rule_file="2025-02-02_ORG_ler_myexamp_v5",
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    lang="de",
    allowed_classes="ORG",
)
rule_learner.fit(positive_examples, negative_examples)

{'LOC', 'NRM', 'PERS', 'LIT', 'RS', 'REG', 'ORG'}
‚úì Added example (buffer: 1 new, 1 total)
‚úì Added example (buffer: 2 new, 2 total)
‚úì Added example (buffer: 3 new, 3 total)
‚úó Added negative example (buffer: 4 new, 4 total)

üì• Converting 4 buffered examples to dataset...
   (0 corrections, 0 LLM, 4 human)
‚úì Converted to dataset: 0 corrections, 4 examples

Learning rules from 4 training items
  Corrections: 0 (high value)
  Examples: 4
  Mode: Synthesis + Refinement (max 3 iterations)

de
NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from German text 
Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (3 shown):
Learn rules from the these examples:

Input: {"text": "Frau M\u00fcller arbeitet b

In [47]:
# Load the rule file named after run 5's `rule_file` and print every rule.
rules_file = Path("./rulechef_data/2025-02-02_ORG_ler_myexamp_v5.json")
# JSON text is UTF-8; decode explicitly so umlauts in the patterns do not
# depend on the platform's default encoding.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# `or []` keeps the cell from iterating over None when "rules" is absent or null.
rules = [Rule.from_dict(r) for r in rules_data.get("rules") or []]
for rule in rules:
    print_rule_result(rule)
    print("####################")

CONTENT Match organization labels (Verein, Partei, Firma, Institut, Beh√∂rde, Stiftung, Organisation, Unternehmen) followed by a quoted name (various quote characters). This captures examples like 'Verein ‚Äû Neudeutschland ‚Äú'.
regex.Regex('\\b(?:Verein|Partei|Organisation|Firma|Unternehmen|Institut|Beh√∂rde|Stiftung)\\s*[‚Äû"‚Äú‚Äù\'‚Äπ‚Ä∫]\\s*[^‚Äû"‚Äú‚Äù\'‚Äπ‚Ä∫]+?\\s*[‚Äû"‚Äú‚Äù\'‚Äπ‚Ä∫]', flags=regex.V0)
####################
CONTENT Match multi-word names that include a leading capitalized sequence and a following organizational keyword (Universit√§t, Hochschule, Institut, Akademie, Fakult√§t, Zentrum, Ministerium, Beh√∂rde, Parlament, Kammer, Stiftung, Fraktion, Regierung, Senat, Verein, Gericht). Allows trailing capitalized location or descriptor (e.g., 'Technische Universit√§t Wien').
regex.Regex('\\b(?:[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]+(?:\\s+[A-Z√Ñ√ñ√ú][\\w√§√∂√º√Ñ√ñ√ú√ü]+){0,3})\\s+(?:Universit√§t|Universitaet|Hochschule|Institut|Akademie|Fakult√§t|Zentrum|Ministerium|Beh√∂rd

In [48]:
# Probe: a court name followed by an abbreviation and a city, plus a university
# name. Umlauts restored from mis-encoded text so offsets match the recorded
# output.
predict(
    rules,
    "Frau Müller kommt jetzt. Sie geht heute zum Finanzgericht Öst in Wien. Nadia studiert an der Technische Universität Wien.",
)

{'entities': [{'text': 'Technische Universit√§t Wien', 'start': 93, 'end': 120, 'type': 'ORG'}]}


In [51]:
# Probe: place names plus a short company name with a legal suffix ("Nähe"
# restored from mis-encoded text so offsets match the recorded output).
predict(
    rules,
    "UNO City befindet sich in der Nähe von Donauzentrum. Dort befindet sich auch Aa GmbH.",
)

{'entities': [{'text': 'Aa GmbH', 'start': 77, 'end': 84, 'type': 'ORG'}]}


In [52]:
# Probe: keyword "Verein" followed by a name in plain single quotes.
predict(rules, text="Max M ist Teil des Verein 'Sport'.")

{'entities': [{'text': "Verein 'Sport'", 'start': 19, 'end': 33, 'type': 'ORG'}]}


In [54]:
# Probe: the bare keyword "Gericht" without a compound prefix.
predict(rules, text="Max M ist eine Angeklagte an Gericht des Finanz.")

{'entities': [{'text': 'Gericht', 'start': 29, 'end': 36, 'type': 'ORG'}]}
