In [1]:
import json
import random
from pathlib import Path

import regex as re
from rulechef import Rule, RuleChef, Task, TaskType
from rulechef.core import RuleFormat
from rulechef.executor import RuleExecutor

from clear_anonymization.extractors import factory
from clear_anonymization.extractors.base import BaseExtractor
from clear_anonymization.extractors.ruleextractor import RuleExtractor
from clear_anonymization.models.rulelearner import RuleChefLearner
from clear_anonymization.ner_datasets.ner_dataset import NERData, NERSample

In [2]:
# Data from the Ler dataset
# NOTE(review): the path below points at the "bfg" training file — confirm
# which dataset this cell is meant to load.
input_dir = Path("data/bfg/bfg_train.json")  # despite the name, this is a file path

# Decode explicitly as UTF-8 (assumes the file is UTF-8, the JSON interchange
# default) instead of relying on the platform locale, which can mangle
# non-ASCII characters on some systems.
data = NERData.from_json(json.loads(input_dir.read_text(encoding="utf-8")))

# The slice is applied before the split filter, so this keeps the "train"
# samples found among the first 15 entries (possibly fewer than 15).
train_samples = [s for s in data.samples[:15] if s.split == "train"]

test_samples = [s for s in data.samples if s.split == "validation"]

In [3]:
def print_rule_result(rule):
    """Print a rule's description and the compiled form of its pattern.

    Args:
        rule: Object exposing ``description`` and ``content`` attributes;
            ``content`` is compiled as a regular expression.

    Raises:
        Whatever ``re.compile`` raises when ``rule.content`` is not a valid
        pattern. The description line has already been printed by then.
    """
    # The label is "CONTENT" although the value shown is the description.
    print("CONTENT", rule.description)
    # Printing the compiled object shows its repr (pattern plus flags).
    print(re.compile(rule.content))

In [5]:
# Load the serialized rules from disk and print each one.
rules_file = Path("./rulechef_data/2026-02-16_qwen_qwen2.5-7B-Instruct_bfg_ORG_.json")
# Decode explicitly as UTF-8 (assumes the file is UTF-8, the JSON interchange
# default) so the result does not depend on the platform locale.
rules_data = json.loads(rules_file.read_text(encoding="utf-8"))

# One Rule object per entry under the "rules" key.
rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
for rule in rules:
    print_rule_result(rule)
    print("###########")  # visual separator between rules

CONTENT Matches exact organization names with specific capitalization.
regex.Regex('(?<=\\b)(?:FA\\s)?(Deutschlandsberg\\sLeibnitz\\sVoitsberg|Logbach-Bildung\\sAG|RheinMetall\\sTechnologien\\sGMBH|Finanzamt\\sInnsbruck|Mur\\sDonwerk\\sGMBH|Wenker\\sBau\\sGMBH|D\\u00fcfel\\sTechnik\\sKG|ZFGQ\\sPharma\\sGMBH|FA\\sLandeck\\sReutte|OberRecycling|Okur\\sAutomotive|Celikkanat\\sGarten|Finanzamt\\sGmunden\\sV\\u00f6cklabruck|Nowothnig\\sWind|S\\u00fcd\\sLemkel|Enns\\sWerkal\\sGMBH|StadtEnergie\\sHolding|Wald\\sBruckval\\sAG)(?=\\b)', flags=regex.V0)
###########
CONTENT Captures multi-word organizations with proper capitalization.
regex.Regex('(?<=\\b)([A-Z][a-z]+\\s+[A-Z][a-z]+\\s+GmbH|AG|KG)(?=\\b)', flags=regex.V0)
###########
CONTENT Captures single-word organizations with proper capitalization.
regex.Regex('(?<=\\b)([A-Z][a-z]+\\s*GmbH|AG|KG)(?=\\b)', flags=regex.V0)
###########
CONTENT Captures organizations based on contextual clues like 'Beschwerdeführerin', 'Bescheid', etc.
regex.Reg

In [6]:
def predict(rules, text):
    """Build a rulechef extractor for ``rules`` and print its prediction.

    Args:
        rules: Rules handed to ``factory.make_extractor`` via ``rules=``.
        text: Input string; wrapped as ``{"text": text}`` for the extractor.

    Returns:
        None. The prediction is printed, not returned.
    """
    # A new extractor is created on every call, always with dataset="ler".
    extractor = factory.make_extractor("rulechef", dataset="ler", rules=rules)
    sample = {"text": text}
    print(extractor.predict(sample))

In [11]:
# Probe sentence mentioning a person, a city and a "<Name> GmbH" company.
predict(
    rules=rules,
    text="Frau Müller kommt nach Wien. Sie geht heute zum Fa GmbH weil sie einen Termin hat.",
)

{'entities': [{'text': 'Fa GmbH', 'start': 48, 'end': 55, 'type': 'organisation'}]}


In [17]:
# Probe sentence containing "Finanzamt Wien".
predict(
    rules=rules,
    text="Die Behörde Finanzamt Wien prüft die Unterlagen.",
)

{'entities': [{'text': 'Finanzamt Wien', 'start': 12, 'end': 26, 'type': 'organisation'}]}


In [22]:
# Probe sentence with two organisation names in one sentence.
# NOTE(review): the recorded output lists "RheinMetall" a second time as a
# span nested inside the full company name — overlapping matches do not
# appear to be merged; confirm whether that is intended.
predict(
    rules=rules,
    text="Die FA Landeck Reutte arbeitet mit RheinMetall Technologien GMBH.",
)

{'entities': [{'text': 'FA Landeck Reutte', 'start': 4, 'end': 21, 'type': 'organisation'}, {'text': 'RheinMetall Technologien GMBH', 'start': 35, 'end': 64, 'type': 'organisation'}, {'text': 'RheinMetall', 'start': 35, 'end': 46, 'type': 'organisation'}]}
