In [1]:
from clear_anonymization.models.rulelearner import RuleChefLearner
from clear_anonymization.extractors.ruleextractor import RuleExtractor
from pathlib import Path
import regex as re
from rulechef import RuleChef, Task, TaskType, Rule
from rulechef.core import RuleFormat
import random
from rulechef.executor import RuleExecutor
from clear_anonymization.ner_datasets.ner_dataset import NERData, NERSample
from clear_anonymization.extractors import factory
from clear_anonymization.extractors.base import BaseExtractor

In [2]:
def sample_data(samples, allowed_classes, k=6, seed=12):
    random.seed(seed)
    relevant_samples = []
    negative_samples = []
    for sample in samples:
        relevant_spans = []
        negative_spans = []

        for label in sample.labels:
            if label["class"] in allowed_classes:
                relevant_spans.append(
                    {
                        "text": label["text"],
                        "start": label["start"],
                        "end": label["end"],
                        "type": label["class"],
                    }
                )
            else:
                negative_spans.append(
                    {
                        "text": label["text"],
                        "start": label["start"],
                        "end": label["end"],
                        "type": label["class"],
                    }
                )

        if relevant_spans:
            relevant_samples.append(
                {
                    "text": sample.text,
                    "entities": list(relevant_spans),
                }
            )
        if negative_spans:
            negative_samples.append(
                {
                    "text": sample.text,
                    "entities": list(negative_spans),
                }
            )

    k = min(k, len(relevant_samples))

    return random.sample(relevant_samples, k), random.sample(negative_samples, 3)

In [3]:
def print_rule_result(rule, text):
    print("RULE")
    print(rule.content)
    pattern = re.compile(rule.content)
    print(pattern)
    print("######")
    print("RESULT")
    for match in pattern.finditer(text):
        match_text = match.group()
        print(match.group())
        if match_text:
            print(match_text)
        else:
            print("NO result")

    print("-----------------------")

In [4]:
input_dir = Path("data/ler/ler_data.json")
data = NERData.from_json(json.loads(input_dir.read_text()))
train_samples = [s for s in data.samples if s.split == "train"]

test_samples = [s for s in data.samples if s.split == "validation"]

sampled, negative = sample_data(train_samples, "ORG")
sampled

[{'text': 'cc ) Eine die erhÃ¶hte Beweiskraft bewirkende Rechtsgrundlage fÃ¼r die Ausstellung der Zertifikate lÃ¤sst sich ebenso wenig der am 31. Oktober 2005 zwischen dem Bundesministerium fÃ¼r Verbraucherschutz , ErnÃ¤hrung und Landwirtschaft und dem ukrainischen VeterinÃ¤rdienst getroffenen Ãœbereinkunft Ã¼ber eine " VeterinÃ¤rbescheinigung fÃ¼r die Ausfuhr von frischem Schweinefleisch aus der Bundesrepublik Deutschland in die Ukraine " oder den beiden E-Mails des Bundesministeriums vom 4. November 2005 und vom 13. MÃ¤rz 2011 entnehmen .',
  'entities': [{'text': 'Bundesministerium fÃ¼r Verbraucherschutz , ErnÃ¤hrung und Landwirtschaft',
    'start': 158,
    'end': 228,
    'type': 'ORG'}]},
 {'text': 'Nach der Rechtsprechung des EuropÃ¤ischen Gerichtshofs fÃ¼r Menschenrechte sei ein VerstoÃŸ gegen Art. 2 EMRK durch die VerhÃ¤ngung der Todesstrafe erst dann gegeben , wenn die ernsthafte Gefahr einer Vollstreckung bestehe .',
  'entities': [{'text': 'EuropÃ¤ischen Gerichtshofs fÃ¼r 

In [5]:
negative

[{'text': 'Aufgrund der dargelegten Sachlage hÃ¤tte die PrÃ¼fungsstelle die Unwirksamkeit der Zustellungen erkennen kÃ¶nnen , insbesondere nachdem sie durch die Mitteilung von Patentanwalt B1 â€¦ vom 2. Mai 2013 Kenntnis von dem Bescheid der Patentanwaltskammer vom 4. April 2013 und damit von der Tatsache erhalten hat , dass die Kanzlei des beigeordneten Patentanwalts K â€¦ jedenfalls zum Zeit- punkt der vermeintlichen Zustellung der FristverlÃ¤ngerung mit BeschlussankÃ¼ndigung am 14. MÃ¤rz 2013 schon seit einiger Zeit verwaist war .',
  'entities': [{'text': 'B1 â€¦', 'start': 175, 'end': 179, 'type': 'PER'},
   {'text': 'K â€¦', 'start': 355, 'end': 358, 'type': 'PER'}]},
 {'text': 'D10 Khalil , M. S. : International Research and Development Trends and Problems of HVDC Cables with Polymeric Insulation .',
  'entities': [{'text': 'Khalil , M. S.',
    'start': 4,
    'end': 18,
    'type': 'PER'}]},
 {'text': 'Eng damit zusammen hÃ¤ngt das ebenfalls aus Art 103 Abs 1 GG folgende Verbo

In [6]:
rule_learner = RuleChefLearner(
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    allowed_classes="ORG",
    rule_file="gpt5-mini_ORG_negative",
)

{'PERS', 'LOC', 'LIT', 'RS', 'NRM', 'REG', 'ORG'}


In [7]:
rule_learner.fit(sampled, negative)

âœ“ Added example (buffer: 1 new, 1 total)
âœ“ Added example (buffer: 2 new, 2 total)
âœ“ Added example (buffer: 3 new, 3 total)
âœ“ Added example (buffer: 4 new, 4 total)
âœ“ Added example (buffer: 5 new, 5 total)
âœ“ Added example (buffer: 6 new, 6 total)
âœ— Added negative example (buffer: 7 new, 7 total)
âœ— Added negative example (buffer: 8 new, 8 total)
âœ— Added negative example (buffer: 9 new, 9 total)

ðŸ“¥ Converting 9 buffered examples to dataset...
   (0 corrections, 0 LLM, 9 human)
âœ“ Converted to dataset: 0 corrections, 9 examples

Learning rules from 9 training items
  Corrections: 0 (high value)
  Examples: 9
  Mode: Synthesis + Refinement (max 3 iterations)

NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from text

Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: in

In [8]:
rules_file = Path("./rulechef_data/gpt5-mini_ORG_negative.json")
rules_data = json.loads(rules_file.read_text())

rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
for rule in rules:
    print(rule.description)
    print(rule.content)
    print("####################")

Exact match for 'Bundesministerium fÃ¼r Verbraucherschutz, ErnÃ¤hrung und Landwirtschaft' (handles Ã¼/ae variants and optional comma).
\bBundesministerium\s+f(?:Ã¼|u)r\s+Verbraucherschutz\s*,?\s*(?:Ern(?:Ã¤|ae)hrung)\s+und\s+Landwirtschaft\b
####################
Exact match for 'EuropÃ¤ischen Gerichtshofs fÃ¼r Menschenrechte' (umlaut/ae variants).
\bEurop(?:ae|Ã¤)ischen\s+Gerichtshofs\s+f(?:Ã¼|u)r\s+Menschenrechte\b
####################
Match 'Markenstelle fÃ¼r Klasse <num> des Deutschen Patent- und Markenamts' stopping before dates (no trailing 'vom ...').
\bMarkenstelle\s+f(?:Ã¼|u)r\s+Klasse\s*\d+\s+des\s+Deutschen\s+Patent-\s*und\s+Markenamts\b(?=(?:[\.,;:]|\s+vom\b|\s*$))
####################
Exact token match for 'Bundesgerichtshof'.
\bBundesgerichtshof\b
####################
Match 'Patentanwaltskammer' with up to 3 preceding capitalized words (e.g., regional name) but do not include trailing 'vom' or dates.
\b(?:[A-ZÃ„Ã–Ãœ][\wÃ„Ã–ÃœÃ¤Ã¶Ã¼ÃŸ\-']*(?:\s+[A-ZÃ„Ã–Ãœ][\wÃ„Ã–ÃœÃ¤Ã¶Ã¼ÃŸ\

In [15]:
rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
# print(rules)

RuleChefExtractor = factory.make_extractor(
    "rulechef",
    dataset="ler",
    rules=rules,
)

print(
    RuleChefExtractor.predict(
        {
            "text": "Nadia studiert an der Technische UniversitÃ¤t. Sie arbeitet beim AG GmbH."
        }
    )
)

{'entities': [{'text': 'AG GmbH', 'start': 64, 'end': 71, 'type': 'ORG'}, {'text': 'Technische UniversitÃ¤t', 'start': 22, 'end': 44, 'type': 'ORG'}]}


In [26]:
print(
    RuleChefExtractor.predict(
        {"text": "Frau X kommt heute nach Wien und geht zum Sport Verein"}
    )
)

{'entities': [{'text': 'Sport Verein', 'start': 42, 'end': 54, 'type': 'ORG'}]}


In [22]:
print(RuleChefExtractor.predict({"text": "Nadia arbeitet beim AG gmbh."}))
pred = True
if pred:
    for test in test_samples:
        print(test.labels)
        result = RuleChefExtractor.predict({"text": test.text})
        print(f"Output: {result}")
        print("---------------------------------------")
        print("---------------------------------------")
        print("---------------------------------------")

{}
[]
Output: {}
---------------------------------------
---------------------------------------
---------------------------------------
[{'start': 102, 'end': 126, 'text': 'Â§ 14 Abs. 2 Satz 2 TzBfG', 'class': 'NRM'}]
Output: {}
---------------------------------------
---------------------------------------
---------------------------------------
[]
Output: {}
---------------------------------------
---------------------------------------
---------------------------------------
[{'start': 111, 'end': 128, 'text': 'Bundesgerichtshof', 'class': 'ORG'}]
Output: {'entities': [{'text': 'Bundesgerichtshof', 'start': 111, 'end': 128, 'type': 'ORG'}]}
---------------------------------------
---------------------------------------
---------------------------------------
[]
Output: {}
---------------------------------------
---------------------------------------
---------------------------------------
[]
Output: {}
---------------------------------------
---------------------------------------

In [16]:
rule_learner = RuleChefLearner(
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    allowed_classes="ORG",
    rule_file="gpt5-mini_ORG_negative_v2",
)

{'LIT', 'ORG', 'REG', 'RS', 'LOC', 'PERS', 'NRM'}


In [17]:
rule_learner.fit(sampled, negative)

âœ“ Added example (buffer: 1 new, 1 total)
âœ“ Added example (buffer: 2 new, 2 total)
âœ“ Added example (buffer: 3 new, 3 total)
âœ“ Added example (buffer: 4 new, 4 total)
âœ“ Added example (buffer: 5 new, 5 total)
âœ“ Added example (buffer: 6 new, 6 total)
âœ— Added negative example (buffer: 7 new, 7 total)
âœ— Added negative example (buffer: 8 new, 8 total)
âœ— Added negative example (buffer: 9 new, 9 total)
âœ— Added negative example (buffer: 10 new, 10 total)
âœ— Added negative example (buffer: 11 new, 11 total)
âœ— Added negative example (buffer: 12 new, 12 total)

ðŸ“¥ Converting 12 buffered examples to dataset...
   (0 corrections, 0 LLM, 12 human)
âœ“ Converted to dataset: 0 corrections, 12 examples

Learning rules from 12 training items
  Corrections: 0 (high value)
  Examples: 12
  Mode: Synthesis + Refinement (max 3 iterations)

NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from text

In

In [20]:
rules_file = Path("./rulechef_data/gpt5-mini_ORG_negative_v2.json")
rules_data = json.loads(rules_file.read_text())

rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
for rule in rules:
    print(rule.description)
    print(rule.content)
    print("####################")

Match the full name 'Bundesministerium fÃ¼r Verbraucherschutz, ErnÃ¤hrung und Landwirtschaft' (variants for umlauts and optional comma) as a single ORG to avoid partial duplicates.
\bBundesministerium\s+f(?:u|\u00fc)r\s+Verbraucherschutz\s*,?\s*Ern(?:\u00e4|a)hrung\s+und\s+Landwirtschaft\b
####################
Match full 'Markenstelle fÃ¼r Klasse <n> des Deutschen Patent- und Markenamts' as ORG to avoid the partial 'Markenstelle' duplicate.
\bMarkenstelle\s+f(?:u|\u00fc)r\s+Klasse\s+\d+\s+des\s+Deutschen\s+Patent-?\s*und\s+Markenamts\b
####################
Match quoted organization names when an explicit organization indicator (Verein/Partei/Firma/.../Ministerium) appears immediately before the quote. This avoids matching quoted document titles which lack such indicators.
\b(?:Verein|Partei|Firma|Stiftung|Institut|Beh\u00f6rde|Behorde|Amt|Kammer|Konzern|Unternehmen|Gesellschaft|Bundesministerium|Ministerium)\s*(?:[:\s,]*)?(?:"|\u201e|\u201c)\s*[^"\u201e\u201c\u201d\u2019\u2018]{1,200}?

In [4]:
rule_learner = RuleChefLearner(
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    allowed_classes="ORG",
    rule_file="gpt5-mini_ORG_newprompt_v2",
)

rule_learner.fit(sampled)

{'PERS', 'RS', 'REG', 'NRM', 'ORG', 'LIT', 'LOC'}
âœ“ Loaded dataset: 0 corrections, 10 examples
âœ“ Added example (buffer: 1 new, 1 total)
âœ“ Added example (buffer: 2 new, 2 total)
âœ“ Added example (buffer: 3 new, 3 total)
âœ“ Added example (buffer: 4 new, 4 total)
âœ“ Added example (buffer: 5 new, 5 total)

ðŸ“¥ Converting 5 buffered examples to dataset...
   (0 corrections, 0 LLM, 5 human)
âœ“ Converted to dataset: 0 corrections, 15 examples

Learning rules from 15 training items
  Corrections: 0 (high value)
  Examples: 15
  Mode: Synthesis + Refinement (max 3 iterations)

NEW PROMPT Task: Named Entity Recognition
Description: Extract ORG: Organisationsnamen (Parteien, Vereine, Institutionen, Unternehmen) from text

Input schema: {'text': 'str'}
Output schema:
entities: List[Entity]

Entity:
  text: string  # The matched text span
  start: integer  # Start character offset
  end: integer  # End character offset
  type: string  # Entity label

TRAINING EXAMPLES (15 shown):

Input:

In [10]:
rule_learner = RuleChefLearner(
    model="gpt-5-mini-2025-08-07",
    dataset="ler",
    allowed_classes="ORG",
    rule_file="gpt5-mini_ORG_negative",
)
rule_learner.fit(sampled, negative)

{'NRM', 'LIT', 'LOC', 'PERS', 'RS', 'REG', 'ORG'}
âœ“ Loaded dataset: 0 corrections, 2 examples
âœ“ Added example (buffer: 1 new, 1 total)
âœ“ Added example (buffer: 2 new, 2 total)
âœ“ Added example (buffer: 3 new, 3 total)
âœ“ Added example (buffer: 4 new, 4 total)
âœ“ Added example (buffer: 5 new, 5 total)
âœ“ Added example (buffer: 6 new, 6 total)
âœ— Added negative example (buffer: 7 new, 7 total)
âœ— Added negative example (buffer: 8 new, 8 total)
âœ— Added negative example (buffer: 9 new, 9 total)
âœ— Added negative example (buffer: 10 new, 10 total)
âœ— Added negative example (buffer: 11 new, 11 total)
âœ— Added negative example (buffer: 12 new, 12 total)

ðŸ“¥ Converting 12 buffered examples to dataset...
   (0 corrections, 0 LLM, 12 human)
âœ“ Converted to dataset: 0 corrections, 14 examples

Learning rules from 14 training items
  Corrections: 0 (high value)
  Examples: 14
  Mode: Synthesis + Refinement (max 3 iterations)

NEW PROMPT Task: Named Entity Recognition
Descriptio

KeyboardInterrupt: 

In [11]:
rules_file = Path("./rulechef_data/gpt5-mini_ORG_negative.json")
rules_data = json.loads(rules_file.read_text())

rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
for rule in rules:
    print(rule.description)
    print(rule.content)
    print("####################")

Match ministries, federal/state authorities and similar institutional names containing strong keywords (Bundesministerium, Ministerium fÃ¼r, Bundesamt, Landesamt, BehÃ¶rde, Senatsverwaltung, Bundeskanzleramt, VeterinÃ¤rdienst, Staatsministerium, RegierungsprÃ¤sidium, Polizei). Captures surrounding multi-word names with commas/slashes/parentheses.
\b[A-ZÃ„Ã–Ãœ][A-Za-zÃ¤Ã¶Ã¼Ã„Ã–ÃœÃŸ\.,\-/\s\(\)"']{0,200}?(?:Bundesministerium|Ministerium\s+fÃ¼r|Ministerium|Bundesamt|Landesamt|BehÃ¶rde|Senatsverwaltung|Bundeskanzleramt|VeterinÃ¤rdienst|Staatsministerium|RegierungsprÃ¤sidium|Polizei)[A-Za-zÃ¤Ã¶Ã¼Ã„Ã–ÃœÃŸ\.,\-/\s\(\)"']{0,80}
####################
Match company names that include common German corporate suffixes (GmbH, GmbH & Co. KG, AG, e.V., eV, KG, OHG, UG), capturing preceding capitalized name tokens.
\b([A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼Ã„Ã–ÃœÃŸ\.-]*(?:\s+[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼Ã„Ã–ÃœÃŸ\.-]*){0,6})\s+(GmbH|GmbH\s*&\s*Co\.\s*KG|AG|e\.V\.|eV|KG|OHG|UG)\b
####################
Match universities, institutes,

In [12]:
rules = [Rule.from_dict(r) for r in rules_data.get("rules")]
# print(rules)

RuleChefExtractor = factory.make_extractor(
    "rulechef",
    dataset="ler",
    rules=rules,
)

print(RuleChefExtractor.predict({"text": "Nadia arbeitet beim AG GmbH."}))

{'entities': [{'text': 'AG GmbH', 'start': 20, 'end': 27, 'type': 'ORG'}]}


In [20]:
for rule in rules:
    print(rule.description)
    print(rule.content)
    print("####################")

Match long 'Bundesministerium fÃ¼r ...' official names (e.g., 'Bundesministerium fÃ¼r Verbraucherschutz, ErnÃ¤hrung und Landwirtschaft'). Captures sequences of capitalized components, commas and 'und' following 'fÃ¼r'.
\bBundesministerium\s+f(?:Ã¼|ue)r(?:\s*,?\s*(?:[A-ZÃ„Ã–Ãœ][A-Za-zÃ¤Ã¶Ã¼Ã„Ã–ÃœÃŸ\-]*|und|&|,))+
####################
Match German phrases for the European Court of Human Rights (various grammatical forms: 'EuropÃ¤ischen Gerichtshofs fÃ¼r Menschenrechte', 'EuropÃ¤ischer Gerichtshof fÃ¼r Menschenrechte').
\bEurop(?:Ã¤|ae)ischen\s+Gerichtshofs\s+f(?:Ã¼|ue)r\s+Menschenrechte\b|\bEurop(?:Ã¤|ae)ischer\s+Gerichtshof\s+f(?:Ã¼|ue)r\s+Menschenrechte\b
####################
Match 'Markenstelle fÃ¼r Klasse <num> des Deutschen Patent- und Markenamts' and similar full phrases including optional class numbers.
\bMarkenstelle\s+f(?:Ã¼|ue)r(?:\s+Klasse\s+\d+)?\s+des\s+Deutschen\s+Patent-?\s*und\s*Markenamts\b
####################
Match frequent single-token or tightly hyphenated German ins

In [14]:
print_rule_result(rules[0], "Nadia arbeitet beim Bundesverfassungsgericht.")

RULE
(?<!\S)\b(?:Bundesministerium|Ministerium|Bundesgerichtshof|Bundesverfassungsgericht|Gerichtshof|Markenstelle|Patent(?:\s*-\s*und\s*Markenamt|\s*und\s*Markenamt|(?:-?\s*Markenamts?)?)|Patent-\s*und\s*Markenamt)\b(?:(?:[\s,:\.\-\u2013\u2014]+(?:fÃ¼r|der|des|und|von|zu|am|im|dem|die|den|Klasse|\d+|[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)))*\b(?!\s+und\s+[a-zÃ¤Ã¶Ã¼ÃŸ])
regex.Regex('(?<!\\S)\\b(?:Bundesministerium|Ministerium|Bundesgerichtshof|Bundesverfassungsgericht|Gerichtshof|Markenstelle|Patent(?:\\s*-\\s*und\\s*Markenamt|\\s*und\\s*Markenamt|(?:-?\\s*Markenamts?)?)|Patent-\\s*und\\s*Markenamt)\\b(?:(?:[\\s,:\\.\\-\\u2013\\u2014]+(?:fÃ¼r|der|des|und|von|zu|am|im|dem|die|den|Klasse|\\d+|[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)))*\\b(?!\\s+und\\s+[a-zÃ¤Ã¶Ã¼ÃŸ])', flags=regex.V0)
######
RESULT
Bundesverfassungsgericht
Bundesverfassungsgericht
-----------------------


In [15]:
print_rule_result(rules[1], "Nadia arbeitet beim 1. Zivilkammer")
# DOES NOT FIND IT

print_rule_result(rules[1], "Nadia arbeitet beim 1. Zivilkammer des Landesgerichtswien")

RULE
(?<!\b(?:des|der|dem|den)\s)\b[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*(?:\s+[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)?\s+(?:Gerichtshofs?|Gericht|Ministerium|Ministeriums?|Amt|BehÃ¶rde|Dienst|Stelle|Zentrum|Institut|Universit(?:Ã¤t)|Agentur|Kammer|Senat|Rat|Anstalt|Abteilung)\b(?:(?:[\s,:\.\-\u2013\u2014]+(?:fÃ¼r|der|des|und|von|zu|am|im|dem|die|den|Klasse|\d+|[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)))*\b(?!\s+und\s+[a-zÃ¤Ã¶Ã¼ÃŸ])
regex.Regex('(?<!\\b(?:des|der|dem|den)\\s)\\b[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*(?:\\s+[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)?\\s+(?:Gerichtshofs?|Gericht|Ministerium|Ministeriums?|Amt|BehÃ¶rde|Dienst|Stelle|Zentrum|Institut|Universit(?:Ã¤t)|Agentur|Kammer|Senat|Rat|Anstalt|Abteilung)\\b(?:(?:[\\s,:\\.\\-\\u2013\\u2014]+(?:fÃ¼r|der|des|und|von|zu|am|im|dem|die|den|Klasse|\\d+|[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)))*\\b(?!\\s+und\\s+[a-zÃ¤Ã¶Ã¼ÃŸ])', flags=regex.V0)
######
RESULT
-----------------------
RULE
(?<!\b(?:des|der|dem|den)\s)\b[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*(?:\s+[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶

In [11]:
print_rule_result(rules[3], "Beim FA habe ich mich beworben")

RULE
\b[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*(?:\s+[A-ZÃ„Ã–Ãœ][\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)*\s+(?:GmbH|AG|KG|OHG|UG|e\.V\.)\b
regex.Regex('\\b[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*(?:\\s+[A-ZÃ„Ã–Ãœ][\\wÃ¤Ã¶Ã¼ÃŸÃ„Ã–Ãœ-]*)*\\s+(?:GmbH|AG|KG|OHG|UG|e\\.V\\.)\\b', flags=regex.V0)
######
RESULT
-----------------------


In [None]:
print_rule_result(rules[4], "Nadia hat sich beim AG GmbH beworben")

In [None]:
print_rule_result(rules[5], "Ich arbeite beim Finanzgericht")

In [None]:
print_rule_result(rules[6], "ich studiere an der TU Hochschule")