In [8]:
import spacy
from spacy.matcher import Matcher
import re
import datetime as dt

## Using the Matcher object from spaCy

In [9]:
# Load the pretrained Norwegian pipeline (the "nb_core_news_lg" model package
# must already be installed in the kernel's environment).
nlp = spacy.load("nb_core_news_lg")

In [10]:
# Token-based matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# One pattern = a list of per-token constraints: two consecutive tokens whose
# lowercased text is "marius" followed by "dioli".
pattern = [{"LOWER": "marius"}, {"LOWER": "dioli"}]

In [11]:
# Sample Norwegian sentence containing the name targeted by `pattern`.
tx = "Hei, mitt navn er Marius Dioli. Eg er nysgjerrig på kor mange poeng eg har."

In [12]:
def on_match(matcher, doc, id, matches):
    """Matcher callback: announce a hit and dump the full list of matches.

    Args:
        matcher: The matcher that fired the callback (unused).
        doc: The document being matched (unused).
        id: Index of the current match within ``matches`` (unused).
        matches: List of match tuples; printed as-is.
    """
    banner = 'Matched!'
    print(banner, matches)

In [13]:
# Register the name pattern under the key "MD matcher".
# Patterns are passed as a list and the callback as a keyword argument: this
# form is accepted by spaCy >= 2.2.2 and is the only one spaCy v3 accepts,
# whereas the older positional form add(key, on_match, *patterns) is rejected by v3.
matcher.add("MD matcher", [pattern], on_match=on_match)

In [14]:
#matcher = Matcher(nlp.vocab)

In [15]:
# Run the sample text through the pipeline to get a Doc.
doc = nlp(tx)

In [16]:
# Apply the matcher; this call also fires the registered callback, which is
# what prints the "Matched!" line shown in this cell's output.
matches = matcher(doc)

Matched! [(10180913855843715433, 5, 7)]


In [17]:
# Each match is (match_id, start, end): match_id is the hash of the key the
# pattern was registered under; start/end are token offsets into `doc`.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Look the hash up to recover the registered key
    span = doc[start:end]  # The matched span (end offset is exclusive)
    print(match_id, string_id, start, end, span.text)

10180913855843715433 MD matcher 5 7 Marius Dioli


In [18]:
# Single-token pattern: the token text must contain either 11 consecutive
# digits or 6 digits + one whitespace character + 5 digits.
# NOTE(review): the pattern constrains ONE token; spaCy tokens normally do not
# contain whitespace, so the second alternative presumably never matches and a
# two-token pattern would be needed for "DDMMYY NNNNN" -- confirm.
reg =  [{"TEXT": {"REGEX": r'(\b\d{11}\b)|(\b\d{6}\s\d{5}\b)'}}]


In [19]:
# Sample text with a name and an 11-digit fødselsnummer (rebinds `tx`).
# NOTE(review): this number passes both control-digit checks below -- confirm
# it is synthetic test data and not a real person's ID before sharing.
tx="Mitt navn er Marius Dioli og mitt fødselsnummer er 15044216652"

In [20]:
def checksum(matcher, doc, i, matches):
    """Matcher ``on_match`` callback that validates a matched fødselsnummer.

    Reads the first token of match ``i`` and treats its first six characters
    as a ``DDMMYY`` date and its last five characters as the person number
    (three individual digits followed by the two control digits k1 and k2).
    Findings are reported on stdout only; nothing is returned and ``matches``
    is left untouched.

    Args:
        matcher: The matcher that fired the callback (unused).
        doc: The document being matched; sliced with the match offsets.
        i: Index of the current match inside ``matches``.
        matches: List of ``(match_id, start, end)`` tuples.

    Prints:
        ``"Invalid date"`` when the first six digits do not form a date that
        ``strptime`` accepts; no control-digit check is attempted in that case.
        ``"Invalid personnummer"`` (at most once) when k1 and/or k2 is wrong.
        Nothing when the date part or the person-number part contains a
        non-digit character (the token is silently skipped).
    """
    match_id, start, end = matches[i]
    span = doc[start:end]  # The matched span
    fnr = span[0].text
    date = fnr[:6]
    prsn = fnr[-5:]

    # Both parts must be plain ASCII digits, otherwise the per-digit int()
    # conversions further down would raise ValueError on a stray character.
    if not (re.fullmatch(r"[0-9]{6}", date) and re.fullmatch(r"[0-9]{5}", prsn)):
        return

    # NOTE(review): variants that encode the day/month with an offset (e.g.
    # D-numbers) would be reported as invalid dates here -- confirm whether
    # they need to be supported.
    try:
        dt.datetime.strptime(date, "%d%m%y")
    except ValueError:
        print("Invalid date")
        return  # an impossible date already disqualifies the number

    digits = [int(ch) for ch in date + prsn]
    k1, k2 = digits[9], digits[10]

    # Modulus-11 weights. zip() stops at the shorter sequence, so the k1 sum
    # covers the first 9 digits and the k2 sum the first 10 (i.e. including k1).
    k1_weights = (3, 7, 6, 1, 8, 9, 4, 5, 2)
    k2_weights = (5, 4, 3, 2, 7, 6, 5, 4, 3, 2)
    expected_k1 = (11 - sum(w * d for w, d in zip(k1_weights, digits)) % 11) % 11
    expected_k2 = (11 - sum(w * d for w, d in zip(k2_weights, digits)) % 11) % 11

    # An expected value of 10 can never equal a single digit, so such numbers
    # are always reported as invalid.
    if k1 != expected_k1 or k2 != expected_k2:
        print("Invalid personnummer")

In [21]:
# Register the fødselsnummer pattern with `checksum` as its callback.
# Patterns are passed as a list and the callback as a keyword argument: this
# form is accepted by spaCy >= 2.2.2 and is the only one spaCy v3 accepts,
# whereas the older positional form add(key, on_match, *patterns) is rejected by v3.
matcher.add("fnr RegEx_med_checksum", [reg], on_match=checksum)
doc = nlp(tx)
# Running the matcher fires the callback registered for each matching key.
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, start, end, span.text)

Matched! [(10180913855843715433, 3, 5), (1737304374787103023, 9, 10)]
10180913855843715433 3 5 Marius Dioli
1737304374787103023 9 10 15044216652


## Using the Entity Ruler object from spaCy

In [22]:
from spacy.lang.nb import Norwegian
from spacy.pipeline import EntityRuler

In [None]:
# Example entity patterns: the first uses a plain string as its pattern, the
# second a list of per-token constraints (lowercased "san" then "francisco").
# NOTE(review): this cell has no execution count and `patterns` is not used by
# the cells visible here -- confirm whether it is still needed.
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]

In [32]:
# Entity pattern list: label any token matching the fødselsnummer regex as "FNR".
# NOTE: this rebinds `reg`, which earlier held the bare token pattern given to
# the Matcher; re-running the Matcher cells after this one would pass this
# differently shaped value.
reg = [{"label": "FNR", "pattern":[{"TEXT": {"REGEX": r'(\b\d{11}\b)|(\b\d{6}\s\d{5}\b)'}}]}]

In [35]:
# Reload the pretrained pipeline (rebinds `nlp`) before attaching the ruler.
nlp = spacy.load("nb_core_news_lg")
# Alternative (disabled): nlp = Norwegian() would give an empty model.
# Build the ruler, load the FNR pattern into it, then append it to the pipeline.
ruler = EntityRuler(nlp)
ruler.add_patterns(reg)
# NOTE(review): passing a component instance to add_pipe looks like the
# spaCy v2 calling style -- confirm the spaCy version this notebook targets.
nlp.add_pipe(ruler)

In [36]:
# Re-process the text with the pipeline that now includes the ruler and list
# every entity as a (text, label) pair.
doc = nlp(tx)
print([(ent.text, ent.label_) for ent in doc.ents])

[('Marius Dioli', 'PER'), ('15044216652', 'FNR')]


### Assigning multiple custom entities