In [None]:
# Using EntityRuler for domain-specific texts
#Import the requisite library
import spacy

#Build upon the spaCy Small Model
nlp = spacy.load("en_core_web_sm")

#Sample text
text = "The village of Treblinka is in Poland. Treblinka was also an extermination camp."

In [None]:
#Initially with EntityRuler
#extract entities
doc = nlp(text)
for ent in doc.ents:
    print (ent.text, ent.label_)

In [None]:
#Create the EntityRuler on a new nlp object
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns
patterns = [
                {"label": "GPE", "pattern": "Treblinka"}
            ]

ruler.add_patterns(patterns)

In [None]:
#extract entities
doc = nlp(text) #use nlp function again as we added a ruler pipe to it
for ent in doc.ents:
    print (ent.text, ent.label_)

In [None]:
#visualizing the pipeline
nlp.analyze_pipes()

In [None]:
#In order for our EntityRuler to have primacy, we have to assign it to after the “ner” pipe, as the example below shows in this line:

#Build upon the spaCy Small Model
nlp = spacy.load("en_core_web_sm")

#Sample text
text = "The village of Treblinka is in Poland. Treblinka was also an extermination camp."

#Create the EntityRuler
ruler = nlp.add_pipe("entity_ruler", after="ner")

#List of Entities and Patterns
patterns = [
                {"label": "GPE", "pattern": "Treblinka"}
            ]

ruler.add_patterns(patterns)


doc = nlp(text)

#extract entities
for ent in doc.ents:
    print (ent.text, ent.label_)

In [None]:
# Adding complex rules and variances
# refer https://spacy.io/usage/rule-based-matching
#Import the requisite library
import spacy

#Sample text
text = "This is a sample number (555) 555-5555."

#Build upon the spaCy Small Model
nlp = spacy.blank("en")

#Create the Ruler and Add it
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
                {"label": "PHONE_NUMBER", "pattern": [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
                {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]}
            ]
#add patterns to ruler
ruler.add_patterns(patterns)



#create the doc
doc = nlp(text)

#extract entities
for ent in doc.ents:
    print (ent.text, ent.label_)



In [None]:
import re
pattern = r"((\d){1,2} (January|February|March|April|May|June|July|August|September|October|November|December))"

text = "This is a date 2 February. Another date would be 14 August."
matches = re.findall(pattern, text)
print (matches)

In [None]:
#Using RegEx with spaCy
#Import the requisite library
import spacy

#Sample text
text = "This is a sample number 555-5555."

#Build upon the spaCy Small Model
nlp = spacy.blank("en")

#Create the Ruler and Add it
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
                {"label": "PHONE_NUMBER", "pattern": [{"SHAPE": "ddd"},
                {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]}
            ]
#add patterns to ruler
ruler.add_patterns(patterns)

#create the doc
doc = nlp(text)

#extract entities
for ent in doc.ents:
    print (ent.text, ent.label_)

In [None]:
pattern = r"((\d){3}-(\d){4})" # here 3 and 4 represent the number of digits
text = "This is a sample number 555-5555."
matches = re.findall(pattern, text)
print (matches)

In [None]:
#Working with Multi-Word Token Entities and RegEx in spaCy
#we can use spaCy’s Matcher to grab multi-word tokens, or tokens that span multiple tokens, but multi-word tokens are not placed into the doc.ents.
# So we cannot access them the same way we would other entities.
# To solve this: Inject the Spans into doc.ents

import re
import spacy
from spacy.tokens import Span

In [None]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
pattern = r"Paul [A-Z]\w+" # to fetch Paul followed by other word with first letter caps

In [None]:
nlp = spacy.blank("en")
doc = nlp(text)

In [None]:
original_ents = list(doc.ents)

In [None]:
#recosntruct spans
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

In [None]:
#inject spans into the doc.ents
for ent in mwt_ents:
    start, end, name = ent
    per_ent = Span(doc, start, end, label="PERSON")
    original_ents.append(per_ent)

In [None]:
doc.ents = original_ents


In [None]:
for ent in doc.ents:
    print (ent.text, ent.label_)

In [55]:
# Give priority to Longer Spans
# Sometimes our custom RegEx entities will overlap with spaCy’s Entities
import re
import spacy

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host."
pattern = r"Hollywood"

nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
for ent in doc.ents:
    print (ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP


In [56]:
#when one of our tokens from the re.finditer() overlapped with one that our “ner” component
#We fix this problem with spaCy’s filter_spans. This gives primacy to longer spans.
from spacy.util import filter_spans
filtered = filter_spans(original_ents)
doc.ents = filtered
for ent in doc.ents:
    print (ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON
