In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp("Dr Alex Smith chaired first board meeting at Google")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON'), ('first board', 'ORG'), ('Google', 'ORG')]


In [4]:
def add_title(doc):
    new_ents = []
    for ent in doc.ents:
        # Only check for title if it's a person and not the first token
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [5]:
nlp.add_pipe(add_title, after='ner')

In [6]:
doc = nlp("Dr Alex Smith chaired first board meeting at Google")

In [7]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Dr Alex Smith', 'PERSON')]


### Use of POS and dep Parsing

In [8]:
doc =nlp("Alex Smith was working at Google")

In [10]:
displacy.render(doc, style="dep", options={"compact": True, "distance":100})

In [13]:
from spacy.pipeline import merge_entities

In [11]:
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
    return doc

In [14]:
nlp.add_pipe(merge_entities)
nlp.add_pipe(extract_person_orgs)

In [25]:
nlp = spacy.load("en_core_web_sm")

In [31]:
doc =nlp("Alex Smith worked at Google")

In [27]:
displacy.render(doc, options={"compact": True, "distance":100})

In [28]:
displacy.render(doc, options={'fine_grained': True})

In [32]:
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [t for t in prep.children if t.ent_type_ == "ORG"]
                aux = [token for token in head.children if token.dep_ == "aux"]
                past_aux = any(t.tag_ == "VBD" for t in aux)
                past = head.tag_ == "VBD" or head.tag_ == "VBG" and past_aux
                print({'person': ent, 'orgs': orgs, 'past': past})
    return doc

In [33]:
nlp.add_pipe(merge_entities)
nlp.add_pipe(extract_person_orgs)

In [34]:
nlp = spacy.load("en_core_web_sm")
doc =nlp("Alex Smith worked at Google")

In [35]:
displacy.render(doc, options={"compact": True, "distance":100})