In [1]:
# NER: Named Entity Recognition

In [2]:
import spacy

# Load the pretrained "en_core_web_sm" pipeline once; every cell below reuses
# this `nlp` object to turn raw text into a processed Doc.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Names of the components in the loaded pipeline; the 'ner' entry is the
# named-entity recognizer this notebook is about.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Run the pipeline on a sample sentence and list each detected entity as:
#   entity text  |  label  |  plain-English explanation of the label
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

for ent in doc.ents:
    # sep="  |  " yields the same text as printing " | " between
    # space-separated arguments.
    print(ent, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter Inc  |  PERSON  |  People, including fictional
$45 billion  |  MONEY  |  Monetary values, including unit


In [5]:
from spacy import displacy

# Visualize the entities of the most recently parsed `doc` with displaCy's
# entity style (style="ent").
displacy.render(doc, style="ent")

In [6]:
# Entity labels known to the pipeline's 'ner' component.
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [7]:
# Second example: a person, an organization and a date in one sentence.
# NOTE(review): the input spells the surname "Bloomber"; it is kept exactly
# as written because the recorded output depends on it.
doc = nlp("Michael Bloomber founded Bloomberg in 1982")
for ent in doc.ents:
    # Columns: entity text  |  label  |  explanation of the label
    print(ent, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Michael Bloomber  |  PERSON  |  People, including fictional
Bloomberg  |  ORG  |  Companies, agencies, institutions, etc.
1982  |  DATE  |  Absolute or relative dates or periods


In [8]:
# Same acquisition sentence as before but without the "Inc" suffixes, to see
# how the model labels the bare company names.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

for ent in doc.ents:
    # Columns: entity text  |  label  |  explanation of the label
    print(ent, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PERSON  |  People, including fictional
$45 billion  |  MONEY  |  Monetary values, including unit


In [9]:
# Slicing a Doc by token index gives back a Span object (see the output).
type(doc[2:5])

spacy.tokens.span.Span

In [10]:
from spacy.tokens import Span

# Build single-token ORG spans by hand: token 0 ("Tesla") and token 5
# ("Twitter") of the sentence currently held in `doc`.
s1, s2 = (Span(doc, start, start + 1, label="ORG") for start in (0, 5))

# default="unmodified" keeps the entities on all other tokens as they were
# (e.g. the MONEY span), so only the two spans above are overridden.
doc.set_ents([s1, s2], default="unmodified")

In [11]:
# Re-list the entities of `doc` (text  |  label) to confirm the manual override.
for ent in doc.ents:
    print(ent, ent.label_, sep="  |  ")

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


In [12]:
# Building my own NER

# Approach 1: simple lookup
# Using simple lookup methods. This is not really NLP, just hard-coded rules.

# Approach 2: rule-based methods
# e.g. "XYZ was born on 19 Aug, 2002."  (Rule: if a proper noun comes before "was born", that proper noun is the subject, i.e. the person.)
# e.g. phone numbers, which can be matched with a regex, etc.

# spaCy provides the EntityRuler component (pipe name "entity_ruler") for writing and specifying all of these rules.

# Approach 3: machine learning
# Using CRFs (conditional random fields), BERT, etc.


In [13]:
# Inspect the pipeline components again: the list is the same as at the start
# of the notebook, and it contains no 'entity_ruler' component.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']