In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.tokenize import PunktSentenceTokenizer

In [5]:
# Sample sentence used as the running example throughout this notebook.
# NOTE(review): "marketand" looks like a missing space ("market and"); it is left
# untouched here because the recorded outputs below tokenize it as a single word.
ex = "European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone marketand ordered the company to alter its practices"

Then we apply word tokenization and part-of-speech tagging to the sentence.

In [6]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Args:
        sent: Raw sentence text.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the word tokens of ``sent``;
        the recorded notebook output shows a list of ``(word, tag)`` tuples.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [7]:
# Run the tokenize + POS-tag pipeline on the sample sentence and display the result.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('marketand', 'NN'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

### POS tag list:

- **CC**	coordinating conjunction
- **CD**	cardinal number
- **DT**	determiner
- **EX**	existential there (like: "there is" ... think of it like "there exists")
- **FW**	foreign word
- **IN**	preposition/subordinating conjunction
- **JJ**	adjective	'big'
- **JJR**	adjective, comparative	'bigger'
- **JJS**	adjective, superlative	'biggest'
- **LS**	list marker	1)
- **MD**	modal	could, will
- **NN**	noun, singular 'desk'
- **NNS**	noun plural	'desks'
- **NNP**	proper noun, singular	'Harrison'
- **NNPS**	proper noun, plural	'Americans'
- **PDT**	predeterminer	'all the kids'
- **POS**	possessive ending	parent's
- **PRP**	personal pronoun	I, he, she

- **PRP$**	possessive pronoun	my, his, hers

- **RB**	adverb	very, silently,
- **RBR**	adverb, comparative	better
- **RBS**	adverb, superlative	best
- **RP**	particle	give up
- **TO**	to	go 'to' the store.
- **UH**	interjection	errrrrrrrm
- **VB**	verb, base form	take
- **VBD**	verb, past tense	took
- **VBG**	verb, gerund/present participle	taking
- **VBN**	verb, past participle	taken
- **VBP**	verb, sing. present, non-3d	take
- **VBZ**	verb, 3rd person sing. present	takes
- **WDT**	wh-determiner	which
- **WP**	wh-pronoun	who, what
- **WP$**	possessive wh-pronoun	whose
- **WRB**	wh-adverb	where, when

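### Regular expression identifiers
- \d = any digit
- \D = anything but a digit
- \s = any whitespace character (space, tab, newline)
- \S = anything but whitespace
- \w = any word character (letter, digit, or underscore)
- \W = anything but a word character
- . = any character, except for a new line
- \b = word boundary (the position between a word character and a non-word character)
- \. = a literal period. The backslash is required because . normally means any character.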

### Regular expression modifiers:
- {1,3} = match 1 to 3 repetitions of the preceding pattern, e.g. `\d{1,3}` matches 1-3 digits
- `+` = match 1 or more repetitions
- ? = match 0 or 1 repetitions.
- `*` = match 0 or more repetitions
- $ = matches at the end of a string
- ^ = matches at the start of a string
- | = matches either/or. Example: x|y will match either x or y
- [] = character set or range, e.g. `[a-z]` or `[aeiou]`
- {x} = expect exactly x repetitions of the preceding pattern
- {x,y} = expect between x and y repetitions of the preceding pattern

We get a list of tuples containing the individual words in the sentence and their associated part-of-speech.

Now we’ll implement noun phrase chunking to identify named entities using a regular expression consisting of rules that indicate how sentences should be chunked.

Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [8]:
# Chunk grammar with a single rule: an NP is an optional determiner (DT),
# followed by any number of adjectives (JJ), followed by a singular noun (NN).
pattern = r'NP: {<DT>?<JJ>*<NN>}'

### Chunking
Using this pattern, we create a chunk parser and test it on our sentence.

In [9]:
# Build a regexp-based chunk parser from the NP grammar and apply it to the
# POS-tagged sample sentence; printing the result shows the chunk tree.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP marketand/NN)
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


The output can be read as a tree or a hierarchy with S as the first level, denoting the sentence. We can also display it graphically.

IOB tags have become the standard way to represent chunk structures in files, and we will also be using this format.

In [10]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples and pretty-print them.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('marketand', 'NN', 'B-NP'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In this representation, there is one token per line, each with its part-of-speech tag and its IOB chunk tag (here B-NP, I-NP, or O). Based on a training corpus in this format, we can construct a tagger that can be used to label new sentences, and use the nltk.chunk.conlltags2tree() function to convert the tag sequences back into a chunk tree.

With the function nltk.ne_chunk(), we can recognize named entities using a classifier, the classifier adds category labels such as PERSON, ORGANIZATION, and GPE.

In [11]:
# Tokenize and POS-tag the sample sentence again, then pass the tagged tokens
# to nltk.ne_chunk and print the resulting tree.
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  marketand/NN
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


Google is recognized as a person. It’s quite disappointing, don’t you think so?

In [12]:
# Split the sample text into sentences; `tokenized` is consumed by the
# process_content_* functions below.
# NOTE(review): the tokenizer is constructed from `ex` and then applied to the same
# text; presumably the constructor argument is Punkt training text — confirm in the NLTK docs.
custom_sent_tokenizer = PunktSentenceTokenizer(ex)
tokenized = custom_sent_tokenizer.tokenize(ex)

In [13]:
def process_content_cuanking(sentences=None, draw=True):
    """Chunk adverb/verb + proper-noun groups in each sentence and print the tree.

    The grammar groups any adverbs (RB*), then any verbs (VB*), then one or
    more singular proper nouns (NNP), then an optional singular noun (NN)
    into a node labelled ``Chunk``.

    Args:
        sentences: Iterable of sentence strings. Defaults to the first five
            entries of the notebook-level ``tokenized`` list, which is what
            the original cell processed.
        draw: When True (the default, matching the original behaviour) each
            tree is also shown with ``Tree.draw()``. Pass False to only print.

    Returns:
        None. The chunk trees are printed (and optionally drawn).
    """
    if sentences is None:
        sentences = tokenized[:5]

    # The grammar does not depend on the sentence, so build the parser once
    # instead of once per loop iteration.
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunked = chunkParser.parse(tagged)
        if draw:
            chunked.draw()
        print(chunked)

process_content_cuanking()

(S
  European/JJ
  authorities/NNS
  (Chunk fined/VBD Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  (Chunk Wednesday/NNP)
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  marketand/NN
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [11]:
def process_content_chinking(sentences=None, draw=True):
    """Chunk each sentence, then chink verbs, prepositions, determiners and 'to'.

    The first grammar rule puts every token into a ``Chunk``; the second rule
    (written with reversed braces) removes runs of VB*, IN, DT and TO tokens
    from those chunks, splitting them where the removed tokens were.

    Args:
        sentences: Iterable of sentence strings. Defaults to the first five
            entries of the notebook-level ``tokenized`` list, which is what
            the original cell processed.
        draw: When True (the default, matching the original behaviour) each
            tree is also shown with ``Tree.draw()``. Pass False to only print.

    Returns:
        None. The chunk trees are printed (and optionally drawn).
    """
    if sentences is None:
        sentences = tokenized[:5]

    # The grammar does not depend on the sentence, so build the parser once
    # instead of once per loop iteration.
    chunkGram = r"""Chunk: {<.*>+}
                                }<VB.?|IN|DT|TO>+{"""
    chunkParser = nltk.RegexpParser(chunkGram)

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunked = chunkParser.parse(tagged)
        if draw:
            chunked.draw()
        print(chunked)

process_content_chinking()

(S
  (Chunk European/JJ authorities/NNS)
  fined/VBD
  (Chunk Google/NNP)
  a/DT
  (Chunk record/NN $/$ 5.1/CD billion/CD)
  on/IN
  (Chunk Wednesday/NNP)
  for/IN
  abusing/VBG
  (Chunk its/PRP$ power/NN)
  in/IN
  the/DT
  (Chunk mobile/JJ phone/NN marketand/NN)
  ordered/VBD
  the/DT
  (Chunk company/NN)
  to/TO
  alter/VB
  (Chunk its/PRP$ practices/NNS))


In [14]:
def process_content_named_entity(sentences=None, draw=True):
    """Run NLTK's named-entity chunker on each sentence and print the tree.

    ``nltk.ne_chunk`` is called with ``binary=True``; in the recorded output
    the detected entity is labelled simply ``NE``.

    Args:
        sentences: Iterable of sentence strings. Defaults to the first five
            entries of the notebook-level ``tokenized`` list, which is what
            the original cell processed.
        draw: When True (the default, matching the original behaviour) each
            tree is also shown with ``Tree.draw()``. Pass False to only print.

    Returns:
        None. The entity trees are printed (and optionally drawn).
    """
    if sentences is None:
        sentences = tokenized[:5]

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        if draw:
            namedEnt.draw()
        print(namedEnt)

process_content_named_entity()

(S
  (NE European/JJ)
  authorities/NNS
  fined/VBD
  Google/NNP
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  marketand/NN
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


### SpaCy
SpaCy’s named entity recognition has been trained on the OntoNotes 5 corpus and it supports the following entity types:

##### TYPE	-      DESCRIPTION
- PERSON	- People, including fictional.
- NORP	- Nationalities or religious or political groups.
- FAC	- Buildings, airports, highways, bridges, etc.
- ORG	- Companies, agencies, institutions, etc.
- GPE	- Countries, cities, states.
- LOC	- Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT	- Objects, vehicles, foods, etc. (Not services.)
- EVENT	- Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART	- Titles of books, songs, etc.
- LAW	- Named documents made into laws.
- LANGUAGE	- Any named language.
- DATE	- Absolute or relative dates or periods.
- TIME	- Times smaller than a day.
- PERCENT	- Percentage, including "%".
- MONEY	- Monetary values, including unit.
- QUANTITY	- Measurements, as of weight or distance.
- ORDINAL	- "first", "second", etc.
- CARDINAL	- Numerals that do not fall under another type.

In [15]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the 'en_core_web_sm' pipeline; `nlp` is reused by every spaCy cell below.
nlp = spacy.load('en_core_web_sm')

One of the nice things about spaCy is that we only need to apply nlp once; the entire background pipeline runs and returns the processed objects.

In [16]:
# Run the pipeline on the sample sentence and list each detected entity with its label.
doc = nlp(ex)
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


European is NORP (nationalities or religious or political groups), Google is an organization, $5.1 billion is a monetary value and Wednesday is a date object. They are all correct.

### Token
In the example above we worked at the entity level. In the following example we demonstrate token-level entity annotation, using the BILUO tagging scheme to describe the entity boundaries (note that the `ent_iob_` attribute used in the code below reports only the simpler B/I/O codes).

### BILUO Scheme
##### TAG	-  DESCRIPTION
- **B** EGIN	The first token of a multi-token entity.
- **I** N	An inner token of a multi-token entity.
- **L** AST	The final token of a multi-token entity.
- **U** NIT	A single-token entity.
- **O** UT	A non-entity token.

In [17]:
# Per-token view: the token, its `ent_iob_` code, and its entity type
# (an empty string in the output for tokens outside any entity).
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (marketand, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


"B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set.

### Extracting named entity from an article

Now let’s get serious with SpaCy and extract named entities from a New York Times article: “F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired.”

In [18]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently analysing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their text does not end up in the result.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [19]:
# Fetch the article and reduce it to plain text (requires network access).
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
# Uncomment the next line to inspect the extracted text.
#ny_bb

In [20]:
# Run the spaCy pipeline over the article text, then report how many
# entities were found and print each one on its own line.
article = nlp(ny_bb)
print(len(article.ents))
for ent in article.ents:
    print(ent)

223
Peter Strzok
Who Criticized Trump
Fired
The New York Times                                              
PaperAdvertisementSupported
Peter Strzok
Who Criticized Trump
F.B.I.
Trump
CreditCreditT.J. Kirkpatrick
The New York TimesBy Adam Goldman
Michael S. SchmidtAug
13
2018WASHINGTON
Peter Strzok
F.B.I.
Trump
Hillary Clinton
Russia
Strzok
Monday
2016
F.B.I.
Lisa Page — in
Russia
Strzok
20 years
F.B.I.
the early months
Strzok
F.B.I.
Trump
Strzok
last summer
Robert S. Mueller III
Strzok
Twitter
Monday
Trump’s
June
Strzok
F.B.I.
Hillary Clinton’s
2016
Strzok
Office of Professional Responsibility
Strzok
60 days
Strzok
House
July
Strzok
F.B.I.
David Bowdich
the Office of Professional Responsibility
Strzok
F.B.I.
Strzok
Strzok
Trump
F.B.I.
Bowdich
F.B.I.
Christopher A. Wray
Aitan Goelman
Strzok
Special Agent Strzok
Wray
Congress
F.B.I.
Goelman
Americans
Goelman
Special Agent
Strzok
Page
Trump
Page
Strzok
Michael E. Horowitz
Strzok
Strzok
Clinton
just weeks
2016
Horowitz
Hundreds
months
Tru

In [21]:
# Tally how many entities of each label were found in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 7,
         'DATE': 32,
         'EVENT': 1,
         'FAC': 1,
         'GPE': 38,
         'LOC': 1,
         'MONEY': 1,
         'NORP': 6,
         'ORDINAL': 1,
         'ORG': 39,
         'PERSON': 92,
         'PRODUCT': 2,
         'WORK_OF_ART': 2})

The following are the three most frequent entities.

In [22]:
# Count the entity strings and show the three most common ones.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

Let’s randomly select one sentence to learn more.

In [23]:
# Materialize the article's sentences so one can be picked by index.
sentences = list(article.sents)
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


Let’s run **displacy.render** to generate the raw markup.

In [24]:
# Re-parse the selected sentence on its own and render it with the 'ent' style.
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

Using spaCy’s built-in displaCy visualizer, here’s what the above sentence and its dependencies look like:

In [32]:
# Render the same sentence with the 'dep' style; the 'distance' option is set to 120.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [26]:
# (text, POS, lemma) for every token of the selected sentence,
# skipping stop words and punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[20]))
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Strzok', 'PROPN', 'strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Trump', 'PROPN', 'trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Bowdich', 'PROPN', 'bowdich'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'christopher'),
 ('A.', 'PROPN', 'a.'),
 ('Wray', 'PROPN', 'wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president'),
 ('’s', 'PART', '’s'),
 ('ire', 'NOUN', 'ire')]

In [27]:
# Map each entity string found in the selected sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Bowdich': 'PERSON',
 'Christopher A. Wray': 'PERSON',
 'F.B.I.': 'GPE',
 'Strzok': 'PERSON',
 'Trump': 'PERSON'}

The extracted named entities are correct except for “F.B.I.”, which is labelled GPE rather than ORG.

In [28]:
# Token-level view of the selected sentence: token, `ent_iob_` code, entity type.
print([(tok, tok.ent_iob_, tok.ent_type_) for tok in sentences[20]])

[(Firing, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (,, 'O', ''), (however, 'O', ''), (,, 'O', ''), (removes, 'O', ''), (a, 'O', ''), (favorite, 'O', ''), (target, 'O', ''), (of, 'O', ''), (Mr., 'O', ''), (Trump, 'B', 'PERSON'), (from, 'O', ''), (the, 'O', ''), (ranks, 'O', ''), (of, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'GPE'), (and, 'O', ''), (gives, 'O', ''), (Mr., 'O', ''), (Bowdich, 'B', 'PERSON'), (and, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'GPE'), (director, 'O', ''), (,, 'O', ''), (Christopher, 'B', 'PERSON'), (A., 'I', 'PERSON'), (Wray, 'I', 'PERSON'), (,, 'O', ''), (a, 'O', ''), (chance, 'O', ''), (to, 'O', ''), (move, 'O', ''), (beyond, 'O', ''), (the, 'O', ''), (president, 'O', ''), (’s, 'O', ''), (ire, 'O', ''), (., 'O', '')]


In [29]:
# Render the whole article with the 'ent' style.
displacy.render(article, jupyter=True, style='ent')