In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [14]:
sentence = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [16]:
ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))

In [17]:
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [18]:
import nltk
nltk.download('averaged_perceptron_tagger')

nltk.download('maxent_ne_chunker')
nltk.download('words')
  

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abeo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/abeo/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/abeo/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [19]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/abeo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [21]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [22]:
sent = preprocess(ex)
sent

[(&#39;European&#39;, &#39;JJ&#39;),
 (&#39;authorities&#39;, &#39;NNS&#39;),
 (&#39;fined&#39;, &#39;VBD&#39;),
 (&#39;Google&#39;, &#39;NNP&#39;),
 (&#39;a&#39;, &#39;DT&#39;),
 (&#39;record&#39;, &#39;NN&#39;),
 (&#39;$&#39;, &#39;$&#39;),
 (&#39;5.1&#39;, &#39;CD&#39;),
 (&#39;billion&#39;, &#39;CD&#39;),
 (&#39;on&#39;, &#39;IN&#39;),
 (&#39;Wednesday&#39;, &#39;NNP&#39;),
 (&#39;for&#39;, &#39;IN&#39;),
 (&#39;abusing&#39;, &#39;VBG&#39;),
 (&#39;its&#39;, &#39;PRP$&#39;),
 (&#39;power&#39;, &#39;NN&#39;),
 (&#39;in&#39;, &#39;IN&#39;),
 (&#39;the&#39;, &#39;DT&#39;),
 (&#39;mobile&#39;, &#39;JJ&#39;),
 (&#39;phone&#39;, &#39;NN&#39;),
 (&#39;market&#39;, &#39;NN&#39;),
 (&#39;and&#39;, &#39;CC&#39;),
 (&#39;ordered&#39;, &#39;VBD&#39;),
 (&#39;the&#39;, &#39;DT&#39;),
 (&#39;company&#39;, &#39;NN&#39;),
 (&#39;to&#39;, &#39;TO&#39;),
 (&#39;alter&#39;, &#39;VB&#39;),
 (&#39;its&#39;, &#39;PRP$&#39;),
 (&#39;practices&#39;, &#39;NNS&#39;)]

In [23]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [24]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [27]:
import matplotlib
matplotlib.use('Agg')
NPChunker = nltk.RegexpParser(pattern) 
result = NPChunker.parse(sent)
result.draw()

TclError: no display name and no $DISPLAY environment variable

In [28]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[(&#39;European&#39;, &#39;JJ&#39;, &#39;O&#39;),
 (&#39;authorities&#39;, &#39;NNS&#39;, &#39;O&#39;),
 (&#39;fined&#39;, &#39;VBD&#39;, &#39;O&#39;),
 (&#39;Google&#39;, &#39;NNP&#39;, &#39;O&#39;),
 (&#39;a&#39;, &#39;DT&#39;, &#39;B-NP&#39;),
 (&#39;record&#39;, &#39;NN&#39;, &#39;I-NP&#39;),
 (&#39;$&#39;, &#39;$&#39;, &#39;O&#39;),
 (&#39;5.1&#39;, &#39;CD&#39;, &#39;O&#39;),
 (&#39;billion&#39;, &#39;CD&#39;, &#39;O&#39;),
 (&#39;on&#39;, &#39;IN&#39;, &#39;O&#39;),
 (&#39;Wednesday&#39;, &#39;NNP&#39;, &#39;O&#39;),
 (&#39;for&#39;, &#39;IN&#39;, &#39;O&#39;),
 (&#39;abusing&#39;, &#39;VBG&#39;, &#39;O&#39;),
 (&#39;its&#39;, &#39;PRP$&#39;, &#39;O&#39;),
 (&#39;power&#39;, &#39;NN&#39;, &#39;B-NP&#39;),
 (&#39;in&#39;, &#39;IN&#39;, &#39;O&#39;),
 (&#39;the&#39;, &#39;DT&#39;, &#39;B-NP&#39;),
 (&#39;mobile&#39;, &#39;JJ&#39;, &#39;I-NP&#39;),
 (&#39;phone&#39;, &#39;NN&#39;, &#39;I-NP&#39;),
 (&#39;market&#39;, &#39;NN&#39;, &#39;B-NP&#39;),
 (&#39;and&#39;, &#39;CC&#39;, &#3

In [29]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [30]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[(&#39;European&#39;, &#39;NORP&#39;),
 (&#39;Google&#39;, &#39;ORG&#39;),
 (&#39;$5.1 billion&#39;, &#39;MONEY&#39;),
 (&#39;Wednesday&#39;, &#39;DATE&#39;)]


In [31]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, &#39;B&#39;, &#39;NORP&#39;),
 (authorities, &#39;O&#39;, &#39;&#39;),
 (fined, &#39;O&#39;, &#39;&#39;),
 (Google, &#39;B&#39;, &#39;ORG&#39;),
 (a, &#39;O&#39;, &#39;&#39;),
 (record, &#39;O&#39;, &#39;&#39;),
 ($, &#39;B&#39;, &#39;MONEY&#39;),
 (5.1, &#39;I&#39;, &#39;MONEY&#39;),
 (billion, &#39;I&#39;, &#39;MONEY&#39;),
 (on, &#39;O&#39;, &#39;&#39;),
 (Wednesday, &#39;B&#39;, &#39;DATE&#39;),
 (for, &#39;O&#39;, &#39;&#39;),
 (abusing, &#39;O&#39;, &#39;&#39;),
 (its, &#39;O&#39;, &#39;&#39;),
 (power, &#39;O&#39;, &#39;&#39;),
 (in, &#39;O&#39;, &#39;&#39;),
 (the, &#39;O&#39;, &#39;&#39;),
 (mobile, &#39;O&#39;, &#39;&#39;),
 (phone, &#39;O&#39;, &#39;&#39;),
 (market, &#39;O&#39;, &#39;&#39;),
 (and, &#39;O&#39;, &#39;&#39;),
 (ordered, &#39;O&#39;, &#39;&#39;),
 (the, &#39;O&#39;, &#39;&#39;),
 (company, &#39;O&#39;, &#39;&#39;),
 (to, &#39;O&#39;, &#39;&#39;),
 (alter, &#39;O&#39;, &#39;&#39;),
 (its, &#39;O&#39;, &#39;&#39;),
 (practices, &#39;O&#39;, &#39;&#39

In [32]:
from bs4 import BeautifulSoup
import requests
import re

In [33]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [34]:
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

158

In [35]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({&#39;PERSON&#39;: 75,
         &#39;ORG&#39;: 40,
         &#39;PRODUCT&#39;: 1,
         &#39;CARDINAL&#39;: 5,
         &#39;GPE&#39;: 11,
         &#39;DATE&#39;: 23,
         &#39;NORP&#39;: 2,
         &#39;ORDINAL&#39;: 1})

In [36]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[(&#39;Strzok&#39;, 28), (&#39;F.B.I.&#39;, 18), (&#39;Trump&#39;, 11)]

In [37]:
sentences = [x for x in article.sents]
print(sentences[20])

But Mr. Strzok’s lawyer said the deputy director of the F.B.I., David Bowdich, had overruled the Office of Professional Responsibility and fired Mr. Strzok.


In [38]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [39]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [40]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[20])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[(&#39;Mr.&#39;, &#39;PROPN&#39;, &#39;Mr.&#39;),
 (&#39;Strzok&#39;, &#39;PROPN&#39;, &#39;Strzok&#39;),
 (&#39;lawyer&#39;, &#39;NOUN&#39;, &#39;lawyer&#39;),
 (&#39;said&#39;, &#39;VERB&#39;, &#39;say&#39;),
 (&#39;deputy&#39;, &#39;NOUN&#39;, &#39;deputy&#39;),
 (&#39;director&#39;, &#39;NOUN&#39;, &#39;director&#39;),
 (&#39;F.B.I.&#39;, &#39;PROPN&#39;, &#39;F.B.I.&#39;),
 (&#39;David&#39;, &#39;PROPN&#39;, &#39;David&#39;),
 (&#39;Bowdich&#39;, &#39;PROPN&#39;, &#39;Bowdich&#39;),
 (&#39;overruled&#39;, &#39;VERB&#39;, &#39;overrule&#39;),
 (&#39;Office&#39;, &#39;PROPN&#39;, &#39;Office&#39;),
 (&#39;Professional&#39;, &#39;PROPN&#39;, &#39;Professional&#39;),
 (&#39;Responsibility&#39;, &#39;PROPN&#39;, &#39;Responsibility&#39;),
 (&#39;fired&#39;, &#39;VERB&#39;, &#39;fire&#39;),
 (&#39;Mr.&#39;, &#39;PROPN&#39;, &#39;Mr.&#39;),
 (&#39;Strzok&#39;, &#39;PROPN&#39;, &#39;Strzok&#39;)]

In [41]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])

{&#39;Strzok&#39;: &#39;PERSON&#39;,
 &#39;F.B.I.&#39;: &#39;ORG&#39;,
 &#39;David Bowdich&#39;: &#39;PERSON&#39;,
 &#39;the Office of Professional Responsibility&#39;: &#39;ORG&#39;}

In [42]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[20]])

[(But, &#39;O&#39;, &#39;&#39;), (Mr., &#39;O&#39;, &#39;&#39;), (Strzok, &#39;B&#39;, &#39;PERSON&#39;), (’s, &#39;O&#39;, &#39;&#39;), (lawyer, &#39;O&#39;, &#39;&#39;), (said, &#39;O&#39;, &#39;&#39;), (the, &#39;O&#39;, &#39;&#39;), (deputy, &#39;O&#39;, &#39;&#39;), (director, &#39;O&#39;, &#39;&#39;), (of, &#39;O&#39;, &#39;&#39;), (the, &#39;O&#39;, &#39;&#39;), (F.B.I., &#39;B&#39;, &#39;ORG&#39;), (,, &#39;O&#39;, &#39;&#39;), (David, &#39;B&#39;, &#39;PERSON&#39;), (Bowdich, &#39;I&#39;, &#39;PERSON&#39;), (,, &#39;O&#39;, &#39;&#39;), (had, &#39;O&#39;, &#39;&#39;), (overruled, &#39;O&#39;, &#39;&#39;), (the, &#39;B&#39;, &#39;ORG&#39;), (Office, &#39;I&#39;, &#39;ORG&#39;), (of, &#39;I&#39;, &#39;ORG&#39;), (Professional, &#39;I&#39;, &#39;ORG&#39;), (Responsibility, &#39;I&#39;, &#39;ORG&#39;), (and, &#39;O&#39;, &#39;&#39;), (fired, &#39;O&#39;, &#39;&#39;), (Mr., &#39;O&#39;, &#39;&#39;), (Strzok, &#39;B&#39;, &#39;PERSON&#39;), (., &#39;O&#39;, &#39;&#39;)]


In [43]:
displacy.render(article, jupyter=True, style='ent')