In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline once; the cells below reuse this `nlp` object.
nlp = en_core_web_sm.load()
from pprint import pprint

In [2]:
# Run the pipeline on a single sentence, then list each detected entity span
# together with its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [3]:
# Token-level view of the same sentence: each token with its IOB tag and
# entity type (empty string when the token is outside any entity).
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


Serious stuff: named-entity recognition on a full news article

In [4]:
from bs4 import BeautifulSoup
import requests
import re

In [5]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    The page is parsed with the 'html5lib' parser, every <script>, <style>
    and <aside> element is removed, and each run of newlines/tabs in the
    remaining text is replaced by a single space.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled server would block
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The extracted text.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently running NER on an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not article text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the article text, run it through the pipeline, and show how many
# entity spans were found.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

162

In [6]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 42,
         'PERSON': 79,
         'DATE': 25,
         'GPE': 9,
         'NORP': 2,
         'CARDINAL': 3,
         'LOC': 1,
         'ORDINAL': 1})

In [7]:
# Every entity span in the article, paired with its label.
[
    (ent.text, ent.label_)
    for ent in article.ents
]

[('F.B.I.', 'ORG'),
 ('Peter Strzok', 'PERSON'),
 ('Texts', 'ORG'),
 ('The New York Times', 'ORG'),
 ('InLog InToday', 'ORG'),
 ('Peter Strzok', 'PERSON'),
 ('Texts', 'ORG'),
 ('byContinue', 'PERSON'),
 ('Peter Strzok', 'PERSON'),
 ('Texts', 'ORG'),
 ('FiredPeter Strzok', 'ORG'),
 ('F.B.I.', 'ORG'),
 ('Trump', 'PERSON'),
 ('The New York TimesBy Adam', 'ORG'),
 ('Michael S. SchmidtAug', 'PERSON'),
 ('13', 'DATE'),
 ('2018WASHINGTON', 'GPE'),
 ('Peter Strzok', 'PERSON'),
 ('F.B.I.', 'ORG'),
 ('Trump', 'PERSON'),
 ('Hillary Clinton', 'PERSON'),
 ('Russia', 'GPE'),
 ('Strzok', 'PERSON'),
 ('Monday', 'DATE'),
 ('Trump', 'PERSON'),
 ('2016', 'DATE'),
 ('F.B.I.', 'ORG'),
 ('Lisa Page', 'PERSON'),
 ('Russia', 'GPE'),
 ('Strzok', 'PERSON'),
 ('20 years', 'DATE'),
 ('F.B.I.', 'ORG'),
 ('the early months', 'DATE'),
 ('Strzok', 'PERSON'),
 ('F.B.I.', 'ORG'),
 ('Trump', 'PERSON'),
 ('Strzok', 'PERSON'),
 ('last summer', 'DATE'),
 ('Robert S. Mueller III', 'PERSON'),
 ('Strzok', 'PERSON'),
 ('Monday

In [8]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13)]

In [9]:
# Materialise the sentence spans so they can be indexed, then show one.
sentences = list(article.sents)
print(sentences[20])

A spokeswoman for the F.B.I. did not respond to a message seeking comment about why Mr. Strzok was dismissed rather than demoted.


In [10]:
# Re-parse the chosen sentence on its own and highlight its entity spans.
displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)

In [11]:
# Dependency-parse view of the same sentence; 'distance' is forwarded to
# displaCy through its options dict.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [12]:
# (surface form, part-of-speech tag, lemma) for each token of the sentence
# that is neither a stop word nor tagged as punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[20]))
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]

[('spokeswoman', 'NOUN', 'spokeswoman'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('respond', 'VERB', 'respond'),
 ('message', 'NOUN', 'message'),
 ('seeking', 'VERB', 'seek'),
 ('comment', 'NOUN', 'comment'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('dismissed', 'VERB', 'dismiss'),
 ('demoted', 'VERB', 'demote')]

In [13]:
# Map each entity string found in the sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'F.B.I.': 'ORG', 'Strzok': 'PERSON'}

In [14]:
# IOB tag and entity type for every token of the sentence, taken from the
# already-parsed article rather than a fresh parse.
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[20]
])

[(A, 'O', ''), (spokeswoman, 'O', ''), (for, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (did, 'O', ''), (not, 'O', ''), (respond, 'O', ''), (to, 'O', ''), (a, 'O', ''), (message, 'O', ''), (seeking, 'O', ''), (comment, 'O', ''), (about, 'O', ''), (why, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (was, 'O', ''), (dismissed, 'O', ''), (rather, 'O', ''), (than, 'O', ''), (demoted, 'O', ''), (., 'O', '')]


In [15]:

# Highlight every entity span across the whole article.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)