# spaCy in Python

In [149]:
import pandas as pd
import numpy as np

import spacy
from spacy import displacy

from collections import Counter

import pprint as pp


# displacy.render(doc, style='ent', jupyter=True)

nlp = spacy.load('en_core_web_md') #Language Class Instance


In [150]:
df_articles = pd.read_pickle('..//data//processed//df_articles.pkl')

# # Convert Timestamps where necessary
# df_articles['pub_date'] = pd.to_datetime(df_articles['pub_date'])

# # Drop constant columns, userTitle is 99% empty
# df_articles.drop(columns=['uri','byline.person'],inplace=True)

In [137]:
# Entity Recognizer
doc = nlp(df_articles.iloc[300]['keywords']) # Make a Doc object
displacy.render(doc, style='ent')

In [138]:
# Entity Recognizer
doc = nlp(df_articles.iloc[300]['lead_paragraph'])
displacy.render(doc, style='ent')

In [139]:
df_articles.dropna(subset=['keywords'],inplace=True)

In [108]:
df_articles['clean_keywords'] = df_articles['keywords'].apply(eval)

In [109]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('FULL_NAME', None, pattern)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
    return span.text

In [114]:
# doc = nlp(df_articles.iloc[400]['keywords'])
# # print(extract_full_name(doc))
# for token in doc:
#     print(token)

[
'
Kushner
,
Jared
'
,
'
Presidential
Election
of
2020
'
,
'
United
States
International
Relations
'
,
'
Trump
,
Donald
J
'
,
'
Trump
,
Ivanka
'
,
'
United
States
Politics
and
Government
'
,
'
Iran
'
]


# Customizing the Tokenizer

## How many sentences do we have

In [140]:
text = nlp(df_articles.iloc[300]['lead_paragraph'])
doc = nlp(text)

for sent in doc.sents:
    print(sent.text)

print(len(list(doc.sents)))


Much of the work of H.P. Lovecraft, an American horror and science fiction writer who worked during the first decades of the 20th century, is defined by individual encounters with the incomprehensible, with sights, sounds and ideas that undermine and disturb reality as his characters understand it.
Faced with things too monstrous to be real, but which exist nonetheless, Lovecraftian protagonists either reject their senses or descend into madness, unable to live with what they’ve learned.
2


### Lemmas

In [30]:
doc = nlp(df_articles.iloc[300]['lead_paragraph'])
# doc = nlp(df_articles.iloc[300]['keywords'])

for token in doc:
    print(token.text, token.lemma_)

Much much
of of
the the
work work
of of
H.P. H.P.
Lovecraft Lovecraft
, ,
an an
American american
horror horror
and and
science science
fiction fiction
writer writer
who who
worked work
during during
the the
first first
decades decade
of of
the the
20th 20th
century century
, ,
is be
defined define
by by
individual individual
encounters encounter
with with
the the
incomprehensible incomprehensible
, ,
with with
sights sight
, ,
sounds sound
and and
ideas idea
that that
undermine undermine
and and
disturb disturb
reality reality
as as
his his
characters character
understand understand
it it
. .
Faced face
with with
things thing
too too
monstrous monstrous
to to
be be
real real
, ,
but but
which which
exist exist
nonetheless nonetheless
, ,
Lovecraftian Lovecraftian
protagonists protagonist
either either
reject reject
their their
senses sense
or or
descend descend
into into
madness madness
, ,
unable unable
to to
live live
with with
what what
they they
’ve ’ve
learned learn
. .


In [32]:
# Noun Phrases

doc.ents

(H.P. Lovecraft, American, the first decades of the 20th century, Lovecraftian)

In [34]:
list(doc.noun_chunks)

[the work,
 H.P. Lovecraft,
 an American horror and science fiction writer,
 who,
 the first decades,
 the 20th century,
 individual encounters,
 the incomprehensible,
 sights,
 sounds,
 ideas,
 that,
 reality,
 his characters,
 it,
 things,
 which,
 their senses,
 madness,
 what,
 they]

# what is the POS breakdown

In [141]:
doc = nlp(df_articles.iloc[700]['lead_paragraph'])
doc = nlp(df_articles.iloc[700]['keywords'])

pos = []

for token in doc:
    pos.append(token.pos_)
#     if token.pos_ == 'NOUN':
#         print(token.pos_,token.text)

# for token in doc:
#     if token.pos_ == 'VERB':
#         print(token.pos_,token.text)

pos_ctr = Counter(pos)
pp.pprint(pos_ctr.most_common)

<bound method Counter.most_common of Counter({'PUNCT': 33, 'PROPN': 18, 'NOUN': 7, 'PART': 5, 'CCONJ': 4})>


In [142]:
ctr = Counter(doc.ents)
ctr.most_common()

[(Mississippi, 1),
 (Prison Guards and Corrections Officers', 1),
 (Criminal Justice', 1),
 (Mississippi State Penitentiary', 1),
 (Homicides, 1)]

In [143]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [64]:
doc = nlp(df_articles.iloc[700]['lead_paragraph'])
# doc = nlp(df_articles.iloc[700]['keywords'])

pos = []

for token in doc:
    pos.append(token.pos_)
#     if token.pos_ == 'NOUN':
#         print(token.pos_,token.text)

# for token in doc:
#     if token.pos_ == 'VERB':
#         print(token.pos_,token.text)

# set(pos)
pos_ctr = Counter(pos)
pp.pprint(pos_ctr.most_common)

<bound method Counter.most_common of Counter({'PUNCT': 7, 'PROPN': 6, 'ADP': 5, 'DET': 4, 'NOUN': 3, 'VERB': 3, 'PRON': 2, 'ADV': 1, 'SCONJ': 1, 'ADJ': 1, 'AUX': 1, 'NUM': 1, 'INTJ': 1})>


In [83]:
doc = nlp(df_articles.iloc[700]['lead_paragraph'])
pos = []
doc = nlp(text)
for token in doc:
    pos.append(token.pos_)

pos = Counter(pos)
pos['VERB']

11

In [65]:
pos_ctr.keys()

dict_keys(['PROPN', 'PUNCT', 'DET', 'NOUN', 'VERB', 'ADV', 'SCONJ', 'PRON', 'ADP', 'ADJ', 'AUX', 'NUM', 'INTJ'])

In [66]:
pos_ctr['NOUN']

3

In [74]:
doc = nlp(df_articles.iloc[700]['keywords'])
# doc = nlp(df_articles.iloc[700]['keywords'])

doc.ents

for token in doc:
    print(token.text, token.ent_type_)

[ 
' 
Mississippi GPE
' 
, 
' 
Prisons 
and 
Prisoners 
' 
, 
' 
Cellular 
Telephones 
' 
, 
' 
Smuggling 
' 
, 
' 
Prison ORG
Guards ORG
and ORG
Corrections ORG
Officers ORG
' ORG
, 
' 
Criminal ORG
Justice ORG
' ORG
, 
' 
Mississippi ORG
State ORG
Penitentiary ORG
' ORG
, 
' 
Parchman 
( 
Miss 
) 
' 
, 
' 
Murders 
, 
Attempted 
Murders 
and 
Homicides PERSON
' 
, 
' 
Assaults 
' 
, 
' 
Demonstrations 
, 
Protests 
and 
Riots 
' 
] 


In [68]:
type(doc.ents[0])

spacy.tokens.span.Span

### Find Names

In [124]:
doc = nlp(df_articles.iloc[900]['keywords']) # Make a Doc object
displacy.render(doc, style='ent')

In [133]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

doc = nlp(df_articles.iloc[900]['keywords'])

def extract_full_name(nlp_doc):
   # Get only names from here

   pattern = [{'POS': 'PROPN'}, 
              {'POS': 'PROPN'}]

   matcher.add('FULL_NAME', [pattern])
   matches = matcher(nlp_doc)
   for match_id, start, end in matches:
      span = nlp_doc[start:end]

   return span.text

extract_full_name(doc)

'Military Forces'

In [148]:
from spacy.tokens import Span
import dateparser

# https://stackoverflow.com/questions/51490620/extracting-names-from-a-text-file-using-spacy

def expand_person_entities(doc):
    new_ents = []
    for ent in doc.ents:
        # Only check for title if it's a person and not the first token
        if ent.label_ == "PERSON":
            if ent.start != 0:
                # if person preceded by title, include title in entity
                prev_token = doc[ent.start - 1]
                if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                    new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                    new_ents.append(new_ent)
                else:
                    # if entity can be parsed as a date, it's not a person
                    if dateparser.parse(ent.text) is None:
                        new_ents.append(ent) 
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Add the component after the named entity recognizer
# nlp.remove_pipe('expand_person_entities')

@Language.component('expand_person_entities')
nlp.add_pipe("expand_person_entities", after='ner')

SyntaxError: invalid syntax (Temp/ipykernel_20368/1283036612.py, line 30)

In [129]:
l = [1,2,3,4,5,89]

ls = pd.Series(l,name='numbers')
print(ls)

0     1
1     2
2     3
3     4
4     5
5    89
Name: numbers, dtype: int64


In [130]:
def extract_nlp(doc):
    return {
    'lemmas'          : extract_lemmas(doc,
                                        exclude_pos = ['PART', 'PUNCT',
                                        'DET', 'PRON', 'SYM', 'SPACE'],
                                        filter_stops = False),
    'adjs_verbs'      : extract_lemmas(doc, include_pos = ['ADJ', 'VERB']),
    'nouns'           : extract_lemmas(doc, include_pos = ['NOUN', 'PROPN']),
    'noun_phrases'    : extract_noun_phrases(doc, ['NOUN']),
    'adj_noun_phrases': extract_noun_phrases(doc, ['ADJ']),
    'entities'        : extract_entities(doc, ['PERSON', 'ORG', 'GPE', 'LOC'])
    }

In [131]:
## Spacy Find Names
def extract_full_name(nlp_doc):
    matcher.add('FULL_NAME', [{'POS': 'PROPN'}, {'POS': 'PROPN'}])
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:      
        span = nlp_doc[start:end]
    return span.text

In [132]:
extract_full_name(doc)

ValueError: [E178] Each pattern should be a list of dicts, but got: {'POS': 'PROPN'}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('FULL_NAME', [pattern])`

In [159]:
nlp.add_pipe("merge_entities", after="ner")

doc = nlp(df_articles.iloc[300]['lead_paragraph'])

pattern = [{'POS': 'PROPN'}, 
              {'POS': 'PROPN'}]

matcher = Matcher(nlp.vocab)

matcher.add('person_only', [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end])

ValueError: [E007] 'merge_entities' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner', 'merge_entities']

In [164]:
doc = nlp(df_articles.iloc[20]['keywords'])
displacy.render(doc, style='ent')

In [165]:
doc = nlp(df_articles.iloc[20]['keywords'])

pattern = [{'POS': 'PROPN'}, 
{'POS': 'PROPN'}]

matcher = Matcher(nlp.vocab)

matcher.add('person_only', [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end])

MatchPatternError: Invalid token patterns for matcher rule 'person_only'

Pattern 0:
- [pattern -> 0 -> ENT] extra fields not permitted
- [pattern -> 1 -> ENT] extra fields not permitted
