In [4]:
import spacy


# Example sentences containing monetary amounts; main() runs the currency
# relation extractor over each of them in order.
TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]



def main(model="en_core_web_md"):
    """Load a spaCy pipeline and print the currency relations found in TEXTS.

    Args:
        model: Name of the spaCy model passed to ``spacy.load``.

    For every relation returned by ``extract_currency_relations`` one line is
    printed: the related token's text (left-aligned, padded to 10 characters),
    the money token's entity type and the money token's text, tab-separated.
    """
    pipeline = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for sentence in TEXTS:
        parsed = pipeline(sentence)
        for anchor, amount in extract_currency_relations(parsed):
            row = "{:<10}\t{}\t{}".format(anchor.text, amount.ent_type_, amount.text)
            print(row)


def filter_spans(spans):
    """Return a non-overlapping subset of ``spans``, preferring longer spans.

    For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()

    Args:
        spans: Iterable of objects exposing integer ``start`` and ``end``
            attributes (``end`` is exclusive), e.g. spaCy ``Span`` objects.

    Returns:
        A list of the selected spans, sorted by ``start``. When two spans
        overlap, the longer one wins; for equal lengths the one that starts
        earlier wins.
    """
    # Longest first; ties are broken in favour of the earlier start.
    by_preference = sorted(
        spans,
        key=lambda span: (span.end - span.start, -span.start),
        reverse=True,
    )
    result = []
    seen_tokens = set()
    for span in by_preference:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Reserve tokens only for spans that were actually accepted.
            # Reserving the tokens of a rejected span would wrongly block
            # later spans that overlap nothing in the result.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)


def extract_currency_relations(doc):
    """Pair each MONEY token in ``doc`` with the token it relates to.

    The document is modified in place: its entities and noun chunks are first
    de-overlapped with ``filter_spans`` and merged into single tokens.

    Args:
        doc: A parsed spaCy document.

    Returns:
        A list of ``(related_token, money_token)`` tuples. For a money token
        that is an ``attr`` or ``dobj``, the related token is the first
        ``nsubj`` found to the left of its head (no pair is produced when
        there is none). For a ``pobj`` whose head is a ``prep``, the related
        token is the head of that preposition.
    """
    # Merge entities and noun chunks into one token each.
    candidates = list(doc.ents)
    candidates.extend(doc.noun_chunks)
    with doc.retokenize() as retokenizer:
        for phrase in filter_spans(candidates):
            retokenizer.merge(phrase)

    pairs = []
    for token in doc:
        if token.ent_type_ != "MONEY":
            continue
        if token.dep_ in ("attr", "dobj"):
            subjects = [left for left in token.head.lefts if left.dep_ == "nsubj"]
            if subjects:
                pairs.append((subjects[0], token))
        elif token.dep_ == "pobj" and token.head.dep_ == "prep":
            pairs.append((token.head.head, token))
    return pairs

# Run the demo with the default model name.
main()

Loaded model 'en_core_web_md'
Processing 2 texts
Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b


In [25]:
# Sample protest description that is parsed into `doc` and inspected below.
# NOTE(review): the recorded entity and part-of-speech outputs list a token
# `500`, which does not occur in this string; they appear to come from an
# earlier version of the sentence. Re-run the cells to refresh them.
s = 'On May 30, 2020, hundreds of people in Little Rock, Arkansas participated in a demonstration at the Arkansas State Capitol.'

In [8]:
import spacy


# Name of the spaCy pipeline used for the rest of the notebook.
model="en_core_web_md"

# Load the pipeline once; later cells and protest_count() call this shared `nlp`.
# NOTE(review): the NameError recorded under this cell mentions a name `text`
# that the cell no longer uses, so that output looks stale.
nlp = spacy.load(model)


NameError: name 'text' is not defined

In [26]:
# Parse the sample sentence; the next cells inspect this `doc`.
doc = nlp(s)

In [18]:
# Print each named entity together with its `ent_id`.
# NOTE(review): every entity printed 0 in the recorded output. If the entity
# type (e.g. DATE, GPE) was what you wanted to see, `e.label_` is probably the
# attribute to print instead; confirm.
for e in doc.ents:
    print(e, e.ent_id)

May 30, 2020 0
500 0
Little Rock 0
Arkansas 0
the Arkansas State Capitol 0


In [22]:
# Print every token with its coarse part-of-speech tag.
for t in doc:
    print(t, t.pos_)

On ADP
May PROPN
30 NUM
, PUNCT
2020 NUM
, PUNCT
500 NUM
people NOUN
in ADP
Little PROPN
Rock PROPN
, PUNCT
Arkansas PROPN
participated VERB
in ADP
a DET
demonstration NOUN
at ADP
the DET
Arkansas PROPN
State PROPN
Capitol PROPN
. PUNCT


In [76]:
# Scratch parses of sample sentences. Each assignment overwrites the previous
# one, so only the parse of the last sentence remains in `doc` after this cell.
# NOTE(review): two sentences contain "the the", and "AK" is used next to
# Little Rock (presumably "AR" for Arkansas was meant); confirm before relying
# on these as test inputs.
doc = nlp("500 people marched for justice in Little Rock, AK.")
doc = nlp('On Friday, May 29, 2020, dozens of people demonstrated in front of the the Capitol.')
doc = nlp('On Friday, May 29, 2020, 100,000 people demonstrated in front of the the Capitol.')
doc = nlp('Dozens of individuals protested on the stairs of Alabama State Capitol in Montgomery, Alabama on May 30, 2020.')


def protest_count(text):
    """Find the first noun chunk in ``text`` governed by a protest-related verb.

    Args:
        text: Raw string to parse with the notebook-level ``nlp`` pipeline.

    Returns:
        ``[chunk text, dependency label of the chunk root, verb text]`` for the
        first noun chunk whose root's head is, verbatim, one of the listed
        verbs; ``None`` when no chunk qualifies.
    """
    crowd_verbs = ['protested', 'marched', 'demonstrated', 'gathered', 'attended' , 'participated', 'formed']
    parsed = nlp(text)
    for noun_chunk in parsed.noun_chunks:
        governor = noun_chunk.root.head
        if governor.text not in crowd_verbs:
            continue
        return [noun_chunk.text, noun_chunk.root.dep_, governor.text]
    return None

In [62]:
import pandas as pd 
# Load the protest descriptions from a path relative to the working directory.
# The sample shown below has city, text, state and references columns.
df = pd.read_csv('data/wiki_protest.csv')

In [63]:
# Peek at 10 random rows. No random_state is passed, so the rows differ per run.
df.sample(10)

Unnamed: 0,city,text,state,references
707,Beavercreek,Protests were organized Monday night in some ...,Ohio,['https://www.whio.com/news/crime-and-law/geor...
863,Paris,"On May 30, around three dozen people formed a ...",Texas,['http://theparisnews.com/free/article_3d8aa7b...
540,Ypsilanti,"On May 28, about 100 protesters gathered at t...",Michigan,['https://web.archive.org/web/20200603155948/h...
810,Philadelphia,\nMain article: George Floyd protests in Phil...,Pennsylvania,[]
346,Orlando,"Two separate protests took place on May 30, w...",Florida,['https://www.orlandoweekly.com/Blogs/archives...
697,Morehead City,Peaceful protests have been taking place daily...,North Carolina,['https://www.facebook.com/pages/category/Pers...
618,Oswego,"On May 31, a gathering of several hundred peo...",New York (state),['https://www.oswegonian.com/2020/05/31/oswego...
270,Temecula,"On May 30, 2020, hundreds of protesters came ...",California,['https://patch.com/california/temecula/temecu...
220,Downey,Hundreds marched down Firestone Boulevard bet...,California,['https://www.presstelegram.com/hundreds-gathe...
599,Princeton,"On June 2, thousands protested in downtown Pri...",New Jersey,['https://planetprinceton.com/2020/06/02/photo...


In [65]:
# Show up to 100 characters per cell so the descriptions are readable.
pd.options.display.max_colwidth = 100
# Draw a fresh, unseeded random sample; results are not reproducible across runs.
df_sample = df.sample(10)
# Values are stringified first so protest_count always receives a str
# (presumably to cope with missing texts; confirm). Rows where no listed verb
# governs a noun chunk get None.
df_sample['people'] = df_sample['text'].astype(str).apply(protest_count)
df_sample[['people','text']]

Unnamed: 0,people,text
154,,Roughly 100 UC Davis students held protests in downtown Davis.
573,"[hundreds, nsubj, marched]","On June 6, hundreds of protesters marched from City Hall to the Union County Courthouse. Among t..."
542,"[ A number, nsubj, gathered]",A number of protesters gathered outside the city's post office on May 29.
539,,"On June 2, more than 100 protesters led a peaceful march along Van Dyke Avenue to protest Georg..."
178,"[ Protesters, nsubj, gathered]",Protesters gathered at the intersection of Redwood Road and a driveway leading to an office of ...
941,"[ Hundreds, nsubj, gathered]",Hundreds gathered in Lopez Village.
151,"[70 to 100 protesters, nsubj, gathered]","On Saturday afternoon, May 30, 70 to 100 protesters gathered peacefully at the State Line Post O..."
951,"[Over one thousand people, nsubj, gathered]",Over one thousand people gathered in downtown Appleton on May 30 and 31 for a peaceful protest d...
891,,"On June 3, a protest in downtown Lexington drew hundreds of participants."
406,"[Hundreds, nsubj, gathered]",Hundreds of protesters gathered outside the Naperville Police Department on June 1. The Mayor an...


In [75]:
def protest_count(text):
    """Debugging variant: print every noun chunk of ``text`` with its governor.

    Unlike the filtering version of this function, no verb filter is applied
    and nothing is returned; each noun chunk is printed as
    ``[chunk text, dependency label of the chunk root, text of the root's head]``.

    Args:
        text: Raw string to parse with the notebook-level ``nlp`` pipeline.

    Returns:
        None.
    """
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        print([chunk.text, chunk.root.dep_, chunk.root.head.text])

# Minimal sentence for inspecting the dependency parse. The leading space is
# part of the input and shows up as its own whitespace token in the output.
text = ' hundreds of protesters marched from City Hall .'
#protest_count(text)
doc = nlp(text) 
# Print each token as: text, dependency label, '>', head token.
for t in doc:
    print(t.text, t.dep_,'>',t.head)

   > hundreds
hundreds nsubj > marched
of prep > hundreds
protesters pobj > of
marched ROOT > marched
from prep > marched
City compound > Hall
Hall pobj > from
. punct > marched


In [78]:
# Same sample sentence as in the dependency-inspection cell. The leading space
# is kept, and the recorded result's chunk text also begins with a space.
text = ' hundreds of protesters marched from City Hall .'
def protest_count(text):
    """Return the first noun chunk of ``text`` whose governor is a protest verb.

    Args:
        text: Raw string to parse with the notebook-level ``nlp`` pipeline.

    Returns:
        A three-item list ``[chunk text, dependency label of the chunk root,
        verb text]`` for the first matching noun chunk, or ``None`` if no noun
        chunk's root is headed by one of the listed verb forms (exact,
        case-sensitive match).
    """
    action_words = ['protested', 'marched', 'demonstrated', 'gathered', 'attended' , 'participated', 'formed']
    matches = (
        [phrase.text, phrase.root.dep_, phrase.root.head.text]
        for phrase in nlp(text).noun_chunks
        if phrase.root.head.text in action_words
    )
    # The generator is lazy, so parsing results are scanned only up to the first hit.
    return next(matches, None)

# Sanity check on the sample sentence: shows the matched chunk, its dependency
# label and the governing verb.
protest_count(text)

[' hundreds', 'nsubj', 'marched']