In [1]:
!which pip # expect pip to be inside .../Versions/3.x/bin/pip
!echo
!which python
!echo
!which python3 # make sure python3 is a thing
!echo
!spacy # make sure spacy is a thing

/Library/Frameworks/Python.framework/Versions/3.7/bin/pip

/usr/bin/python

/Library/Frameworks/Python.framework/Versions/3.7/bin/python3

/bin/sh: /Library/Frameworks/Python.framework/Versions/3.7/bin/spacy: Permission denied


In [2]:
!pip install spacy



In [3]:
!python3 -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [4]:
import numpy as np
import pandas as pd
import os
import spacy

In [5]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

In [6]:
DEBUG = False
BABEL_API_KEY = "<INSERT_YOUR_API_KEY>" # https://babelnet.org/login

In [7]:
DATA_DIR = "state-of-the-state"
FILENAME = "index.csv"
SPEECH_DIR_NAME = "speeches"
TABLE_PATH = os.path.join(DATA_DIR, FILENAME)
SPEECH_DIR = os.path.join(DATA_DIR, SPEECH_DIR_NAME)
print("DATA_DIR:\t", os.listdir(DATA_DIR))
print("SPEECH_DIR[:2]:\t", os.listdir(SPEECH_DIR)[:2])
print("TABLE_PATH:\t", TABLE_PATH)
print("SPEECH_DIR:\t", SPEECH_DIR)

DATA_DIR:	 ['words.csv', 'speeches', 'README.md', 'index.csv']
SPEECH_DIR[:2]:	 ['Minnesota_SOTS.txt', 'Indiana_SOTS.txt']
TABLE_PATH:	 state-of-the-state/index.csv
SPEECH_DIR:	 state-of-the-state/speeches


In [8]:
# read the csv for info on the data
df = pd.read_csv(TABLE_PATH)
df.head()

Unnamed: 0,state,governor,party,filename,url
0,Alabama,Kay Ivey,R,Alabama_SOTS.txt,https://governor.alabama.gov/remarks-speeches/...
1,Alaska,Mike Dunleavy,R,Alaska_SOTS.txt,https://gov.alaska.gov/newsroom/2019/01/22/201...
2,Arizona,Doug Ducey,R,Arizona_SOTS.txt,https://azgovernor.gov/governor/news/2019/01/g...
3,Arkansas,Asa Hutchinson,R,Arkansas_SOTS.txt,https://governor.arkansas.gov/news-media/speec...
4,California,Gavin Newsom,D,California_SOTS.txt,https://www.gov.ca.gov/2019/02/12/state-of-the...


In [9]:
# dictionary mapping filenames to text content
speech_map = {}

# read the files
for fname in df['filename'].get_values():
    with open( os.path.join(SPEECH_DIR, fname) ) as f:
        speech_map[fname] = f.read()
        f.close()

In [10]:
# https://spacy.io/api/span#set_extension
# extend Span object with is_stop
from spacy.tokens import Span

# if the length is 1 and the word is a stopword
stopword_checker_for_span = lambda span: (len(span) == 1 and type(span[0]).__name__ == "Token" and span[0].is_stop)
Span.set_extension("is_stop", getter=stopword_checker_for_span)

# test ._.is_stop
assert(nlp("I")[:]._.is_stop == True)
assert(nlp("Hello world")[:]._.is_stop == False)

In [11]:
# https://spacy.io/usage/linguistic-features#noun-chunks

speech_noun_map = {}

for state in speech_map:
    doc = nlp(speech_map[state])

    nouns = []
    for chunk in doc.noun_chunks:
        if (chunk._.is_stop): # e.g. if chunk == ["it"]
            if DEBUG: 
                print("stopword?", chunk[0], chunk.text)
            continue

        if (chunk[0].text == "’s" or chunk[0].text == "'s"):
            if len(chunk) == 1:
                continue
            else:
                print("\t...found weird chunk {{", chunk, "}} ", end="")
                chunk = chunk[1:]
                print(f"...fixed... [[ {chunk} ]]")

        nouns.append(chunk.text)

    print(f"got noun chunks for {state}")
    speech_noun_map[state] = nouns

	...found weird chunk {{ ’s roadways }} ...fixed... [[ roadways ]]
	...found weird chunk {{ ’s youngest learners }} ...fixed... [[ youngest learners ]]
	...found weird chunk {{ ’s infrastructure }} ...fixed... [[ infrastructure ]]
got noun chunks for Alabama_SOTS.txt
	...found weird chunk {{ ’s history }} ...fixed... [[ history ]]
	...found weird chunk {{ ’s future }} ...fixed... [[ future ]]
got noun chunks for Alaska_SOTS.txt
	...found weird chunk {{ ’s time }} ...fixed... [[ time ]]
	...found weird chunk {{ ’s resolution }} ...fixed... [[ resolution ]]
got noun chunks for Arizona_SOTS.txt
	...found weird chunk {{ ’s remarks }} ...fixed... [[ remarks ]]
	...found weird chunk {{ ’s desk motto }} ...fixed... [[ desk motto ]]
got noun chunks for Arkansas_SOTS.txt
	...found weird chunk {{ ’s economy }} ...fixed... [[ economy ]]
	...found weird chunk {{ ’s prosperity }} ...fixed... [[ prosperity ]]
	...found weird chunk {{ ’s excellence }} ...fixed... [[ excellence ]]
	...found weird chun

	...found weird chunk {{ ’s commitment }} ...fixed... [[ commitment ]]
	...found weird chunk {{ ’s birds }} ...fixed... [[ birds ]]
	...found weird chunk {{ ’s businesses }} ...fixed... [[ businesses ]]
	...found weird chunk {{ ’s decisions }} ...fixed... [[ decisions ]]
	...found weird chunk {{ ’s institutions }} ...fixed... [[ institutions ]]
	...found weird chunk {{ ’s failure }} ...fixed... [[ failure ]]
got noun chunks for SouthDakota_SOTS.txt
	...found weird chunk {{ ’s constitution }} ...fixed... [[ constitution ]]
	...found weird chunk {{ ’s concern }} ...fixed... [[ concern ]]
	...found weird chunk {{ ’s Capital }} ...fixed... [[ Capital ]]
	...found weird chunk {{ ’s Services }} ...fixed... [[ Services ]]
	...found weird chunk {{ ’s Excellence }} ...fixed... [[ Excellence ]]
got noun chunks for Tennessee_SOTS.txt
got noun chunks for Texas_SOTS.txt
	...found weird chunk {{ ’s challenges }} ...fixed... [[ challenges ]]
	...found weird chunk {{ ’s State }} ...fixed... [[ State ]

In [12]:
print(speech_noun_map['Alabama_SOTS.txt'][:50])

['Lieutenant Governor Ainsworth', 'Pro Tempore Marsh', 'Speaker McCutcheon', 'Speaker Pro Tempore Gaston', 'members', 'the Alabama Legislature', 'Chief Justice Parker', 'justices', 'the Alabama Supreme Court', 'my fellow Alabamians', 'Mother Nature', 'the form', 'our state', 'significant devastation', 'At least 23 innocent lives', 'Young children', 'life', 'Mothers', 'Fathers', 'Friends', 'neighbors', 'times', 'the good Lord', 'His continued comfort', 'healing hands', 'special thanks', 'the emergency responders', 'local law enforcement', 'a moment', 'silence', 'many others', 'uncertainty', 'what tomorrow', 'absolute certainty', 'the resiliency', 'the people', 'Alabama', 'a time', 'our entire nation', 'these good people', 'Lee County', 'its feet', 'our 200 years', 'statehood', 'the men', 'women', 'Alabama', 'our nation', 'our country', 'defending']


In [13]:
def query_babelnet(chunk):
    """
    given a chunk of words, do call to BabelNet API and get results
    """
    pass

def save_babelnet_results(babel_res, filename):
    """
    given BabelNet result, save to file
    """
    try:
        with open( filename, 'a' ) as f:
            f.write(babel_res)
            f.close()
    except:
        with open( filename, 'w' ) as f:
            f.write(babel_res)
            f.close()

In [14]:
save_babelnet_results("lol\n", 'lol.txt')

In [15]:
i=0
for state in speech_noun_map:
    nouns = speech_noun_map[state]
    i+=1
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
