In [1]:
##### uncomment the following lines to check that pip, python3 and spacy are installed and on your PATH
# !which pip # expect pip to be inside .../Versions/3.x/bin/pip
# !echo
# !which python
# !echo
# !which python3 # make sure python3 is a thing
# !echo
# !spacy # make sure spacy is a thing

In [2]:
# !pip install spacy

In [3]:
# !python3 -m spacy download en_core_web_sm

In [4]:
import numpy as np
import pandas as pd
import os
import sys
import spacy
import json
import datetime
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import gzip

# set to True to print stop-word diagnostics during noun-chunk extraction
DEBUG = False

# BabelNet endpoint queried with a lemma to obtain synset ids
SERVICE_URL = 'https://babelnet.io/v5/getSynsetIds'
# local directory where raw BabelNet responses are stored
BABEL_RESPONSE_DIR = "babel_responses"
# Create the response directory when it is missing (e.g. on a fresh checkout);
# otherwise the listing below raises FileNotFoundError before anything runs.
os.makedirs(BABEL_RESPONSE_DIR, exist_ok=True)
print("BABEL_RESPONSE_DIR[:2]:\t", os.listdir(BABEL_RESPONSE_DIR)[:2],"\n")

# location of the speech corpus: an index table plus one text file per speech
DATA_DIR = "state-of-the-state"
FILENAME = "index.csv"
SPEECH_DIR_NAME = "speeches"
TABLE_PATH = os.path.join(DATA_DIR, FILENAME)
SPEECH_DIR = os.path.join(DATA_DIR, SPEECH_DIR_NAME)

# echo the resolved paths and a peek at their contents as a sanity check
print("DATA_DIR:\t", os.listdir(DATA_DIR))
print("SPEECH_DIR[:2]:\t", os.listdir(SPEECH_DIR)[:2])
print("TABLE_PATH:\t", TABLE_PATH)
print("SPEECH_DIR:\t", SPEECH_DIR)
print()
# record when the notebook was run
print(datetime.date.today())
print(datetime.datetime.now())

BABEL_RESPONSE_DIR[:2]:	 ['Alabama Supreme Court_2019-07-21.gzip', '.DS_Store'] 

DATA_DIR:	 ['.DS_Store', 'words.csv', 'speeches', 'README.md', 'index.csv']
SPEECH_DIR[:2]:	 ['Minnesota_SOTS.txt', 'Indiana_SOTS.txt']
TABLE_PATH:	 state-of-the-state/index.csv
SPEECH_DIR:	 state-of-the-state/speeches

2019-07-22
2019-07-22 00:00:49.849451


In [5]:
# Load English tokenizer, tagger, parser, NER and word vectors
# (requires the "en_core_web_sm" model; see the commented-out download cell above).
# The resulting `nlp` pipeline is reused below to parse every speech.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Load the BabelNet API key (register at https://babelnet.org/login).
try:
    # make a file called .api_key with your API_KEY from BabelNet
    with open(".api_key") as f:
        # strip surrounding whitespace: a trailing newline in the file would
        # otherwise be URL-encoded and sent as part of the key
        BABEL_API_KEY = f.read().strip()
except (OSError, UnicodeError):
    # key file missing or unreadable:
    # or just for one time use, insert it here
    BABEL_API_KEY = "<INSERT_YOUR_API_KEY>" # https://babelnet.org/login

In [7]:
# read the csv for info on the data
# One row per speech: state, governor, party, speech filename and source url.
# NOTE: the file's unnamed first column is loaded as a regular column ("Unnamed: 0").
df = pd.read_csv(TABLE_PATH)
df.head()

Unnamed: 0,state,governor,party,filename,url
0,Alabama,Kay Ivey,R,Alabama_SOTS.txt,https://governor.alabama.gov/remarks-speeches/...
1,Alaska,Mike Dunleavy,R,Alaska_SOTS.txt,https://gov.alaska.gov/newsroom/2019/01/22/201...
2,Arizona,Doug Ducey,R,Arizona_SOTS.txt,https://azgovernor.gov/governor/news/2019/01/g...
3,Arkansas,Asa Hutchinson,R,Arkansas_SOTS.txt,https://governor.arkansas.gov/news-media/speec...
4,California,Gavin Newsom,D,California_SOTS.txt,https://www.gov.ca.gov/2019/02/12/state-of-the...


In [8]:
# dictionary mapping filenames to text content
speech_map = {}

# read the files
# Iterate the column directly: Series.get_values() was deprecated in pandas 0.25
# and removed in pandas 1.0, so calling it fails on current pandas versions.
for fname in df['filename']:
    # Explicit encoding: the speeches contain non-ASCII punctuation (e.g. ’) and
    # the platform default codec is not UTF-8 everywhere.
    # NOTE(review): assumes the speech files are UTF-8 encoded - confirm.
    with open(os.path.join(SPEECH_DIR, fname), encoding="utf-8") as f:
        # the with-block closes the file; no explicit close() needed
        speech_map[fname] = f.read()

In [9]:
# https://spacy.io/api/span#set_extension
# extend Span object with is_stop
from spacy.tokens import Span


def stopword_checker_for_span(span):
    """Return True if the span has length 1 and its only token is a stop word."""
    return len(span) == 1 and type(span[0]).__name__ == "Token" and span[0].is_stop


# force=True keeps this cell re-runnable: without it spaCy raises a ValueError
# on the second run because the "is_stop" extension is already registered.
Span.set_extension("is_stop", getter=stopword_checker_for_span, force=True)

# test ._.is_stop
assert nlp("I")[:]._.is_stop
assert not nlp("Hello world")[:]._.is_stop

In [10]:
# https://spacy.io/usage/linguistic-features#noun-chunks

# maps each speech filename to the list of noun-chunk strings found in that speech
speech_noun_map = {}

# NOTE: despite its name, `state` is the speech filename (the key of speech_map)
for state, speech_text in speech_map.items():
    doc = nlp(speech_text)

    nouns = []
    for chunk in doc.noun_chunks:
        if chunk._.is_stop: # e.g. if chunk == ["it"]
            if DEBUG:
                print("stopword?", chunk[0], chunk.text)
            continue

        if chunk[0].is_stop:
            # Multi-token chunk that starts with a stop word: drop that single
            # leading token. A one-token stop-word chunk never reaches this
            # point because the ._.is_stop check above already skipped it.
            if DEBUG:
                print("\t...found weird chunk {{", chunk, "}} ", end="")
            chunk = chunk[1:]
            if DEBUG:
                print(f"...fixed... [[ {chunk} ]]")

        nouns.append(chunk.text)

    # overwrite the same console line with the latest progress message
    sys.stdout.write( f"got noun chunks for {state}          \r" )
    sys.stdout.flush()
    speech_noun_map[state] = nouns
print()

got noun chunks for Wyoming_SOTS.txt                


In [11]:
# sanity check: show the first 50 noun chunks extracted from the Alabama speech
print(speech_noun_map['Alabama_SOTS.txt'][:50])

['Lieutenant Governor Ainsworth', 'Pro Tempore Marsh', 'Speaker McCutcheon', 'Speaker Pro Tempore Gaston', 'members', 'Alabama Legislature', 'Chief Justice Parker', 'justices', 'Alabama Supreme Court', 'fellow Alabamians', 'Mother Nature', '’s', 'form', 'state', 'significant devastation', 'least 23 innocent lives', 'Young children', 'life', 'Mothers', 'Fathers', 'Friends', 'neighbors', 'times', 'good Lord', 'continued comfort', 'healing hands', 'special thanks', 'emergency responders', 'local law enforcement', 'moment', 'silence', 'others', 'uncertainty', 'tomorrow', 'absolute certainty', 'resiliency', 'people', 'Alabama', 'time', 'entire nation', 'good people', 'Lee County', 'feet', '200 years', 'statehood', 'men', 'women', 'Alabama', 'nation', 'country']


In [12]:
def query_babelnet(chunk):
    """
    given a chunk of words, do call to BabelNet API and get results

    The raw gzip-compressed response body is also handed to
    save_babelnet_bytes_results under the name
    "<sanitized chunk>_<today's date>.gzip".

    Parameters
    ----------
    chunk : str
        the lemma to look up (searched as English).

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        if the response is not gzip-encoded.
    Errors raised by urlopen (network / HTTP failures) propagate unchanged.
    """
    params = {
        'lemma' : chunk,
        'searchLang' : 'EN',
        'key'  : BABEL_API_KEY
    }

    url = SERVICE_URL + '?' + urlencode(params) # format the request url
    request = Request(url)
    request.add_header('Accept-encoding', 'gzip') # ask for gzip response

    # the context manager closes the HTTP response even when reading fails
    with urlopen(request) as response:
        if response.info().get('Content-Encoding') != 'gzip':
            raise Exception("unexpected response from BabelNet API")
        buf = response.read()

    # Build a filesystem-safe file name: a chunk containing a path separator
    # or other special characters would otherwise point outside (or at a
    # non-existent subdirectory of) the response directory. Letters, digits,
    # spaces, '.', '_' and '-' are kept, so ordinary chunks map to the same
    # file name as before.
    safe_chunk = "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in chunk)
    fname = safe_chunk + "_" + str(datetime.date.today()) + ".gzip"
    save_babelnet_bytes_results(buf, fname)
    return json.loads(gzip.decompress(buf))

def save_babelnet_bytes_results(babel_res, filename):
    """
    given BabelNet result, save to file

    Parameters
    ----------
    babel_res : bytes
        raw response body to store.
    filename : str
        file name (not a path) inside BABEL_RESPONSE_DIR.

    The file is overwritten rather than appended to: appending a second
    response to the same file produces content that no longer decodes to a
    single JSON document when read back.
    """
    # make sure the target directory exists so the first save cannot fail
    os.makedirs(BABEL_RESPONSE_DIR, exist_ok=True)
    filename = os.path.join(BABEL_RESPONSE_DIR, filename)
    # 'wb' creates the file when missing; the with-block closes it
    with open(filename, 'wb') as f:
        f.write(babel_res)

In [13]:
# placeholder for the BabelNet result (reassigned in the next cell)
babel_data = {}
# pick one noun chunk from the Alabama speech to use as a test query
chunk = speech_noun_map['Alabama_SOTS.txt'][8]
print(chunk)

Alabama Supreme Court


In [14]:
# Query BabelNet for the selected chunk and keep the decoded JSON response.
# NOTE: this performs a live HTTP request through query_babelnet, which sends BABEL_API_KEY.
json_response = query_babelnet(chunk)
babel_data = json_response
print(babel_data)

[{'id': 'bn:14875491n', 'pos': 'NOUN', 'source': 'BABELNET'}]
