This notebook assumes Python version 3.

## Import several Python packages to check availability

In [1]:
import numpy
import matplotlib
import scipy
import bokeh
import pandas

In [2]:
# These were added explicitly to environment.yml for binder.
import autocorrect
import plotly
import nltk

## Download some data to use with NLTK

**Note:**  The nltk.download() function with no argument launches a dialog to choose data such as corpora, but you can also provide an argument to specify what to download. See: https://stackoverflow.com/questions/5843817/programmatically-install-nltk-corpora-models-i-e-without-the-gui-downloader

In [3]:
# Fetch the WordNet and VerbNet corpora used by the cells below.
# Passing a package name avoids the interactive chooser dialog.
nltk.download('wordnet')
nltk.download('verbnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/peckhams/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package verbnet to
[nltk_data]     /Users/peckhams/nltk_data...
[nltk_data]   Package verbnet is already up-to-date!


True

## Run some tests with WordNet from NLTK

In [4]:
from nltk.corpus import wordnet as wn

# Print the name of every WordNet synset for a few sample words.
# A line holding a single space separates one word's synsets from the next.
sample_words = ('boil', 'jump', 'flow', 'gyrate')
for position, word in enumerate(sample_words):
    if position > 0:
        print(' ')
    sets = wn.synsets(word)
    for s in sets:
        print( s.name() )

boil.n.01
boiling_point.n.01
boil.v.01
boil.v.02
boil.v.03
churn.v.02
seethe.v.02
 
jump.n.01
leap.n.02
jump.n.03
startle.n.01
jump.n.05
jump.n.06
jump.v.01
startle.v.02
jump.v.03
jump.v.04
leap_out.v.01
jump.v.06
rise.v.11
jump.v.08
derail.v.02
chute.v.01
jump.v.11
jumpstart.v.01
jump.v.13
leap.v.02
alternate.v.01
 
flow.n.01
flow.n.02
flow.n.03
flow.n.04
stream.n.04
stream.n.02
menstruation.n.01
flow.v.01
run.v.06
flow.v.03
flow.v.04
hang.v.05
flow.v.06
menstruate.v.01
 
gyrate.v.01
spin.v.01


## Run some tests with VerbNet from NLTK

In [5]:
from nltk.corpus import verbnet as vn
# Preview the first 25 lemmas in the VerbNet corpus.
vn.lemmas()[0:25]

# help(vn)

[u'December',
 u'FedEx',
 u'UPS',
 u'abandon',
 u'abase',
 u'abash',
 u'abate',
 u'abbreviate',
 u'abduct',
 u'abet',
 u'abhor',
 u'abolish',
 u'abound',
 u'abrade',
 u'abridge',
 u'absolve',
 u'abstain',
 u'abstract',
 u'abuse',
 u'abut',
 u'accelerate',
 u'accept',
 u'acclaim',
 u'accompany',
 u'accrue']

## Experiment for generating nominalizations of verbs

GSN process names are nominalizations of verbs.  One idea for how to automatically generate a list of these process names from a list of verbs (e.g. from VerbNet) is to concatenate some of the 11 or so possible verb nominalization endings directly to the verb, then apply autocorrect and check if the result is a noun.  Standard endings are:
**tion** (absorption, convection),
**sion** (conversion, dispersion),
**cion** (suspicion, coercion),
**ing** (swimming, upwelling),
**age** (drainage, seepage),
__y__ (discovery, recovery),
**al** (arrival, retrieval),
**ance** (acceptance, attendance),
**ence** (existence, maintenance),
**ment** (alignment, improvement),
**ure** (failure, departure).

In [6]:
from autocorrect import spell

# Try three candidate nominalizations of "propose" through the spell-corrector.
[spell(candidate) for candidate in ('proposetion', 'proposeal', 'proposeence')]

['proposition', 'proposal', 'proposeence']

Notice that **spell** returns its input unchanged when it finds no correction, so the result is not guaranteed to be a real word (see the last one).

In [7]:
# Try three candidate nominalizations of "distract" through the spell-corrector.
[spell(candidate) for candidate in ('distracttion', 'distractal', 'distractence')]

['distraction', u'distracted', 'distractence']

In [8]:
# Try several candidate nominalizations of "fail" through the spell-corrector.
[spell(candidate) for candidate in ('failment', 'failence', 'faily', 'failure')]

['wailment', 'faience', 'family', 'failure']

## Get root verb of a nominalized verb using a PPattach algorithm

In [18]:
from nltk.corpus import wordnet as wn
from nltk.corpus import verbnet as vn

def get_root_verb( noun ):
    """
    Print, and return, the verbs that WordNet derives a noun from.

    Every noun sense of ``noun`` is scanned.  For each lemma whose name
    equals ``noun`` exactly, the derivationally related forms that belong
    to a verb synset are collected.  The unique verbs are printed one per
    line in sorted order; when none are found a single "Sorry" line is
    printed instead.

    Parameters
    ----------
    noun : str
        The nominalized verb to look up, e.g. 'abatement'.

    Returns
    -------
    list of str
        The sorted, de-duplicated verbs (empty when nothing was found).
    """
    verbs = set()
    for sense in wn.synsets(noun, pos=wn.NOUN):
        for lemma in sense.lemmas():
            # Only follow the lemma spelled exactly like the query word;
            # the other lemmas of the sense are skipped.
            if str(lemma.name()) != noun:
                continue
            for form in lemma.derivationally_related_forms():
                # form is a Lemma; keep it only if its synset is a verb.
                if form.synset().pos() == wn.VERB:
                    verbs.add( form.name() )

    verbs = sorted( verbs )
    if not verbs:
        print('### Sorry, no root verb found for: ' + noun)
    for verb in verbs:
        print( verb )
    return verbs


In [34]:
# Nominalized process names to test, in the order they are reported.
processes = [
    'abatement', 'abseiling', 'acidification', 'accrual',
    'canyoning', 'caving', 'clearance', 'closure',
    'decision', 'diagnosis', 'distribution', 'drainage',
    'electrification', 'evaporation',
    'failure', 'formation',
    'hypnosis',
    'impact', 'infiltration', 'interrogation',
    'maintenance', 'macgyvering', 'melting',
    'picnicking', 'proposal', 'proposition',
    'rafting', 'rebellion', 'reconciliation', 'retrieval',
    'segmentation', 'spelunking', 'swimming',
]

# Report the root verb(s) of each process name.
for noun in processes:
    get_root_verb( noun )

abate
#### Sorry, no root verb found for: abseiling
acidify
accrue
#### Sorry, no root verb found for: canyoning
#### Sorry, no root verb found for: caving
clear
close
closure
decide
diagnose
distribute
drain
electrify
evaporate
fail
form
hypnotise
hypnotize
impact
infiltrate
interrogate
maintain
#### Sorry, no root verb found for: macgyvering
melt
#### Sorry, no root verb found for: picnicking
propose
propose
proposition
#### Sorry, no root verb found for: rafting
rebel
reconcile
retrieve
segment
#### Sorry, no root verb found for: spelunking
swim


## Get noun forms (nominalizations) of a verb

In [44]:
def get_noun_form( verb ):
    """
    Print, and return, the noun forms (nominalizations) WordNet lists for a verb.

    Every verb sense of ``verb`` is scanned.  For each lemma whose name
    equals ``verb`` exactly, the derivationally related forms that belong
    to a noun synset are collected, except for:

    * nouns ending in 'or' or 'er' (a heuristic meant to drop "people"
      nouns; it is purely suffix-based, so it also drops any other noun
      with those endings), and
    * nouns spelled the same as the verb itself.

    The unique nouns are printed one per line in sorted order; when none
    remain a single "Sorry" line is printed instead.

    Parameters
    ----------
    verb : str
        The verb to look up, e.g. 'abate'.

    Returns
    -------
    list of str
        The sorted, de-duplicated nouns (empty when nothing was found).
    """
    nouns = set()
    for sense in wn.synsets(verb, pos=wn.VERB):
        for lemma in sense.lemmas():
            # Only follow the lemma spelled exactly like the query word.
            if str(lemma.name()) != verb:
                continue
            for form in lemma.derivationally_related_forms():
                # form is a Lemma; only noun synsets are of interest.
                if form.synset().pos() != wn.NOUN:
                    continue
                noun = form.name()
                # Remove "people" nouns (suffix heuristic, see docstring).
                if noun.endswith(('or', 'er')):
                    continue
                # Don't allow noun == verb.
                if noun == verb:
                    continue
                nouns.add( noun )

    nouns = sorted( nouns )
    if not nouns:
        print('### Sorry, no noun form found for: ' + verb)
    for noun in nouns:
        print( noun )
    return nouns

In [45]:
# Verbs to test, in the order they are reported.
verbs = [
    'abate', 'abseil', 'acidify', 'accrue',
    'canyon', 'cave', 'clear', 'close',
    'decide', 'diagnose', 'distribute', 'drain',
    'electrify', 'evaporate',
    'fail', 'form',
    'hypnotize',
    'impact', 'infiltrate', 'interrogate',
    'maintain', 'macgyver', 'melt',
    'picnic', 'propose', 'proposition',
    'raft', 'rebel', 'reconcile', 'retrieve',
    'segment', 'spelunk', 'swim',
]

# Report the noun form(s) of each verb.
for verb in verbs:
    get_noun_form( verb )


abatement
### Sorry, no noun form found for: abseil
acid
acidification
accrual
accruement
### Sorry, no noun form found for: canyon
### Sorry, no noun form found for: cave
clearance
clearing
closing
closure
deciding
decision
diagnosing
diagnosis
distribution
drainage
electricity
electrification
evaporation
failing
failure
formation
hypnosis
impaction
infiltration
interrogation
maintenance
### Sorry, no noun form found for: macgyver
melting
### Sorry, no noun form found for: picnic
proposal
proposition
### Sorry, no noun form found for: proposition
### Sorry, no noun form found for: raft
rebellion
reconciliation
retrieval
segmentation
### Sorry, no noun form found for: spelunk
swimming


## Download a Wikipedia page and parse it with Beautiful Soup

In [11]:
import requests

def is_downloadable(url, timeout=10):
    """
    Does the url contain a downloadable (non-text) resource?

    A HEAD request that follows redirects is sent and the Content-Type
    header of the response is inspected.  Any content type containing
    'text' or 'html' (case-insensitive) is treated as a web page rather
    than a download.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float or None, optional
        Seconds to wait for the server, passed to ``requests.head``.
        ``None`` waits indefinitely (the previous behavior).

    Returns
    -------
    bool
        False when the content type mentions 'text' or 'html'; True
        otherwise, including when the server sends no Content-Type.
    """
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    # The header may be absent; fall back to '' so .lower() cannot be
    # called on None.
    content_type = (response.headers.get('content-type') or '').lower()
    return not ('text' in content_type or 'html' in content_type)

In [12]:
url = 'https://en.wikipedia.org/wiki/Reflectance'
print(is_downloadable(url))

# Bound the wait so an unresponsive server cannot hang the notebook, and
# stop here on an HTTP error instead of parsing an error page further down.
r = requests.get(url, allow_redirects=True, timeout=10)
r.raise_for_status()
# open('Reflectance.bin', 'wb').write(r.content)
# r.content[5000:6000]  # (as bytes)
# r.text[5000:6000]
# type(r.text)
# type(r.json)


False


In [13]:
from bs4 import BeautifulSoup
# Parse the text of the downloaded response; `soup` is reused by the
# cells that follow.
soup = BeautifulSoup(r.text, 'lxml')          # (HTML parser)
# soup = BeautifulSoup(r.text, 'lxml-xml')    # (XML parser)

In [14]:
# Collect every <p> element of the page, report how many were found,
# and display the first three.
pars = soup.find_all('p')
# type(pars)
print( len(pars) )
pars[0:3]

21


[<p><b>Reflectance</b> of the surface of a material is its effectiveness in reflecting <a href="/wiki/Radiant_energy" title="Radiant energy">radiant energy</a>. It is the fraction of incident electromagnetic power that is reflected at an interface. The reflectance spectrum or spectral reflectance curve is the plot of the reflectance as a function of <a href="/wiki/Wavelength" title="Wavelength">wavelength</a>.\n</p>,
 <p>The <i>hemispherical reflectance</i> of a surface, denoted <i>R</i>, is defined as<sup class="reference" id="cite_ref-ISO_9288-1989_1-0"><a href="#cite_note-ISO_9288-1989-1">[1]</a></sup>\n</p>,
 <p>where\n</p>]

In [15]:
links = soup.find_all('a')
n = len(links)
# print( n )

# Kept only so that any later cell relying on `np` still finds it; the
# link list itself is a plain Python list, because a fixed-width
# 'U100' array silently truncates hrefs longer than 100 characters.
import numpy as np

# hrefs with these prefixes/suffixes are not wanted (site plumbing,
# citations, external links and image files).
skip_prefixes = ('/w/','#cite_','//','https','/wiki/Wikipedia:')
skip_suffixes = ('.png','.jpg','.svg')

# Extract all the URLs on the page and save the ones we want.
alist = []
for link in links:
    href = link.get('href')
    if href is None:
        # Anchor without an href attribute: nothing to record.
        continue
    s = str(href)
    if s.startswith(skip_prefixes) or s.endswith(skip_suffixes):
        continue
    if s.startswith('/wiki/'):
        s = s[6:]      # drop the leading '/wiki/'
    elif s.startswith('File:'):
        s = s[5:]      # drop the leading 'File:'
    alist.append(s)

# Print just the unique, sorted "link strings"
blist = sorted(set( alist ))
for item in blist:
    print( item )



#Applications
#Directional_reflectance
#External_links
#Grating_efficiency
#Hemispherical_reflectance
#Mathematical_definitions
#References
#Reflectivity
#SI_radiometry_units
#See_also
#Spectral_directional_reflectance
#Spectral_hemispherical_reflectance
#Surface_type
#Water_reflectance
#mw-head
#p-search
Absorbance
Absorptance
Albedo
Aluminium
Attenuation_coefficient
Bidirectional_reflectance_distribution_function
Category:All_articles_with_unsourced_statements
Category:Articles_with_unsourced_statements_from_May_2015
Category:Dimensionless_numbers
Category:Physical_quantities
Category:Radiometry
Compendium_of_Chemical_Terminology
Complex_number
Diffraction_efficiency
Diffraction_grating
Diffuse_reflection
Electric_field
Emissivity
Flux_density
Frequency
Fresnel_equation
Fresnel_equations
Fresnel_power_reflection
Fresnel_reflection_coefficient
Gold
Half-space_(geometry)
Help:Category
Help:Contents
Hertz
Index_of_refraction
International_Commission_on_Illumination
International_Organi

In [16]:
# print(soup.prettify())
# soup.title()
# print(soup.get_text())

## Import Indra and run a test with the Eidos reader

In [47]:
import indra
from indra.sources import eidos
# Two sample sentences; only sentence1 is passed to the reader below.
sentence1 = "Conflict causes displacement, which leads to hunger."
sentence2 = "Water trucking has decreased due to the cost of fuel."
# Run the Eidos reader on the sentence; `ep` is reused by the next cell,
# and its statements are displayed as this cell's output.
ep = eidos.process_text( sentence1 )
ep.statements

ImportError: No module named indra

In [49]:
from indra.assemblers import CAGAssembler
from IPython.core.display import Javascript

# Build a model from the statements produced by the previous cell and
# display the JavaScript it generates for Jupyter.
ca = CAGAssembler( ep.statements )
ca.make_model()
Javascript( ca.generate_jupyter_js() )

ImportError: No module named indra.assemblers