In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
#creation of doc object based on unicode string input
doc = nlp(u'I have flown to La. Now I am flying to San Fransisco')

In [4]:
#shred the discourse = separate into sentences
for sent in doc.sents:
    print([w.text for w in sent if w.dep_ == 'ROOT' or w.dep_ == 'probj'])

['flown']
['flying']


In [5]:
#examining the attributes of the doc's tokens
for token in doc:
    print(token.text, ",", token.tag_, ",", token.pos_, ',', token.dep_, ",", token.head.text)

I , PRP , PRON , nsubj , flown
have , VBP , AUX , aux , flown
flown , VBN , VERB , ROOT , flown
to , IN , ADP , prep , flown
La. , NNP , PROPN , punct , flown
Now , RB , ADV , advmod , flying
I , PRP , PRON , nsubj , flying
am , VBP , AUX , aux , flying
flying , VBG , VERB , ROOT , flying
to , IN , ADP , prep , flying
San , NNP , PROPN , compound , Fransisco
Fransisco , NNP , PROPN , pobj , to


# Try This Ch1:

Create a script which properly inteprets the intent to fly to SanFran in the above doc.

"Start with latest script... enahance conditional clause adding conditions to account for fine-grained part-of-speech tags... on pg21. Then add lemmatization funtionality to script... on page 18."

Output should be ['fly, 'San Fransico']

In [6]:
#attempt 1
for sent in doc.sents:
    for token in sent:
        if (token.dep_ == 'dobj' or token.dep_ == "pobj") and \
        (token.tag_ == 'NNS' or token.tag_ == 'NN' or token.tag_ == 'NNP') or \
        (token.head.tag_ == 'VB' or token.head.tag_ == 'VBG' or token.head.tag_ == 'VBN'):
            print(token.head.lemma_, token.text)

fly I
fly have
fly flown
fly to
fly La.
fly Now
fly I
fly am
fly flying
fly to
to Fransisco


In [7]:
#attempt 2
for sent in doc.sents:
    print([token.lemma_ for token in sent \
           if(token.dep_ == 'ROOT' or token.dep_ == 'pobj') and \
           (token.tag_ == 'NNS' or token.tag_ == 'NN' or token.tag_ == 'NNP' or\
            token.tag_ == 'VB' or token.tag_ == 'VBD' or token.tag_ == 'VBG')])

[]
['fly', 'Fransisco']


**The above was it!**

# Try this #1 Ch2:
Get noun chunks using tokens' syntatctic children instead of doc.noun_chunks. Utilize the .left container attribute.

In [8]:
doc = nlp(u'A noun chunk is a phrase that has a noun at its head.')
doc

A noun chunk is a phrase that has a noun at its head.

In [9]:
for chunk in doc.noun_chunks:
    print(chunk)

A noun chunk
a phrase
that
a noun
its head


In [10]:
#try this
for token in doc:
    noun_chunk = ''
    if token.pos_ == "NOUN":
        for child in token.lefts: #note: cannot access .lefts via indicies
            noun_chunk = noun_chunk + " " + child.text
        noun_chunk = noun_chunk + " " + token.text
        print(noun_chunk)

 noun
 A noun chunk
 a phrase
 a noun
 its head


**Gottem**

# Try this 2 Ch2:
Use a span to create a custom lemma of a multi word name, San Fransisco, to be a single token.

In [11]:
#demonstration of initial tokens
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Fransisco.')
[doc[i] for i in range(len(doc))]

[The, Golden, Gate, Bridge, is, an, iconic, landmark, in, San, Fransisco, .]

In [12]:
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Fransisco.')
span = doc[1:4]
lem_id = doc.vocab.strings[span.text] #doc.vocab.strings = a huge list of strings included in en_core_web_sm
#merge golden gate bridge into 1 token
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
[doc[i] for i in range(len(doc))]

[The, Golden Gate Bridge, is, an, iconic, landmark, in, San, Fransisco, .]

In [13]:
#merge san fransisco into 1 token
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[-3:-1])
[doc[i] for i in range(len(doc))]

[The, Golden Gate Bridge, is, an, iconic, landmark, in, San Fransisco, .]

**Got it**

# Try this 1 Ch4:

Create a script to extract every phrase reffering to an amount of money in a given string. 

In [15]:
doc = nlp(u"The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.")
doc

The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.

In [16]:
for token in doc:
    print(token.i, token.text, token.pos_, token.tag_)

0 The DET DT
1 firm NOUN NN
2 earned VERB VBD
3 $ SYM $
4 1.5 NUM CD
5 million NUM CD
6 in ADP IN
7 2017 NUM CD
8 , PUNCT ,
9 in ADP IN
10 comparison NOUN NN
11 with ADP IN
12 $ SYM $
13 1.2 NUM CD
14 million NUM CD
15 in ADP IN
16 2016 NUM CD
17 . PUNCT .


In [21]:
#get all references to monetary values from text
phrase = ''
for token in doc:
    phrase = ''
    i = token.i
    if token.tag_ == '$':
        phrase = '$'
        i += 1
        while doc[i].tag_ == 'CD':
            phrase += doc[i].text + ' '
            i += 1
    if phrase != '':
        print(phrase)

$1.5 million 
$1.2 million 


**Got it**

# CH 4 trying out the question thinggy 

In [3]:
import spacy
import sys
nlp = spacy.load('en_core_web_sm')

In [14]:
#this method determines what question type a chatbot would need to respond with based on the user input
def find_chunk(doc):
    chunk = ''
    for i, token in enumerate(doc):
        if token.dep_ == "dobj":
            shift = len([w for w in token.children])  #this only takes the leftmost children, not always the case
            chunk = doc[i-shift:i+1]
            break
    return chunk

In [5]:
doc = nlp('I have a strange steam controller.')
find_chunk(doc)

a strange steam controller

In [7]:
doc = nlp('What do you do with that big hammer?')
find_chunk(doc)

What

In [8]:
doc = nlp('Can I confuse you?')
find_chunk(doc)

you

In [12]:
doc = nlp('He ate the pies.')
find_chunk(doc)

the pies

It seems as though some types of sentences are not suitable for this algorithm.


In [15]:
def determine_question_type(chunk):
    question_type = 'yesno'
    for token in chunk:
        if token.dep_ == 'amod':
            question_type = 'info'
            break
    return question_type

In [19]:
doc = nlp('I like beans.')
determine_question_type(find_chunk(doc))

'yesno'

In [21]:
doc = nlp('I like red beans.')
determine_question_type(find_chunk(doc))

'info'

In [None]:
doc = nlp('What do you do with that big hammer?')

# Intention Recognition

Code from the book, ch 8

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
def extract_tv_dobj(doc):
    intent = ''
    for token in doc:
        if token.dep_ == 'dobj':
            intent = token.head.text.lower() + token.text.capitalize()
            return intent
    return 'unknown' 

In [9]:
def test_extract(string):
    doc = nlp(string)
    return extract_tv_dobj(doc)

In [5]:
doc = nlp('I want to go to bed.')
extract_tv_dobj(doc)

'unknown'

In [6]:
for token in doc:
    print(token.text, token.dep_)

I nsubj
want ROOT
to aux
go xcomp
to prep
bed pobj
. punct


In [14]:
test_extract('Show me something cool.')

'showSomething'

In [15]:
test_extract('does this sentence have two verbs?')

'haveVerbs'

In [16]:
test_extract('I want a refund')

'wantRefund'