In [1]:
import spacy
# Load the `en_core_web_trf` spaCy pipeline; later cells call this `nlp` object to parse text.
nlp = spacy.load('en_core_web_trf')
def dep_pattern(doc):
  """Check for an adjacent ``nsubj aux ROOT`` sequence whose ROOT has a ``dobj`` child.

  Args:
    doc: An indexable, sized sequence of tokens (e.g. a spaCy ``Doc``). Each
      token must expose ``dep_``; the ROOT token must also expose ``children``.

  Returns:
    True if three consecutive tokens carry the dependency labels ``nsubj``,
    ``aux`` and ``ROOT`` (in that order) and at least one child of that ROOT
    token is labelled ``dobj``; False otherwise, including for inputs shorter
    than three tokens.
  """
  # Each window reads doc[i], doc[i+1] and doc[i+2], so the last valid start
  # index is len(doc) - 3. Going one further indexes past the end of the doc.
  for i in range(len(doc) - 2):
    if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
      # The window matches; it only counts if the ROOT has a direct object.
      if any(child.dep_ == 'dobj' for child in doc[i+2].children):
        return True
  return False
# Parse a sample sentence and report whether the dependency pattern occurs in it.
doc = nlp('We can overtake them.')
print('Found' if dep_pattern(doc) else 'Not found')

  from .autonotebook import tqdm as notebook_tqdm


Found


In [5]:
import spacy
from spacy.matcher import Matcher
# Re-loads the same pipeline name that the first cell loads.
nlp = spacy.load('en_core_web_trf')

# One dict per token: three consecutive tokens labelled nsubj, aux and ROOT.
pattern = [{"DEP": label} for label in ("nsubj", "aux", "ROOT")]
matcher = Matcher(nlp.vocab)
matcher.add("NsubjAuxRoot", [pattern])

doc = nlp("We can overtake them.")
matches = matcher(doc)
# Each match is (match_id, start, end); start/end are used to slice the doc.
for match_id, start, end in matches:
  span = doc[start:end]
  print("Span: ", span.text)
  print("The positions in the doc are: ", start, "-", end)

Span:  We can overtake
The positions in the doc are:  0 - 3


In [6]:
# Multi-pattern check: a sentence must match both the dependency (DEP) pattern and the fine-grained POS tag pattern.
def dep_pattern(doc):
  """Check for an adjacent ``nsubj aux ROOT`` sequence whose ROOT has a ``dobj`` child.

  Args:
    doc: An indexable, sized sequence of tokens (e.g. a spaCy ``Doc``). Each
      token must expose ``dep_``; the ROOT token must also expose ``children``.

  Returns:
    True if three consecutive tokens carry the dependency labels ``nsubj``,
    ``aux`` and ``ROOT`` (in that order) and at least one child of that ROOT
    token is labelled ``dobj``; False otherwise, including for inputs shorter
    than three tokens.
  """
  # Each window reads doc[i], doc[i+1] and doc[i+2], so the last valid start
  # index is len(doc) - 3. Going one further indexes past the end of the doc.
  for i in range(len(doc) - 2):
    if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
      # The window matches; it only counts if the ROOT has a direct object.
      if any(child.dep_ == 'dobj' for child in doc[i+2].children):
        return True
  return False
def pos_pattern(doc):
  """Check that tokens in selected dependency roles carry the expected tag.

  Every token whose ``dep_`` is ``nsubj``, ``aux``, ``ROOT`` or ``dobj`` must
  have ``tag_`` equal to ``PRP``, ``MD``, ``VB`` or ``PRP`` respectively.
  Tokens with any other dependency label are not checked.

  Args:
    doc: An iterable of tokens exposing ``dep_`` and ``tag_``.

  Returns:
    False as soon as one checked token has a different tag; True otherwise
    (including when no token has one of the four checked labels).
  """
  # Dependency label -> the only fine-grained tag accepted for that role.
  required_tag = {'nsubj': 'PRP', 'aux': 'MD', 'ROOT': 'VB', 'dobj': 'PRP'}
  for token in doc:
    wanted = required_tag.get(token.dep_)
    if wanted is not None and token.tag_ != wanted:
      return False
  return True
# Test: the sample sentence must pass both the dependency check and the tag check.
doc = nlp('We can overtake them.')
print('Found' if (dep_pattern(doc) and pos_pattern(doc)) else 'Not found')

Found


In [4]:
# Noun chunks: print them all, then per chunk its text, root token,
# the root's dependency label and the text of the root's head.
doc = nlp("Self-driving cars shift insurance liability toward manufacturers.")

print([chunk for chunk in doc.noun_chunks])

for chunk in doc.noun_chunks:
    chunk_root = chunk.root
    print(chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)

[Self-driving cars, insurance liability, manufacturers]
Self-driving cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [5]:
# Per token: text, dependency label, head text, head POS, and the token's direct children.
doc = nlp("Myriam saw Clement with a telescope.")
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

Myriam nsubj saw VERB []
saw ROOT saw VERB [Myriam, Clement, with, .]
Clement dobj saw VERB []
with prep saw VERB [telescope]
a det telescope NOUN []
telescope pobj with ADP [a]
. punct saw VERB []


In [9]:
# Find the head verb(s) of a sentence: collect the head of every nsubj token whose head is a VERB.

from spacy.symbols import nsubj, VERB 
doc = nlp("Self-driving cars shift insurance liability toward manufacturers.")
# Keep the head of each nominal-subject token, but only when that head is a verb.
# The comparison uses the integer symbols imported from spacy.symbols.
verbs = {
    token.head
    for token in doc
    if token.dep == nsubj and token.head.pos == VERB
}

print(verbs)

{shift}


In [10]:
doc = nlp('Self-driving cars shift insurance liability toward manufacturers.')
# A token that is its own head is a dependency root; take the first one found.
self_headed = [token for token in doc if token.head == token]
root = self_headed[0]
# Use the first of the root's left children as the subject.
left_children = list(root.lefts)
subject = left_children[0]
for descendant in subject.subtree:
    # Every subtree member must be the subject itself or sit below it in the tree.
    assert subject is descendant or subject.is_ancestor(descendant)
    ancestor_texts = [ancestor.text for ancestor in descendant.ancestors]
    print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights,
          ancestor_texts)

Self npadvmod 0 0 ['driving', 'cars', 'shift']
- punct 0 0 ['driving', 'cars', 'shift']
driving amod 2 0 ['cars', 'shift']
cars nsubj 1 0 ['shift']


In [32]:
# Exploratory cell: starting from the root's first right-hand child, collect
# "<right child> <pobj>" phrases (e.g. "in words") into `context`.
# Alternative input sentences are kept commented out; only one `doc` is active.
#doc = nlp("Measure and compare the lengths and capacities of pairs of objects using uniform informal units")
doc = nlp("Explore differences in words that represent people, places and things (nouns, including pronouns), happenings and states (verbs), qualities (adjectives) and details such as when, where and how (adverbs)")
#doc = nlp('Explore the different contribution of words and images to meaning in stories and informative texts.')
# A token that is its own head is a dependency root; take the first one found.
root = [token for token in doc if token.head == token][0] 

print(root.text)

# NOTE(review): this is simply the first right-hand child of the root; its
# dep_ is never checked, so it is only assumed to be the direct object.
# Confirm for other sentences. Raises IndexError if the root has no right children.
directObject = list(root.rights)[0]

print(directObject.text)

# Show every token in the subtree of that child.
print(list(directObject.subtree))

# Disabled debugging aid: would print the right children of each subtree token.
#for descendant in directObject.subtree:
 #   print(list(descendant.rights))

# Accumulates strings of the form "<node text> <object text>".
context = []

print('direct object rights: ', list(directObject.rights))

# For each right-hand child of directObject, pair it with each of its own
# children labelled pobj.
for node in directObject.rights:
    children = node.children
    for child in children:
        if child.dep_ == 'pobj':
            context.append(node.text +" " + child.text)
            # Also pair `node` with every conjunct of the pobj (presumably the
            # tokens coordinated with it; see the spaCy Token.conjuncts docs).
            # In the recorded run only 'in words' was collected, so no
            # conjuncts were found for that sentence.
            if child.conjuncts:
                for conjunct in child.conjuncts:
                  context.append(node.text +" " + conjunct.text)  

print("context: ", context)




Explore
differences
[differences, in, words, that, represent, people, ,, places, and, things, (, nouns, ,, including, pronouns, ), ,, happenings, and, states, (, verbs, ), ,, qualities, (, adjectives, ), and, details, such, as, when, ,, where, and, how, (, adverbs, )]
direct object rights:  [in]
context:  ['in words']
