In [2]:
import spacy
nlp = spacy.load('en_core_web_trf')
def dep_pattern(doc):
  """Return True if doc has an nsubj-aux-ROOT token sequence whose ROOT has a dobj child.

  Every window of three consecutive tokens is inspected; a window matches when
  the tokens' `dep_` labels are 'nsubj', 'aux' and 'ROOT' in that order and at
  least one child of the ROOT token is labelled 'dobj'.

  Args:
    doc: an indexable, sized sequence of tokens exposing `dep_` and `children`
      (for example the result of calling the `nlp` pipeline on a string).

  Returns:
    True as soon as one matching window is found, otherwise False.
  """
  # The window reads doc[i+2], so the last valid start is len(doc)-3. The former
  # bound of len(doc)-1 raised IndexError for a doc ending in nsubj + aux.
  for i in range(len(doc) - 2):
    if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
      if any(tok.dep_ == 'dobj' for tok in doc[i+2].children):
        return True
  return False
# Run the dependency-pattern check on a sample sentence and report the result.
doc = nlp('We can overtake them.')
print('Found' if dep_pattern(doc) else 'Not found')

  from .autonotebook import tqdm as notebook_tqdm


Found


In [5]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_trf')
# Build a token pattern from three consecutive dependency labels and register it.
matcher = Matcher(nlp.vocab)
pattern = [{"DEP": dep_label} for dep_label in ("nsubj", "aux", "ROOT")]
matcher.add("NsubjAuxRoot", [pattern])

doc = nlp("We can overtake them.")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; start/end slice the doc.
for match_id, start, end in matches:
  span = doc[start:end]
  print("Span: ", span.text)
  print("The positions in the doc are: ", start, "-", end)

Span:  We can overtake
The positions in the doc are:  0 - 3


In [6]:
# Multi-pattern check: combine a dependency-label pattern (DEP) with a fine-grained POS-tag pattern
def dep_pattern(doc):
  """Return True if doc has an nsubj-aux-ROOT token sequence whose ROOT has a dobj child.

  Every window of three consecutive tokens is inspected; a window matches when
  the tokens' `dep_` labels are 'nsubj', 'aux' and 'ROOT' in that order and at
  least one child of the ROOT token is labelled 'dobj'.

  Args:
    doc: an indexable, sized sequence of tokens exposing `dep_` and `children`
      (for example the result of calling the `nlp` pipeline on a string).

  Returns:
    True as soon as one matching window is found, otherwise False.
  """
  # The window reads doc[i+2], so the last valid start is len(doc)-3. The former
  # bound of len(doc)-1 raised IndexError for a doc ending in nsubj + aux.
  for i in range(len(doc) - 2):
    if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
      if any(tok.dep_ == 'dobj' for tok in doc[i+2].children):
        return True
  return False
def pos_pattern(doc):
  """Check that tokens in selected dependency roles carry the expected tag.

  For every token whose `dep_` is 'nsubj', 'aux', 'ROOT' or 'dobj', the token's
  `tag_` must equal 'PRP', 'MD', 'VB' or 'PRP' respectively. Tokens with any
  other dependency label are ignored.

  Args:
    doc: an iterable of tokens exposing `dep_` and `tag_`.

  Returns:
    False if any token in one of the listed roles has a different tag,
    otherwise True (including for an empty doc).
  """
  expected_tag = {'nsubj': 'PRP', 'aux': 'MD', 'ROOT': 'VB', 'dobj': 'PRP'}
  return all(
    token.tag_ == expected_tag[token.dep_]
    for token in doc
    if token.dep_ in expected_tag
  )
# Test: the sample sentence must satisfy both the dependency and the tag check.
doc = nlp('We can overtake them.')
print('Found' if dep_pattern(doc) and pos_pattern(doc) else 'Not found')

Found


In [4]:
# For each noun chunk, show its text, its root token, the root's dependency
# label, and the text of the root's head token.
doc = nlp("Self-driving cars shift insurance liability toward manufacturers.")

print(list(doc.noun_chunks))

for chunk in doc.noun_chunks:
    chunk_root = chunk.root
    print(chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)

[Self-driving cars, insurance liability, manufacturers]
Self-driving cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [5]:
# Per-token dependency label, head text and head POS, plus the token's children.
doc = nlp("Myriam saw Clement with a telescope.")
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

Myriam nsubj saw VERB []
saw ROOT saw VERB [Myriam, Clement, with, .]
Clement dobj saw VERB []
with prep saw VERB [telescope]
a det telescope NOUN []
telescope pobj with ADP [a]
. punct saw VERB []


In [9]:
# Find the verbs that head a subject: collect the head of every token labelled nsubj whose head is a VERB

from spacy.symbols import nsubj, VERB 
doc = nlp("Self-driving cars shift insurance liability toward manufacturers.")

# Keep the head of each token labelled nsubj, provided that head is a verb.
# A set is used, so a verb is kept once however many subjects point at it.
verbs = {
    token.head
    for token in doc
    if token.dep == nsubj and token.head.pos == VERB
}

print(verbs)

{shift}


In [10]:
doc = nlp('Self-driving cars shift insurance liability toward manufacturers.')

# Take the first token that is its own head as the root, then the first token
# to the root's left as the subject.
self_headed = [token for token in doc if token.head == token]
root = self_headed[0]
subject = list(root.lefts)[0]

for descendant in subject.subtree:
    # Everything in the subtree is either the subject itself or below it.
    assert subject is descendant or subject.is_ancestor(descendant)
    ancestor_texts = [ancestor.text for ancestor in descendant.ancestors]
    print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights,
          ancestor_texts)

Self npadvmod 0 0 ['driving', 'cars', 'shift']
- punct 0 0 ['driving', 'cars', 'shift']
driving amod 2 0 ['cars', 'shift']
cars nsubj 1 0 ['shift']


In [32]:
# Other inputs tried with this cell:
#   "Measure and compare the lengths and capacities of pairs of objects using uniform informal units"
#   'Explore the different contribution of words and images to meaning in stories and informative texts.'
doc = nlp("Explore differences in words that represent people, places and things (nouns, including pronouns), happenings and states (verbs), qualities (adjectives) and details such as when, where and how (adverbs)")

# The root is taken to be the first token that is its own head.
root = [token for token in doc if token.head == token][0]
print(root.text)

# The first right-hand child of the root is treated as the direct object;
# its dependency label is not checked here.
directObject = list(root.rights)[0]
print(directObject.text)
print(list(directObject.subtree))

print('direct object rights: ', list(directObject.rights))

# For each right-hand child of the direct object, pair its text with every
# 'pobj' child and with each of that child's conjuncts.
context = []
for node in directObject.rights:
    for child in node.children:
        if child.dep_ != 'pobj':
            continue
        context.append(node.text + " " + child.text)
        context.extend(node.text + " " + conjunct.text for conjunct in child.conjuncts)

print("context: ", context)




Explore
differences
[differences, in, words, that, represent, people, ,, places, and, things, (, nouns, ,, including, pronouns, ), ,, happenings, and, states, (, verbs, ), ,, qualities, (, adjectives, ), and, details, such, as, when, ,, where, and, how, (, adverbs, )]
direct object rights:  [in]
context:  ['in words']


In [8]:
text = """Apple Computer today introduced the new MacBook line, which includes the Macbook and Macbook Pro. It is the successor to the iBook line and contains Intel Core Duo processors and a host of features, and starting at a price of $1,099. The Macbook features a 13.3" widescreen display, while the Pro can be purchased with either 15" or 17" displays. It comes in two colors: Black (2 GHz model only) and White (1.83 and 2 GHz models). This release leaves only one PowerPC processor computer that has not made the transition to Intel chips, the PowerMac G5."""

doc = nlp(text)

# Named entities: surface text followed by the entity label.
for entity in doc.ents:
    print(entity.text, entity.label_)

# Noun chunks: chunk text followed by the text of the chunk's root token.
print("\n\n now printing noun chunks...... ")
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, noun_chunk.root.text)

Apple Computer ORG
today DATE
MacBook PRODUCT
Macbook PRODUCT
Macbook Pro PRODUCT
iBook PRODUCT
Intel ORG
1,099 MONEY
Macbook PRODUCT
13.3" QUANTITY
Pro PRODUCT
15" QUANTITY
17" QUANTITY
two CARDINAL
2 GHz QUANTITY
1.83 CARDINAL
2 GHz QUANTITY
only one CARDINAL
PowerPC ORG
Intel ORG
PowerMac G5 PRODUCT


 now printing noun chunks...... 
Apple Computer Computer
the new MacBook line line
which which
the Macbook Macbook
Macbook Pro Pro
It It
the successor successor
the iBook line line
Intel Core Duo processors processors
a host host
features features
a price price
The Macbook Macbook
a 13.3" widescreen display display
the Pro Pro
either 15" or 17" displays displays
It It
two colors colors
Black Black
2 GHz model model
White White
1.83 and 2 GHz models models
This release release
only one PowerPC processor computer computer
that that
the transition transition
Intel chips chips
the PowerMac G5 G5


In [27]:
import spacy
nlp = spacy.load('en_core_web_trf')
text = """Apple Computer today introduced the new MacBook line, which includes the Macbook and Macbook Pro. It is the successor to the iBook line and contains Intel Core Duo processors and a host of features, and starting at a price of $1,099. The Macbook features a 13.3" widescreen display, while the Pro can be purchased with either 15" or 17" displays. It comes in two colors: Black (2 GHz model only) and White (1.83 and 2 GHz models). This release leaves only one PowerPC processor computer that has not made the transition to Intel chips, the PowerMac G5."""

# Run the pipeline over a one-element batch and iterate over the resulting docs.
corpus = nlp.pipe([text])

for doc in corpus:
    for sentence in doc.sents:
        print(sentence.text)
        # One line per token: its index (token.i), its text, and whether it is punctuation.
        for token in sentence:
            print(token.i, " ", token.text, token.is_punct)
            

Apple Computer today introduced the new MacBook line, which includes the Macbook and Macbook Pro.
0   Apple False
1   Computer False
2   today False
3   introduced False
4   the False
5   new False
6   MacBook False
7   line False
8   , True
9   which False
10   includes False
11   the False
12   Macbook False
13   and False
14   Macbook False
15   Pro False
16   . True
It is the successor to the iBook line and contains Intel Core Duo processors and a host of features, and starting at a price of $1,099.
17   It False
18   is False
19   the False
20   successor False
21   to False
22   the False
23   iBook False
24   line False
25   and False
26   contains False
27   Intel False
28   Core False
29   Duo False
30   processors False
31   and False
32   a False
33   host False
34   of False
35   features False
36   , True
37   and False
38   starting False
39   at False
40   a False
41   price False
42   of False
43   $ False
44   1,099 False
45   . True
The Macbook features a 13.3" widesc

In [25]:
from py2neo import Graph
from py2neo import *

# NOTE(review): the connection URI and credentials are hard-coded in the notebook.
# Consider loading them from environment variables or a secrets store, and clear
# them before sharing this file.
graph = Graph("bolt://10.1.48.224:7687", auth=("neo4j", "neo123"))
index = 39
nodes = NodeMatcher(graph)

# Look up the TagOccurrence node whose tok_index_doc equals `index`. The value is
# passed as a Cypher parameter ($index) instead of being concatenated into the
# query text, so it can never be interpreted as part of the query itself.
query = "match (x:TagOccurrence {tok_index_doc: $index}) return x"

token_node = graph.evaluate(query, index=index)

print(token_node)

(_39:TagOccurrence {id: '3_2_212', index: 212, is_stop: true, lemma: 'at', pos: 'IN', text: 'at', tok_index_doc: 39, tok_index_sent: 22})
