# Sentence Pattern

In [1]:
import spacy

# Load the English pipeline once; the later cells reuse `nlp`.
nlp = spacy.load('en_core_web_lg')

doc1 = nlp(u'We can overtake them.')
doc2 = nlp(u'You must specify it.')

# Walk the two sentences in parallel, leaving out doc1's final token, and
# report every position where the dependency label and/or the coarse
# part-of-speech tag agree.
# zip() stops at the shorter sequence, so a doc2 with fewer tokens than doc1
# no longer raises IndexError the way positional indexing (doc2[i]) did.
for tok1, tok2 in zip(list(doc1)[:-1], doc2):
    if tok1.dep_ == tok2.dep_:
        print(tok1.text, tok2.text, tok1.dep_, spacy.explain(tok1.dep_))
    if tok1.pos_ == tok2.pos_:
        print(tok1.text, tok2.text, tok1.pos_, spacy.explain(tok1.pos_))

We You nsubj nominal subject
We You PRON pronoun
can must aux auxiliary
can must AUX auxiliary
overtake specify ROOT root
overtake specify VERB verb
them it dobj direct object
them it PRON pronoun


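We may check whether a sentence follows a specific dependency pattern — here a subject, an auxiliary and the root verb in sequence, with the root verb taking a direct object: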

In [2]:
def dep_pattern(doc):
    """Check for a subject + auxiliary + root sequence whose root has a direct object.

    Args:
        doc: A token sequence (e.g. a spaCy ``Doc`` or ``Span``) supporting
            ``len()`` and integer indexing; each token exposes ``dep_`` and
            ``children``.

    Returns:
        True if three consecutive tokens are labelled ``nsubj``, ``aux`` and
        ``ROOT`` and that ``ROOT`` token has a child labelled ``dobj``;
        False otherwise (including for inputs shorter than three tokens).
    """
    # A three-token window starting at i needs i + 2 to be a valid index, so
    # stop two tokens before the end. Stopping only one token before the end
    # raised IndexError when the input ended with "nsubj aux".
    for i in range(len(doc) - 2):
        if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
            if any(tok.dep_ == 'dobj' for tok in doc[i+2].children):
                return True
    return False

# Report whether doc1 follows the dependency pattern.
print('Found' if dep_pattern(doc1) else 'Not found')

Found


Alternatively, we may use spaCy's *Matcher* to check the subject–auxiliary–root part of the pattern:

In [4]:
from spacy.matcher import Matcher

# Build a matcher on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# A single pattern describing three consecutive tokens by dependency label.
pattern = [[
    {"DEP": "nsubj"},
    {"DEP": "aux"},
    {"DEP": "ROOT"},
]]
matcher.add("NsubjAuxRoot", pattern)

matches = matcher(doc1)

# Each match is a (match_id, start, end) triple; start/end slice doc1.
for match_id, start, end in matches:
    print("Span: ", doc1[start:end].text)
    print("The positions in the doc are: ", start, "-", end)

Span:  We can overtake
The positions in the doc are:  0 - 3


## Generating Text

In [7]:
def dep_pattern(doc):
    """Check for a subject + auxiliary + root sequence whose root has a direct object.

    Args:
        doc: A token sequence (e.g. a spaCy ``Doc`` or ``Span``) supporting
            ``len()`` and integer indexing; each token exposes ``dep_`` and
            ``children``.

    Returns:
        True if three consecutive tokens are labelled ``nsubj``, ``aux`` and
        ``ROOT`` and that ``ROOT`` token has a child labelled ``dobj``;
        False otherwise (including for inputs shorter than three tokens).
    """
    # A three-token window starting at i needs i + 2 to be a valid index, so
    # stop two tokens before the end. Stopping only one token before the end
    # raised IndexError when the input ended with "nsubj aux".
    for i in range(len(doc) - 2):
        if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'aux' and doc[i+2].dep_ == 'ROOT':
            if any(tok.dep_ == 'dobj' for tok in doc[i+2].children):
                return True
    return False

def pos_pattern(doc):
    """Verify that the key dependency roles carry the expected fine-grained tags.

    Every ``nsubj`` and ``dobj`` token must be tagged ``PRP``, every ``aux``
    token ``MD`` and every ``ROOT`` token ``VB``. Tokens with any other
    dependency label are not checked.

    Args:
        doc: An iterable of tokens exposing ``dep_`` and ``tag_``.

    Returns:
        False if any checked token carries a different tag, True otherwise.
    """
    required_tag = {'nsubj': 'PRP', 'aux': 'MD', 'ROOT': 'VB', 'dobj': 'PRP'}
    return all(
        tok.tag_ == required_tag[tok.dep_]
        for tok in doc
        if tok.dep_ in required_tag
    )

def pron_pattern(doc):
    """Classify the number of the first personal-pronoun direct object.

    Args:
        doc: An iterable of tokens exposing ``dep_``, ``tag_`` and ``text``.

    Returns:
        'plural' if the first token with ``dep_ == 'dobj'`` and
        ``tag_ == 'PRP'`` is one of we/us/they/them (in any letter case),
        'singular' for any other such token, and 'not found' when no token
        qualifies.
    """
    plural = {'we', 'us', 'they', 'them'}
    for token in doc:
        if token.dep_ == 'dobj' and token.tag_ == 'PRP':
            # Compare case-insensitively: a capitalised or upper-case pronoun
            # ("Them", "THEM") was previously misclassified as singular.
            return 'plural' if token.text.lower() in plural else 'singular'
    return 'not found'

def find_noun(sents, num):
    """Find a noun of the requested number, searching the latest sentence first.

    Args:
        sents: A sequence of sentences, each an iterable of tokens exposing
            ``tag_`` and ``text``.
        num: 'plural' (matches tags NNS/NNPS) or 'singular' (matches NN/NNP).

    Returns:
        The text of the first matching token in the last sentence that
        contains one, or 'Noun not found' when nothing matches or ``num`` is
        neither 'plural' nor 'singular'.
    """
    tags_by_number = {
        'plural': ('NNS', 'NNPS'),
        'singular': ('NN', 'NNP'),
    }
    taglist = tags_by_number.get(num)
    if taglist is None:
        # Any other value (e.g. 'not found') previously left the tag list
        # undefined and raised UnboundLocalError inside the loop.
        return 'Noun not found'
    for sent in reversed(sents):
        for token in sent:
            if token.tag_ in taglist:
                return token.text
    return 'Noun not found'

def gen_utterance(doc, noun):
    """Rewrite a sentence by swapping its pronoun object for ``noun`` and ending with 'too.'.

    Args:
        doc: A token sequence (e.g. a spaCy ``Doc`` or ``Span``) supporting
            ``len()`` and slicing, whose slices expose ``.text``; tokens
            expose ``dep_`` and ``tag_``.
        noun: The text that replaces the pronoun.

    Returns:
        The text before the first ``dobj``/``PRP`` token, then ``noun``, then
        whatever follows the pronoun minus the sentence's last two tokens,
        then 'too.' — all separated by single spaces. Returns
        'Failed to generate an utterance' when no such pronoun exists.
    """
    for i, token in enumerate(doc):
        if token.dep_ == 'dobj' and token.tag_ == 'PRP':
            # Keep what follows the pronoun except the last two tokens of the
            # sentence (in the notebook's example: the adverb and the period).
            remainder = doc[i+1:len(doc)-2].text
            # Join only the non-empty pieces so that a non-empty remainder is
            # separated from 'too.' (it used to be glued on: "...quicklytoo.")
            # and an empty one does not leave a doubled space.
            pieces = [doc[:i].text, noun, remainder, 'too.']
            return ' '.join(piece for piece in pieces if piece)
    return 'Failed to generate an utterance'

doc = nlp(u'The symbols are clearly distinguishable. I can recognize them promptly.')
sents = list(doc.sents)

response, noun = '', ''

for i, sent in enumerate(sents):
    # Skip sentences that do not fit both the dependency and the tag pattern.
    if not (dep_pattern(sent) and pos_pattern(sent)):
        continue
    # Search the sentences before this one for a noun matching the pronoun's number.
    noun = find_noun(sents[:i], pron_pattern(sent))
    if noun == 'Noun not found':
        continue
    response = gen_utterance(sent, noun)
    break

print(response)

I can recognize symbols too.


## Information Extraction

In [5]:
def det_destination(doc):
    """Return the first GPE token that sits somewhere below a 'to' token in the parse.

    Args:
        doc: An iterable of tokens exposing ``ent_type_``, ``text`` and
            ``head`` (a root token is its own head).

    Returns:
        The text of the first token whose entity type is 'GPE' and whose
        chain of heads contains a token with text 'to'; otherwise
        'Failed to determine'.
    """
    for token in doc:
        if token.ent_type_ != 'GPE':
            continue
        # Climb from the place name towards the root looking for 'to'.
        ancestor = token
        while True:
            ancestor = ancestor.head
            if ancestor.text == 'to':
                return token.text
            if ancestor.head == ancestor:
                # Reached the root without meeting 'to': this place is not the
                # destination. Move on to the next GPE token instead of giving
                # up, so "from Paris to Berlin" still yields Berlin.
                break
    return 'Failed to determine'

# Parse the user's utterance and extract the place they are travelling to.
utterance = u'I am going to the conference in Berlin.'
doc = nlp(utterance)
dest = det_destination(doc)

# Echo the extracted destination back as a ticket request.
print('It seems the user wants a ticket to ' + dest)

It seems the user wants a ticket to Berlin


## Text Summarization

In [6]:
# Compress the sentence to "<subject> <root verb> <numeric phrase>" by walking
# the dependency tree outwards from the first numeral.
doc = nlp(u"The product sales hit a new record in the first quarter, with 18.6 million units sold.")

phrase = ''

# Step 1: find the first token tagged NUM and collect the phrase built around it.
# NOTE(review): if no token has pos_ == 'NUM', this loop ends without a break,
# `token` is left as the last token of doc and steps 2-3 below still run on it.
for token in doc:
    if token.pos_ == 'NUM':
        while True:
            # Append the current token, then move up to its head.
            phrase = phrase + ' ' + token.text
            token = token.head
            # Keep climbing while the token is a left child of its own head;
            # stop at the first token that is not.
            if token not in list(token.head.lefts):
                phrase = phrase + ' ' + token.text
                # If that token has right children, append all of the
                # document text that follows it.
                if list(token.rights):
                    phrase = phrase + ' ' + doc[token.i+1:].text
                break
        # Only the first NUM token is processed.
        break

# Step 2: climb from where step 1 stopped up to the ROOT, prepending the text
# of every ancestor whose part of speech is not ADP.
# NOTE(review): the text is prepended without a separator; `phrase` starts with
# a space after step 1, but two non-ADP ancestors would be run together.
while True:
    token = doc[token.i].head
    if token.pos_ != 'ADP':
        phrase = token.text + phrase
    if token.dep_ == 'ROOT':
        break

# Step 3: prepend the first nsubj among the ROOT's left children, preceded by
# that subject's own left children.
for tok in token.lefts:
    if tok.dep_ == 'nsubj':
        phrase = ' '.join([tok.text for tok in tok.lefts]) + ' ' + tok.text + ' '+ phrase
        break

print(phrase.strip())

The product sales hit 18.6 million units sold.
