In [12]:
import spacy
from spacy.matcher import Matcher

# Load the large English pipeline once; the later cells in this notebook reuse this `nlp` object.
nlp = spacy.load("en_core_web_lg")
# Bare expression so the notebook shows the repr of the loaded pipeline.
nlp

<spacy.lang.en.English at 0x7fabb3377050>

In [13]:
# Read examples.tsv from the parent folder into `examples`.
#
# Each row is `text<TAB>result`, where the second column is optional:
#   "+"            -> the whole text is the expected match
#   "-" or missing -> no match is expected (stored as '')
#   anything else  -> that value is the expected match text
import csv

examples = []
# newline="" is what the csv module asks for when handing it a file object.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open("../examples.tsv", "r", newline="", encoding="utf-8") as f:
    # The file is a .tsv, so split on tabs. csv.reader defaults to commas, which
    # would break up any question that contains a comma.
    reader = csv.reader(f, delimiter="\t")
    for a in reader:
        if not a:
            # csv.reader yields [] for blank lines; a[0] would raise IndexError.
            continue
        text = a[0]
        result = a[1] if len(a) > 1 else '-'
        if result == "+":
            expected = text
        elif result == '-':
            expected = ''
        else:
            expected = result
        examples.append(dict(text=text, expected=expected))


In [14]:
import json

from spacy.matcher import PhraseMatcher

# --- Token matcher for letter / manuscript references ------------------------
matcher = Matcher(nlp.vocab)
lt_ms_patterns = [
    # Prefix as its own token, e.g. "lt" / "ms" / "letter" / "manuscript", then
    # an optional token of 1-3 digits with an optional trailing letter, optional
    # punctuation, and finally an all-digit token that is 3 to 5 characters long.
    [
        {"LOWER": {"IN": ["lt", "ms", "letter", "manuscript"]}},
        {"LOWER": {"REGEX": "^[0-9]{1,3}[a-z]?$"}, "OP": "?"},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_DIGIT": True, "LENGTH": {">=": 3, "<=": 5}}
    ],
    # Prefix fused with its number in a single token (e.g. "lt15a"), optional
    # punctuation, then an all-digit token that is exactly 4 characters long.
    [
        {"LOWER": {"REGEX": "^(lt|ms|letter|manuscript)[0-9]+[a-z]?$"}},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_DIGIT": True, "LENGTH": {"==": 4}}
    ]
]
matcher.add("lt_ms", lt_ms_patterns)

# --- Phrase matcher for Bible book names and their synonyms -------------------
phrase_matcher = PhraseMatcher(nlp.vocab)
with open("../models/en-bible-meta.json", "r", encoding="utf-8") as f:
    bible_info = json.load(f)

for book in bible_info:
    name = book['Name']
    # De-duplicate the book's names; longest first (same ordering as before).
    synonyms = sorted(
        {book['Name'], book['EnglishName'], *book['Synonym']},
        reverse=True,
        key=len,
    )
    # The PhraseMatcher is created with its default attribute, which only needs
    # tokenization, so build the patterns with nlp.make_doc instead of running
    # the full pipeline (tagger, parser, NER) on every synonym.
    phrase_matcher.add(name, [nlp.make_doc(t) for t in synonyms])



In [15]:


def print_error(message, row, doc, matches):
    """Print a diagnostic report for one example that did not behave as expected.

    Args:
        message: Short description of the problem.
        row: Example dict; its 'text' and 'expected' string values are printed.
        doc: The parsed document the matches refer to.
        matches: Iterable of (match_id, start, end) triples; one line is
            printed per triple, including the matched text.
    """
    print("ERROR: ", message)
    print("Error in " + row['text'])
    print("Expected: " + row['expected'])
    for hit in matches:
        key, first, last = hit
        # Resolve the hash back to the rule name, then show the matched tokens.
        label = nlp.vocab.strings[key]
        print(key, label, first, last, doc[first:last].text)
    print("*" * 80)


# Run the token matcher over every labelled example, report disagreements with
# the expected value, and collect the single-match results.
results = []
for row in examples:
    doc = nlp(row['text'])
    matches = matcher(doc)

    if len(matches) == 0:
        # No match is only a problem when the example expected one.
        if row['expected'] != "":
            print_error("Expected match not found", row, doc, matches)
        continue
    if len(matches) > 1:
        print_error("Unexpected number of matches", row, doc, matches)
        continue

    match_id, start, end = matches[0]
    span = doc[start:end]  # The matched span
    # Compare the matched *text*: comparing the Span object itself against the
    # expected string never tests whether the text is the same.
    if span.text != row['expected']:
        print_error("Unexpected match", row, doc, matches)
    # Mismatching rows are still recorded, as before.
    results.append(dict(
        text=row['text'],
        matches=[(nlp.vocab.strings[match_id], span.text)]
    ))

In [16]:
# Ad-hoc questions used to eyeball the matchers in the cells below.
texts = [
    "What does the acts of the apostles 15 say about the role of God?",
    "What does the acts of the apostles, 15 say about the role of God?",
    "What does AA 15 say about the role of God?",
    "What is the difference in explanation of genesis between GC and AA?",
]
# Expected reference strings, one list per entry of `texts`, in the same order.
expected = [
    ["acts of the apostles 15"],
    # This entry was missing, which shifted every later expectation onto the
    # wrong text. TODO confirm the intended value for the comma variant.
    ["acts of the apostles, 15"],
    ["AA 15"],
    ["GC", "AA"],
]
assert len(texts) == len(expected), "texts and expected must stay aligned"


In [17]:
# Parse every sample question once so the inspection cell can reuse the results.
docs = list(map(nlp, texts))

In [18]:
# For each parsed question print:
#   ">>>" the question itself,
#   "l>"  every span found by the token matcher,
#   "b>"  every span found by the phrase matcher,
# followed by one "lemma tag pos" line per token.
for doc in docs:
    print(">>>", doc)
    for prefix, find in (("l>", matcher), ("b>", phrase_matcher)):
        for _, first, last in find(doc):
            print(prefix, doc[first:last])
    for token in doc:
        print(token.lemma_, token.tag_, token.pos_)



>>> What does the acts of the apostles 15 say about the role of God?
b> the acts
b> acts
what WP PRON
do VBZ AUX
the DT DET
act NNS NOUN
of IN ADP
the DT DET
apostle NNS NOUN
15 CD NUM
say VBP VERB
about IN ADP
the DT DET
role NN NOUN
of IN ADP
God NNP PROPN
? . PUNCT
>>> What does the acts of the apostles, 15 say about the role of God?
b> the acts
b> acts
what WP PRON
do VBZ AUX
the DT DET
act NNS NOUN
of IN ADP
the DT DET
apostle NNS NOUN
, , PUNCT
15 CD NUM
say VBP VERB
about IN ADP
the DT DET
role NN NOUN
of IN ADP
God NNP PROPN
? . PUNCT
>>> What does AA 15 say about the role of God?
what WP PRON
do VBZ AUX
AA NNP PROPN
15 CD NUM
say VB VERB
about IN ADP
the DT DET
role NN NOUN
of IN ADP
God NNP PROPN
? . PUNCT
>>> What is the difference in explanation of genesis between GC and AA?
b> is
b> genesis
what WP PRON
be VBZ AUX
the DT DET
difference NN NOUN
in IN ADP
explanation NN NOUN
of IN ADP
genesis NN NOUN
between IN ADP
GC NNP PROPN
and CC CCONJ
AA NNP PROPN
? . PUNCT


In [41]:


# Visualise the dependency parse of a sample question.
# displacy was never imported anywhere in the notebook, so on a fresh kernel
# this cell failed with NameError; import it here.
from spacy import displacy

texts = [
    "What does genesis 1 14 say about genesis in genesis?",
    # Other questions to try:
    # "What was written by ellen white  about main theme in genesis?",
    # "What does the acts of the apostles, 15 say about genesis in genesis?",
    # "What does ellen white say in the acts of apostles?",
    # "What AA says about genesis?",
    # "What is written in hebrews about faith?",
    # "What did Ellen White write in A Call to Medical Evangelism and Health Education about faith?",
]
# nlp.pipe returns a one-shot generator; materialise it so `docs` can still be
# iterated after rendering (e.g. when an earlier cell is re-run).
docs = list(nlp.pipe(texts))

displacy.render(docs, style="dep", options=dict(fine_grained=True, compact=True, collapse_punct=True))

In [21]:
# Look up spaCy's human-readable description of the "aux" label.
spacy.explain("aux")

'auxiliary'

* `What does the acts of the apostles 15 say about genesis in genesis?` - Find the `nsubj`. The verb should be one of: say, write, explain, describe, talk, speak, tell, mention, discuss, comment, note, state, declare, report, indicate, point, refer.
* `What was written by ellen white  about main theme in genesis?` - Follow `prep` ('in') -> `pobj`.

In [28]:
# Look up spaCy's human-readable description of the "VB" tag.
spacy.explain("VB")

'verb, base form'