In [1]:
import spacy

# Load spaCy's small English pipeline once; later cells reuse it both to
# tokenize the example texts and to supply the vocab for the Matcher.
nlp_base = spacy.load("en_core_web_sm")
# Bare expression: shows the loaded pipeline object as the cell output.
nlp_base

<spacy.lang.en.English at 0x7ff0e35dcf90>

In [37]:
# Read ../examples.tsv into a list of dicts with "text" and "expected" keys
import csv

# Each row is `text[, marker]`:
#   "+"            -> the whole text is expected to match
#   "-" or missing -> no match is expected (expected == '')
#   anything else  -> the marker itself is the expected matched text
# NOTE(review): the file is named .tsv but is parsed with csv.reader's default
# comma delimiter -- confirm the file really is comma-separated.
examples = []
# newline="" is what the csv module asks for when it is handed a file object.
with open("../examples.tsv", "r", newline="") as f:
    for row in csv.reader(f):
        if not row:
            # csv.reader yields [] for blank lines; row[0] would raise IndexError.
            continue
        text = row[0]
        marker = row[1] if len(row) > 1 else '-'
        if marker == "+":
            expected = text
        elif marker == '-':
            expected = ''
        else:
            expected = marker
        examples.append(dict(text=text, expected=expected))


In [38]:

from spacy.matcher import Matcher

matcher = Matcher(nlp_base.vocab)

# Prefix written as its own token, e.g. "lt 12, 1973" or "letter 15a 1973":
# prefix word, optional number token (1-3 digits plus an optional letter),
# optional punctuation, then an all-digit token of 3 to 5 characters.
# NOTE(review): this accepts 3-5 digits while the fused variant below requires
# exactly 4 -- confirm the difference is intentional.
spaced_prefix_pattern = [
    {"LOWER": {"IN": ["lt", "ms", "letter", "manuscript"]}},
    {"LOWER": {"REGEX": "^[0-9]{1,3}[a-z]?$"}, "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"IS_DIGIT": True, "LENGTH": {">=": 3, "<=": 5}},
]

# Prefix fused with the number in a single token, e.g. "lt15 1973" or
# "letter15a 1732": fused token, optional punctuation, 4-character digit token.
fused_prefix_pattern = [
    {"LOWER": {"REGEX": "^(lt|ms|letter|manuscript)[0-9]+[a-z]?$"}},
    {"IS_PUNCT": True, "OP": "?"},
    {"IS_DIGIT": True, "LENGTH": {"==": 4}},
]

# Both variants are registered under the single rule name "lt_ms".
lt_ms_patterns = [spaced_prefix_pattern, fused_prefix_pattern]
matcher.add("lt_ms", lt_ms_patterns)



In [39]:


def print_error(message, row, doc, matches):
    """Print a diagnostic report for one example row.

    Args:
        message: Short description of what went wrong.
        row: Mapping with string values under 'text' and 'expected'.
        doc: The parsed document; sliced with each match's token offsets.
        matches: Iterable of (match_id, start, end) triples.

    Output is the message, the row's text and expected value, one line per
    match (id, rule name looked up in ``nlp_base.vocab.strings``, offsets and
    matched text), and a closing line of 80 asterisks.
    """
    separator = "*" * 80
    print("ERROR: ", message)
    print("Error in " + row['text'])
    print("Expected: " + row['expected'])
    for rule_hash, first, last in matches:
        # Turn the hashed match id back into the rule's string name.
        rule_name = nlp_base.vocab.strings[rule_hash]
        matched_text = doc[first:last].text
        print(rule_hash, rule_name, first, last, matched_text)
    print(separator)


# Run the matcher over every example and report rows whose outcome differs
# from the expectation. `results` collects the rows that produced exactly one
# match (whether or not that match equals the expected text).
results = []
for row in examples:
    doc = nlp_base(row['text'])
    matches = matcher(doc)

    if len(matches) == 0:
        # No match is only correct when the example expects nothing ('').
        if row['expected'] != "":
            print_error("Expected a match but found none", row, doc, matches)
        continue
    if len(matches) > 1:
        print_error("Unexpected number of matches", row, doc, matches)
        continue
    match_id, start, end = matches[0]
    span = doc[start:end]  # The matched span
    # Compare the span's text: comparing the Span object itself with a str
    # does not compare their text, so mismatches would go unreported.
    if span.text != row['expected']:
        print_error("Unexpected match", row, doc, matches)
    results.append(dict(
        text=row['text'],
        matches=[(nlp_base.vocab.strings[match_id], doc[start:end].text) for match_id, start, end in matches]
    ))
results

[{'text': 'lt 12, 1973', 'matches': [('lt_ms', 'lt 12, 1973')]},
 {'text': 'lt 12 1973', 'matches': [('lt_ms', 'lt 12 1973')]},
 {'text': 'letter 12 1973', 'matches': [('lt_ms', 'letter 12 1973')]},
 {'text': 'lt 1973', 'matches': [('lt_ms', 'lt 1973')]},
 {'text': 'letter 15a 1973', 'matches': [('lt_ms', 'letter 15a 1973')]},
 {'text': 'lt15 1973', 'matches': [('lt_ms', 'lt15 1973')]},
 {'text': 'letter15a 1732', 'matches': [('lt_ms', 'letter15a 1732')]},
 {'text': 'ms 15, 9283', 'matches': [('lt_ms', 'ms 15, 9283')]},
 {'text': 'ms 15, 1234', 'matches': [('lt_ms', 'ms 15, 1234')]},
 {'text': 'ms15 1234', 'matches': [('lt_ms', 'ms15 1234')]},
 {'text': 'manuscript 32a 1973',
  'matches': [('lt_ms', 'manuscript 32a 1973')]},
 {'text': 'manuscript32b 1973', 'matches': [('lt_ms', 'manuscript32b 1973')]},
 {'text': 'manuscript32 1973', 'matches': [('lt_ms', 'manuscript32 1973')]}]

In [57]:


# Sample queries passed to the book matcher in the loop below. Only the
# uncommented entries are run; uncomment a line to include that query.
texts = [
    # "What do AA and AH say  about genesis in genesis?",
    # "What does the acts of the apostles 15 say about genesis in genesis?",
    # "What AA says about genesis?",
    # "What was said in AA about genesis?",
    # "What was written by ellen white about main theme in genesis?",
    # "What does ellen white say in the acts of apostles 15?",
    # "What is written in hebrews about faith?",
    # "What did Ellen White write in chapter 1 of A Call to Medical Evangelism and Health Education ?",
    # "In \"My Life Today\", how does Ellen White reflect on her own spiritual journey and experiences",
    # "what did Ellen White write in Education 57 about?",
    # "what did Ellen White write in Education page 57 about?",
    "what did Ellen White write in page 57 of Education about?",
    # "what is the best quote from education pages 57-82",
    # "what is written in chapter 2 of aa",
    # "how the book of matthew describes the birth of Jesus?",
    # "what ellen white wrote about apostles in aa 548",
]

import importlib

import chategw.models.book_parsing

# Reload so edits made to the module since it was first imported are picked up
# without restarting the kernel.
importlib.reload(chategw.models.book_parsing)
# NOTE(review): `pattern_data` is not defined in any cell visible here --
# confirm an earlier cell creates it, otherwise this raises NameError on a
# fresh kernel.
dep_matcher = chategw.models.book_parsing.EgwBookMatcher("../models/ner_model/model-best", patterns=pattern_data)

# Print every (code, match) pair the matcher yields for each query, with the
# match serialized as indented JSON (unset and None fields omitted).
for s in texts:
    print(">", s)
    for code, match in dep_matcher(s):
        print(code, match.model_dump_json(indent=2, exclude_none=True, exclude_unset=True))
    print("-" * 80)


> what did Ellen White write in page 57 of Education about?
BOOK {
  "book": "Ellen White",
  "text": "Ellen White"
}
BOOK {
  "book": "Education",
  "page": 57,
  "text": "page 57 of Education"
}
--------------------------------------------------------------------------------
