In [145]:
import json

with open('../models/book-synonyms.json', 'r') as file:
    # Load JSON data from file
    publications = json.load(file)

with open('../models/bible-synonyms.json', 'r') as file:
    # Load JSON data from file
    bible = json.load(file)

BOOK_TEMPLATES = [
    "the book %book%",
    "the book of %book%",
    "%book% v. %page%-%endpage%",
    "%book%",
    "%book% %page%",
    "%book% page %page%",
    "%book%, page %page%",
    "%book% pp %page%-%endpage%",
    "%book% pp. %page%-%endpage%",
    "%book%, pp %page%-%endpage%",
    "%book%, pp. %page%-%endpage%",
    "%book% page %page%-%endpage%",
    "%book%, page %page%-%endpage%",
    "%book% pages %page%-%endpage%",
    "%book%, pages %page%-%endpage%",
    "%book% page %page% to %endpage%",
    "%book%, page %page% to %endpage%",
    "%book% pages %page% to %endpage%",
    "%book%, pages %page% to %endpage%",
    "chapter %chapter% of %book%",
    "chapter %chapter%-%endchapter% of %book%",
    "chapters %chapter%-%endchapter% of %book%",
    "chapter %chapter% - %endchapter% of %book%",
    "chapters %chapter% - %endchapter% of %book%",
    "chapter %chapter% to %endchapter% of %book%",
    "chapters %chapter% to %endchapter% of %book%",
    # "article %article% of %book%",
    "page %page% of %book%",
    "page %page%-%endpage% of %book%",
    "pages %page%-%endpage% of %book%",
    "page %page%-%endpage% of %book%",
    "pages %page% - %endpage% of %book%",
    "page %page% - %endpage% of %book%",
    "pages %page%-%endpage% of %book%",
    "page %page% to %endpage% of %book%",
    "pages %page% to %endpage% of %book%",
]

BIBLE_TEMPLATES = [
    "%book%",
    "kjv %book%",
    "%book% %page%",
    "kjv %book% %page%",
    "book %book% %page%",
    "kjv %book% %page%-%endpage%",
    "book %book% %page%-%endpage%",
    "%book% %page% %paragraph%",
    "kjv %book% %page% %paragraph% ",
    "book %book% %page% %paragraph% ",
    "kjv %book% %page% %paragraph%-%endparagraph%",
    "book %book% %page% %paragraph%-%endparagraph%",
]

In [149]:
import typing
from spacy.tokens import Doc
import spacy

import random
import string

nlp = spacy.blank("en")

def make(bookname: str, template: str):
    page = random.randint(1, 500)
    endpage = random.randint(page + 1, page + 20)
    chapter = random.randint(1, 70)
    endchapter = random.randint(chapter + 1, chapter + 10)
    paragraph = random.randint(1, 20)
    endparagraph = random.randint(paragraph + 1, paragraph + 20)

    book_placeholder = 'Й' * len(bookname)
    page_placeholder = 'Ц' * len(str(page))
    endpage_placeholder = 'У' * len(str(endpage))
    chapter_placeholder = 'К' * len(str(chapter))
    endchapter_placeholder = 'Е' * len(str(endchapter))
    paragraph_placeholder = 'Н' * len(str(paragraph))
    endparagraph_placeholder = 'Г' * len(str(endparagraph))

    result = template
    result = result.replace('%book%', book_placeholder)
    result = result.replace('%page%', page_placeholder)
    result = result.replace('%endpage%', endpage_placeholder)
    result = result.replace('%chapter%', chapter_placeholder)
    result = result.replace('%endchapter%', endchapter_placeholder)
    result = result.replace('%paragraph%', paragraph_placeholder)
    result = result.replace('%endparagraph%', endparagraph_placeholder)

    book_position = result.find(book_placeholder)
    book_end = book_position + len(book_placeholder)

    page_position = result.find(page_placeholder)
    page_end = page_position + len(page_placeholder)

    endpage_position = result.find(endpage_placeholder)
    endpage_end = endpage_position + len(endpage_placeholder)

    chapter_position = result.find(chapter_placeholder)
    chapter_end = chapter_position + len(chapter_placeholder)

    endchapter_position = result.find(endchapter_placeholder)
    endchapter_end = endchapter_position + len(endchapter_placeholder)

    paragraph_position = result.find(paragraph_placeholder)
    paragraph_end = paragraph_position + len(paragraph_placeholder)

    endparagraph_position = result.find(endparagraph_placeholder)
    endparagraph_end = endparagraph_position + len(endparagraph_placeholder)

    result = result.replace(book_placeholder, bookname)
    result = result.replace(page_placeholder, str(page))
    result = result.replace(endpage_placeholder, str(endpage))
    result = result.replace(chapter_placeholder, str(chapter))
    result = result.replace(endchapter_placeholder, str(endchapter))
    result = result.replace(paragraph_placeholder, str(paragraph))
    result = result.replace(endparagraph_placeholder, str(endparagraph))

    # print(result)
    doc = nlp(result)
    # print(list(doc.sents))
    # print(bookname, book_position, book_end)
    ents = []
    ents.append(doc.char_span(book_position, book_end, label='BOOK'))
    if page_position >= 0:
        ents.append(doc.char_span(page_position, page_end, label='PAGE'))
    if endpage_position >= 0:
        ents.append(doc.char_span(endpage_position, endpage_end, label='ENDPAGE'))
    if chapter_position >= 0:
        ents.append(doc.char_span(chapter_position, chapter_end, label='CHAPTER'))
    if endchapter_position >= 0:
        ents.append(doc.char_span(endchapter_position, endchapter_end, label='ENDCHAPTER'))
    if paragraph_position >= 0:
        ents.append(doc.char_span(paragraph_position, paragraph_end, label='PARAGRAPH'))
    if endparagraph_position >= 0:
        ents.append(doc.char_span(endparagraph_position, endparagraph_end, label='ENDPARAGRAPH'))
    doc.ents = ents
    return doc


def make_variants(bookname: str, template: str) -> typing.Iterable[Doc]:
    yield make(bookname, template)
    yield make(bookname.lower(), template)
    yield make(bookname.translate(str.maketrans('', '', string.punctuation)), template)


def process(publications, templates):
    for publication in publications:
        for template in templates:
            for synonym in publication['synonyms']:
                yield from make_variants(synonym, template)



In [150]:
!rm ../models/ner_*.spacy || true

zsh:1: no matches found: ../models/ner_*.spacy


In [151]:
def fill_templates(data, templates, count):
    result = []
    for n in range(count):
        result.extend(process(data, templates))
    random.shuffle(result)
    return result


docs = fill_templates(publications, BOOK_TEMPLATES, 5)
bible_docs = fill_templates(bible, BIBLE_TEMPLATES, 20)

# 
# def _save(docs, path):
#     db = DocBin(docs=docs)
#     db.to_disk(path)
# 
# 
# _save(train, "../models/ner_train.spacy")
# _save(dev, "../models/ner_dev.spacy")
# _save(test, "../models/ner_test.spacy")

In [152]:
def do_split(data, train_split=0.8, dev_split=0.1):
    return (data[:int(len(data) * train_split)],
            data[int(len(data) * train_split):int(len(data) * (train_split + dev_split))],
            data[int(len(data) * (train_split + dev_split)):])


train, dev, test = do_split(docs)
train_bible, dev_bible, test_bible = do_split(bible_docs)

In [153]:
(len(train), len(train_bible))

(508200, 132480)

In [154]:
from spacy.tokens import DocBin


def _save(docs, path):
    db = DocBin(docs=docs)
    db.to_disk(path)
# 
# 
_save(train + train_bible, "../models/ner_train.spacy")
_save(dev + dev_bible, "../models/ner_dev.spacy")
_save(test + test_bible, "../models/ner_test.spacy")

In [155]:
!ls -l ../models/ner_*.spacy


-rw-rw-r-- 1 mjr mjr  4717523 Dec 14 13:45 ../models/ner_dev.spacy
-rw-rw-r-- 1 mjr mjr  4713748 Dec 14 13:45 ../models/ner_test.spacy
-rw-rw-r-- 1 mjr mjr 37624422 Dec 14 13:45 ../models/ner_train.spacy


In [156]:
!python -m spacy init fill-config ../models/ner_base_config.cfg ../models/ner_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../models/ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [157]:
!rm -Rf ../models/ner_model
!python -m spacy train ../models/ner_config.cfg --output ../models/ner_model --paths.train ../models/ner_train.spacy --paths.dev ../models/ner_dev.spacy --gpu-id 0

[38;5;2m✔ Created output directory: ../models/ner_model[0m
[38;5;4mℹ Saving to output directory: ../models/ner_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     73.87    0.39    3.01    0.21    0.00
  0     200        131.57   3543.32   99.52   99.52   99.53    1.00
  0     400         75.24     68.74   99.98   99.98   99.97    1.00
  0     600         30.98     13.06   99.92   99.91   99.93    1.00
  0     800         60.60     22.46   99.97   99.97   99.97    1.00
  0    1000         46.99     13.94   99.99   99.99   99.98    1.00
  0    1200         22.35      7.04  100.00  100.00   99.99    1.00
  0    1400         82.31     22.61   99.99   99.99   99.99    1.00
  0    1600        

In [158]:
!python -m spacy evaluate ../models/ner_model/model-best ../models/ner_test.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   100.00
NER R   100.00
NER F   100.00
SPEED   153320

[1m

                    P        R        F
BOOK           100.00   100.00   100.00
PAGE           100.00   100.00   100.00
ENDPAGE        100.00   100.00   100.00
CHAPTER        100.00   100.00   100.00
ENDCHAPTER     100.00   100.00   100.00
PARAGRAPH      100.00   100.00   100.00
ENDPARAGRAPH   100.00   100.00   100.00



In [159]:
nlp2 = spacy.load("../models/ner_model/model-best")


In [160]:
doc2 = nlp2("romans 5 8")

In [161]:
for s in doc2:
    print("s>", s)

s> romans
s> 5
s> 8


In [162]:
for ent in doc2.ents:
    print(ent, ent.label_)

romans BOOK
5 PAGE
8 PARAGRAPH
