In [134]:
# read json from ../models/Result_28.json
import json

# Parse the publication list used to generate training examples.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the load does
# not depend on the platform's default text encoding.
with open('../models/Result_28.json', 'r', encoding='utf-8') as file:
    publications = json.load(file)

# Citation patterns used to synthesize NER training text.
# Placeholders: %book%, %page%, %endpage%, %chapter%, %endchapter%.
# Every entry is unique; a repeated pattern would be generated twice and
# silently over-weight that phrasing in the dataset.
TEMPLATES = [
    # Book name only.
    "the book %book%",
    "the book of %book%",
    "%book% v. %page%-%endpage%",
    "%book%",
    # Book name followed by a single page.
    "%book% %page%",
    "%book% page %page%",
    "%book%, page %page%",
    # Book name followed by a page range.
    "%book% pp %page%-%endpage%",
    "%book% pp. %page%-%endpage%",
    "%book%, pp %page%-%endpage%",
    "%book%, pp. %page%-%endpage%",
    "%book% page %page%-%endpage%",
    "%book%, page %page%-%endpage%",
    "%book% pages %page%-%endpage%",
    "%book%, pages %page%-%endpage%",
    "%book% page %page% to %endpage%",
    "%book%, page %page% to %endpage%",
    "%book% pages %page% to %endpage%",
    "%book%, pages %page% to %endpage%",
    # Chapter or chapter range preceding the book name.
    "chapter %chapter% of %book%",
    "chapter %chapter%-%endchapter% of %book%",
    "chapters %chapter%-%endchapter% of %book%",
    "chapter %chapter% - %endchapter% of %book%",
    "chapters %chapter% - %endchapter% of %book%",
    "chapter %chapter% to %endchapter% of %book%",
    "chapters %chapter% to %endchapter% of %book%",
    # "article %article% of %book%",
    # Page or page range preceding the book name.
    "page %page% of %book%",
    "page %page%-%endpage% of %book%",
    "pages %page%-%endpage% of %book%",
    "pages %page% - %endpage% of %book%",
    "page %page% - %endpage% of %book%",
    "page %page% to %endpage% of %book%",
    "pages %page% to %endpage% of %book%",
]

In [136]:
import random
import re
import string
import typing

import spacy
from spacy.tokens import Doc, DocBin

nlp = spacy.blank("en")


def make(bookname: str, template: str):
    """Render one synthetic citation from `template` and annotate its entities.

    Each recognised placeholder (%book%, %page%, %endpage%, %chapter%,
    %endchapter%) is replaced by `bookname` or by a randomly drawn number, and
    the character range of every inserted value is recorded while the text is
    assembled. Offsets are therefore exact by construction; no sentinel
    characters are used that a book name could happen to contain.

    Args:
        bookname: Text substituted for %book%.
        template: Pattern string; unrecognised %...% sequences are kept as-is.

    Returns:
        The tokenized spaCy Doc with `doc.ents` set to BOOK / PAGE / ENDPAGE /
        CHAPTER / ENDCHAPTER spans for the placeholders present in `template`.

    Raises:
        ValueError: If an inserted value does not line up with token
            boundaries, so no entity span can be created for it.
    """
    # The draw order (page, endpage, chapter, endchapter) is fixed so that a
    # seeded run always produces the same numbers.
    page = random.randint(1, 500)
    endpage = random.randint(page + 1, page + 20)
    chapter = random.randint(1, 70)
    endchapter = random.randint(chapter + 1, chapter + 10)

    fields = {
        'book': (bookname, 'BOOK'),
        'page': (str(page), 'PAGE'),
        'endpage': (str(endpage), 'ENDPAGE'),
        'chapter': (str(chapter), 'CHAPTER'),
        'endchapter': (str(endchapter), 'ENDCHAPTER'),
    }

    # With a single capture group, re.split alternates literal text (even
    # indices) with the captured placeholder names (odd indices).
    pieces = re.split(r'%(book|page|endpage|chapter|endchapter)%', template)

    text_parts = []
    annotations = []  # (start_char, end_char, label)
    cursor = 0
    for index, piece in enumerate(pieces):
        if index % 2:
            value, label = fields[piece]
            annotations.append((cursor, cursor + len(value), label))
            piece = value
        text_parts.append(piece)
        cursor += len(piece)

    text = ''.join(text_parts)
    doc = nlp(text)

    ents = []
    for start, end, label in annotations:
        # char_span returns None when the range does not match token boundaries.
        span = doc.char_span(start, end, label=label)
        if span is None:
            raise ValueError(
                f"Cannot align {label} span [{start}:{end}] "
                f"({text[start:end]!r}) to tokens in {text!r}"
            )
        ents.append(span)
    doc.ents = ents
    return doc


def make_variants(bookname: str, template: str) -> typing.Iterable[Doc]:
    """Yield three annotated docs for one book name and template.

    The variants are, in order: the name as given, the lower-cased name, and
    the name with all ASCII punctuation removed.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    spellings = (
        bookname,
        bookname.lower(),
        bookname.translate(drop_punctuation),
    )
    for spelling in spellings:
        yield make(spelling, template)


def process():
    """Generate every training doc: all variants of each synonym of each
    publication, rendered through each template.

    Iteration order is publication, then template, then synonym.
    """
    combinations = (
        (name, pattern)
        for entry in publications
        for pattern in TEMPLATES
        for name in entry['synonyms']
    )
    for name, pattern in combinations:
        yield from make_variants(name, pattern)




In [130]:
!rm ../models/ner_*.spacy
docs = list(process())
random.shuffle(docs)
train, dev, test = (docs[:int(len(docs) * 0.8)],
                    docs[int(len(docs) * 0.8):int(len(docs) * 0.9)],
                    docs[int(len(docs) * 0.9):])
def _save(docs, path):
    db = DocBin(docs=docs)
    db.to_disk(path)
_save(train, "../models/ner_train.spacy")
_save(dev, "../models/ner_dev.spacy")
_save(test, "../models/ner_test.spacy")

In [131]:
# Expand ner_base_config.cfg into a complete training config, written to ner_config.cfg.
!python -m spacy init fill-config ../models/ner_base_config.cfg ../models/ner_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../models/ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [132]:
# Remove any previous training output so this run starts from an empty directory.
!rm -Rf ../models/ner_model
# Train with the generated config on the train/dev DocBin files; --gpu-id 0 runs on GPU 0.
!python -m spacy train ../models/ner_config.cfg --output ../models/ner_model --paths.train ../models/ner_train.spacy --paths.dev ../models/ner_dev.spacy --gpu-id 0

[38;5;2m✔ Created output directory: ../models/ner_model[0m
[38;5;4mℹ Saving to output directory: ../models/ner_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     76.14    0.44    1.77    0.25    0.00
  0     200        131.61   3159.27   99.82   99.81   99.83    1.00
  0     400         19.35     22.46   99.91   99.89   99.92    1.00
  0     600         77.47     47.09   99.99  100.00   99.99    1.00
  0     800         17.64      7.50   99.97   99.98   99.96    1.00
  0    1000         38.62     13.02   99.99   99.99   99.99    1.00
  0    1200         67.39     18.00   99.99   99.99   99.98    1.00
  0    1400         79.74     18.51   99.93   99.91   99.95    1.00
  0    1600        

In [133]:
# Score the model-best pipeline from the training output directory on the test DocBin file.
!python -m spacy evaluate ../models/ner_model/model-best ../models/ner_test.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   100.00
NER R   100.00
NER F   100.00
SPEED   86753 

[1m

                  P        R        F
BOOK         100.00   100.00   100.00
PAGE         100.00   100.00   100.00
ENDPAGE      100.00   100.00   100.00
CHAPTER      100.00   100.00   100.00
ENDCHAPTER   100.00   100.00   100.00

