In [1]:
import spacy

# Tab-separated input file; read_lines() treats the first column as a paragraph
# id and everything after the first tab as the paragraph text.
LINKS_DATA = "../../src/ChatEgw.UI.Indexer/paragraphs-raw.tsv"
# Folder that receives the serialized "chunk-N.spacy" DocBin files.
# NOTE(review): the saved output of the last cell prints "../data/v4/train", so
# the notebook was apparently last run with a different value - confirm.
OUTPUT_FOLDER = "../data/v4"
# Passed as n_process to every nlp.pipe()/nlp_blank.pipe() call.
PROCESS_COUNT = 1
# Passed as batch_size to the pipe() calls and also used as the number of
# paragraphs handed to read_docs() per chunk.
BATCH_SIZE = 512


In [2]:
import typing


def read_lines(filename):
    """Yield the text column of every row in a tab-separated file.

    Each row is stripped of surrounding whitespace and split on the first tab
    only, so the text itself may contain further tabs. The first field (the
    paragraph id) is discarded.

    Args:
        filename: Path of a UTF-8 encoded file with ``id<TAB>text`` rows.

    Yields:
        The text part of each row, in file order.

    Raises:
        IndexError: If a row has no tab-separated text field. The error and
            the offending fields are printed before the exception propagates.
    """
    with open(filename, "r", encoding="utf-8") as f:
        # Iterate the file object directly so rows are streamed one at a time
        # instead of loading the whole file into memory with readlines().
        for raw_line in f:
            fields = raw_line.strip().split("\t", 1)
            try:
                text = fields[1]
            except IndexError as e:
                print(e)
                print(fields)
                raise
            yield text


def line_count(filename):
    """Return the number of lines in a UTF-8 text file.

    Args:
        filename: Path of the file to count.

    Returns:
        The line count; 0 for an empty file.
    """
    with open(filename, "r", encoding="utf-8") as f:
        # Count while streaming so the file is never held in memory as a list.
        return sum(1 for _ in f)


In [3]:
# Number of rows in the raw TSV (not referenced by the other cells visible here).
data_count = line_count(LINKS_DATA)
# Materialise every paragraph text up front; read_docs() later needs len() of it.
LINES = list(read_lines(LINKS_DATA))


In [4]:
from chategw.util import split_every
import tqdm
import os.path
from spacy.tokens import DocBin

# Request GPU execution for the spaCy pipelines loaded below.
spacy.require_gpu()
# Trained English pipeline; read_docs() takes its entity predictions from here.
nlp = spacy.load("en_core_web_lg")
# Blank English pipeline; read_docs() uses it to build the docs that are saved.
nlp_blank = spacy.blank("en")
# makedirs (rather than mkdir) also creates missing parent directories, and
# exist_ok=True makes the call a no-op when the folder is already present.
os.makedirs(OUTPUT_FOLDER, mode=0o755, exist_ok=True)


In [5]:


# Entity labels kept by read_docs(), mapped to the label written to the output
# docs: GPE is folded into LOC, and entities with any other label are dropped.
allowed_labels = {'PERSON': 'PERSON', 'LOC': 'LOC', 'GPE': 'LOC'}


def _assign_ents(target_doc, ents):
    """Set ``target_doc.ents`` from token-indexed ``(start, end, label)`` triples."""
    target_doc.ents = [
        spacy.tokens.Span(target_doc, start, end, label=label)
        for start, end, label in ents
    ]


def read_docs(count: int, lines: typing.List[str]) -> typing.Iterator["spacy.tokens.Doc"]:
    """Yield entity-annotated docs for every paragraph in ``lines``.

    Paragraphs are processed in chunks of ``count``. Each paragraph is run
    through ``nlp`` to obtain entities, and through ``nlp_blank`` twice (the
    original text and its lowercased form) to obtain the docs that are yielded.
    Entities whose label is a key of ``allowed_labels`` are copied onto those
    docs by token index, using the mapped label.

    Args:
        count: Number of paragraphs taken from ``lines`` per chunk.
        lines: Paragraph texts to process.

    Yields:
        For each paragraph, the ``nlp_blank`` doc of the original text with the
        copied entities, followed by the doc of the lowercased text when it has
        the same number of tokens. A lowercased doc with a different token
        count is skipped and counted as an error in the progress description.
    """
    total = 0
    errors = 0
    iterable = tqdm.tqdm(lines, total=len(lines), desc="Loading paragraphs", unit="paragraph")
    # NOTE(review): chunk is iterated several times below, so split_every is
    # assumed to yield re-iterable sequences (e.g. lists) - confirm.
    for chunk in split_every(count, iterable):
        # The description is rebuilt once per chunk from the counters so far
        # (total + 1 avoids a division by zero on the first chunk).
        iterable.set_description(
            f"Processing paragraphs (Errors: {errors}/{total} [{float(errors * 100) / (total + 1):.2f}%])", False)
        docs = nlp.pipe(chunk, n_process=PROCESS_COUNT, batch_size=BATCH_SIZE)
        docs_processed = nlp_blank.pipe(chunk, n_process=PROCESS_COUNT, batch_size=BATCH_SIZE)
        docs_lower_processed = nlp_blank.pipe([s.lower() for s in chunk], n_process=PROCESS_COUNT,
                                              batch_size=BATCH_SIZE)
        for doc, doc_result, doc_result_lower in zip(docs, docs_processed, docs_lower_processed):
            total += 1
            # Keep only the allowed entities, as token offsets plus mapped label.
            ents = [
                (e.start, e.end, allowed_labels[e.label_])
                for e in doc.ents
                if e.label_ in allowed_labels
            ]
            _assign_ents(doc_result, ents)
            yield doc_result
            # The token offsets are reused for the lowercased doc only when its
            # token count matches that of the original-case doc.
            if len(doc_result) == len(doc_result_lower):
                _assign_ents(doc_result_lower, ents)
                yield doc_result_lower
            else:
                errors += 1




In [6]:
all_docs = []
import torch

# Enter torch's no_grad and inference_mode contexts in a single statement
# (same nesting order as two stacked with-blocks).
with torch.no_grad(), torch.inference_mode():
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER, mode=0o755, exist_ok=True)
    print(OUTPUT_FOLDER)
    # Group the generated docs into batches of 100_000 and write each batch to
    # its own DocBin file, numbered from 1.
    doc_chunks = split_every(100_000, read_docs(BATCH_SIZE, LINES))
    for i, chunk in enumerate(doc_chunks):
        db = DocBin(docs=chunk)
        chunk_path = os.path.join(OUTPUT_FOLDER, f"chunk-{i + 1}.spacy")
        db.to_disk(chunk_path)


../data/v4/train


Processing paragraphs (Errors: 76737/1231360 [6.23%]): 100%|██████████| 1232206/1232206 [38:34<00:00, 532.45paragraph/s] 
