In [40]:
import codecs
import spacy
import os
# Tokenizer-parallelism flag; nothing in the visible cells reads it, so it is
# presumably consumed by a dependency — TODO confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from tqdm import tqdm

# Input file: read once by line_count() and once by the splitting loop, which
# splits every line on its first tab into two fields.
FILENAME = "../paragraphs-raw.tsv"
# Output file: the splitting loop writes one tab-separated row per kept sentence.
OUTPUT_FILENAME = "../split.tsv"
# Alternative model path, kept for reference:
# SPACY_MODEL = "../data/v3-model.cpu/model-best"
# Model identifier passed to SpacySentenceSplitter as `spacy_model`.
SPACY_MODEL = "en_core_web_lg"
# Entity types written to the output; an entity whose `.type` is not in this
# set is skipped by the splitting loop.
allowed_labels = {'PERSON', 'LOC', 'GPE'}

In [41]:
from chategw.models.sentence_splitter import SpacySentenceSplitter
# Pin spaCy to the CPU before the splitter is constructed.
spacy.require_cpu()
# Sentence splitter shared by the splitting loop. n_process=8 is presumably the
# number of worker processes — TODO confirm against SpacySentenceSplitter.
splitter = SpacySentenceSplitter(n_process=8, spacy_model=SPACY_MODEL)

def line_count(filename):
    """Return the number of lines in the UTF-8 text file ``filename``.

    The file is streamed line by line instead of being materialised as a list,
    so memory use stays flat no matter how large the file is. A final line
    without a trailing newline still counts as a line; an empty file yields 0.

    Args:
        filename: Path of the file to count.

    Returns:
        int: Number of lines in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)

In [42]:
# Count the input lines up front; the splitting loop passes this to tqdm as the
# progress-bar total.
total_lines = line_count(FILENAME)

In [43]:
from chategw.util import split_every

# Stream the raw TSV in batches, sentence-split each batch, and write one
# output row per sufficiently long sentence:
#     <row id> \t <sentence text> [\t <ENTITY_TYPE>-<entity text>] ...
# Only entities whose type is in `allowed_labels` are appended.
BATCH_SIZE = 20_000        # input rows handed to the splitter per call
MIN_SENTENCE_CHARS = 100   # sentences must be strictly longer than this to be kept

skipped_rows = 0  # input rows without a "<id>\t<text>" shape

# Built-in open() frames lines on newlines only, the same way line_count()
# does, so the tqdm total matches the rows actually iterated. newline='\n'
# keeps the output line endings as a literal "\n" on every platform.
# The `with` block guarantees both handles are closed (and the output flushed)
# even if the splitter raises part-way through the run.
with open(FILENAME, 'r', encoding='utf-8') as f, \
        open(OUTPUT_FILENAME, 'w', encoding='utf-8', newline='\n') as out_f:
    progress = tqdm(f, total=total_lines, desc="Splitting", mininterval=2)
    for lines in split_every(BATCH_SIZE, progress):
        block = []
        for line in lines:
            fields = line.strip().split('\t', 1)
            if len(fields) != 2:
                # No tab, or an empty text column after stripping: there is
                # nothing to split, so skip the row instead of aborting the run.
                skipped_rows += 1
                continue
            block.append(fields)
        if not block:
            continue

        # Assumes splitter.split returns one result per input text, in input
        # order — TODO confirm against SpacySentenceSplitter.
        chunks = splitter.split([text for _, text in block])
        for (row_id, _), item in zip(block, chunks):
            for sentence in item:
                if len(sentence.text) <= MIN_SENTENCE_CHARS:
                    continue
                columns = [row_id, sentence.text]
                columns.extend(
                    entity.type + '-' + entity.text
                    for entity in sentence.entities
                    if entity.type in allowed_labels
                )
                # One write per output row instead of one per field.
                out_f.write('\t'.join(columns) + '\n')

if skipped_rows:
    print("Skipped {} malformed input row(s)".format(skipped_rows))

Splitting: 100%|██████████| 1232178/1232178 [34:51<00:00, 589.17it/s]
