Умеет нормализовать текст, сохранять морфологические теги и находить именованные сущности.

In [1]:
import re
import pandas as pd
from pymorphy2 import MorphAnalyzer
# Shared pymorphy2 analyzer; morph_line() calls morph.parse() on every word.
morph = MorphAnalyzer()

from natasha import Combinator, DEFAULT_GRAMMARS

# DEFAULT_GRAMMARS contains the standard set of rules:
# [
#    <enum 'Person'>,
#    <enum 'Location'>,
#    <enum 'Organisation'>,
#           ...
# ]

# Entity extractor built from the default grammars; morph_line() uses
# combinator.extract() / combinator.resolve_matches() to find the spans to mask.
combinator = Combinator(DEFAULT_GRAMMARS)

Total grammars count: 146


In [2]:
# Sample Russian text (title page and cast list of a play) containing several
# personal names. It is not referenced by any other code visible in this file.
text =  """
 А. Н. Островский.
Бедная невеста.
Комедия в пяти действиях.
ЛИЦА: Анна Петровна Незабудкина, вдова небогатого чиновника.
Марья Андреевна, ее дочь.
Владимир Васильевич Мерич |} молодые люди, знакомые Незабудкиной.
Иван Иванович Милашин |Платон Маркович Добротворский, старый стряпчий.
Максим Дорофеевич Беневоленский, чиновник.
Арина Егоровна Хорькова, вдова, мещанка."""

In [3]:
# Characters to delete from raw text: everything that is not a Cyrillic
# letter, one of the punctuation marks  : ! ? , . " —  , a space or a hyphen.
# "Ё"/"ё" are listed explicitly because they lie outside the А-Я / а-я
# code-point ranges; without them these letters would be removed from words.
symbs = re.compile(r"[^А-Яа-яЁё:!\?,\.\"— -]")
# A run of two or more spaces and/or underscores.
clear = re.compile(r"[ _]{2,}")
# One punctuation token: "...", "!..", "?.." or a single punctuation mark.
# The three-character alternatives come first so they win over a lone mark.
punct = re.compile(r"(\.\.\.|!\.\.|\?\.\.|[:!\?,\.\"—])")

In [4]:
def morph_line(line, normalize=True, tag=True, ner=True):
    """Mask named entities, isolate punctuation and optionally lemmatize/tag.

    Args:
        line: Text to process.
        normalize: If true, replace every word with the ``normal_form`` of its
            first pymorphy2 parse.
        tag: If true, append ``";"`` followed by the ``tag.cyr_repr`` of every
            word (joined with ``"_"``) to the result.
        ner: If true, overwrite every token of every match returned by the
            module-level ``combinator`` with ``"@"`` characters.

    Returns:
        The words joined with ``"_"``; when ``tag`` is true the joined tags
        follow after a ``";"`` separator.
    """
    if ner:
        matches = combinator.resolve_matches(combinator.extract(line))
        for _grammar, tokens in matches:
            for token in tokens:
                start, end = token.position
                # Same-length replacement keeps the string length unchanged, so
                # the positions of the remaining tokens are not shifted.
                line = line[:start] + "@" * (end - start) + line[end:]

    # Turn each punctuation mark into its own space-separated "<mark>" token.
    line = re.sub(punct, r" <\1>", line)
    words = line.split(" ")
    if not (normalize or tag):
        return "_".join(words)

    parsed = [morph.parse(word)[0] for word in words]
    if normalize:
        words = [parse.normal_form for parse in parsed]
    line = "_".join(words)
    # Only build the tag suffix when tags were requested; previously
    # normalize=True with tag=False failed with NameError on ``tags``.
    if tag:
        tags = [parse.tag.cyr_repr for parse in parsed]
        line += ";" + "_".join(tags)
    return line

In [5]:
def prepare_line(line):
    """Drop every character matched by ``symbs`` and squeeze runs of spaces.

    Returns the filtered text with each run of spaces reduced to one space.
    """
    filtered = symbs.sub("", line)
    return re.sub(r"[ ]+", " ", filtered)

In [6]:
def clear_line(line):
    """Tidy up a line produced by ``morph_line``.

    Each run of ``"@"`` becomes a single ``"<@>"`` token, every match of the
    module-level ``clear`` pattern becomes one space, and directly adjacent
    ``"><"`` pairs are separated with an underscore. Finally, surrounding
    underscores and then surrounding whitespace are stripped.
    """
    result = re.sub("@+", "<@>", line)
    result = clear.sub(" ", result)
    result = result.replace("><", ">_<")
    return result.strip("_").strip()

In [7]:
def proc_line(line, normalize=True, tag=True, ner=True):
    """Run the full pipeline on one line: prepare, annotate, then clean up.

    The ``normalize``, ``tag`` and ``ner`` switches are forwarded unchanged to
    ``morph_line``.
    """
    prepared = prepare_line(line)
    annotated = morph_line(prepared, normalize, tag, ner)
    return clear_line(annotated)

In [8]:
# Pipeline switches read by worker() and forwarded to proc_line().
ner = True          # mask named entities
tag = False         # do not append morphological tags
normalize = False   # keep the original word forms

In [9]:
# Input file that is read line by line, and the output file that receives one
# processed line per input line.
fnin  = "data/merged_sent_split.txt"
fnout = "data/merged_punct_tokenize.csv"

# Single-process variant of the conversion, kept for reference:
# with open(fnin, encoding="utf-8") as fin, open(fnout, "w", encoding="utf-8") as fout:
#     for i, line in enumerate(fin.readlines()):
#         pline = proc_line(line, normalize, tag, ner)
#         print(pline, file=fout, sep='')

In [10]:
def worker(line_t):
    """Process one line with the module-level ``normalize``/``tag``/``ner`` flags.

    Single-argument wrapper around ``proc_line`` so it can be passed to
    ``Pool.map``.
    """
    return proc_line(line_t, normalize=normalize, tag=tag, ner=ner)

In [11]:
from multiprocessing import Pool

# Read the whole input up front so it can be sliced into batches.
with open(fnin, encoding="utf-8") as fin:
    lines = fin.readlines()

total = len(lines)
# Aim for roughly 1000 progress updates. The batch size is clamped to at
# least 1 because inputs shorter than 1000 lines would otherwise give a zero
# step, which range() rejects with ValueError.
step = max(1, total // 1000)

# The context manager shuts the worker processes down when the loop finishes
# or fails, instead of leaving the pool open.
with Pool(8) as p, open(fnout, "w", encoding="utf-8") as fout:
    for i in range(0, total, step):
        # map() preserves input order, so output lines line up with the input.
        plines = p.map(worker, lines[i:i + step])
        print(i, "/", total, end="\r")
        fout.writelines(pline + "\n" for pline in plines)

print("Done")

Done126 / 2126493
