# Data

In [1]:
from tqdm import tqdm as log_progress

from itertools import chain
from razdel.utils import (
    join_path,
    list_paths,
    load_lines,
    dump_lines,
    load_xml
)
from razdel.eval.etl.syntag import parse_sents as parse_syntag_sents
from razdel.eval.etl.corpora import parse_sents as parse_corpora_sents
from razdel.eval.etl.gicrya import parse_sents as parse_gicrya_sents
from razdel.eval.etl.rnc import parse_sents as parse_rnc_sents
from razdel.eval.etl.common import group_partitions
from razdel.tests.partition import (
    format_partitions,
    parse_partitions
)

In [2]:
# !mkdir -p data/raw

In [3]:
# !wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/archive/master.zip -P data/raw
# !unzip data/raw/master.zip -d data/raw
# !rm data/raw/master.zip

In [4]:
# paths = list_paths(join_path('data', 'raw', 'UD_Russian-SynTagRus-master', '*.conllu'))
# lines = chain.from_iterable(load_lines(_) for _ in paths)
# records = parse_syntag_sents(lines)
# records = log_progress(records)
# partitions = group_partitions(records)
# lines = format_partitions(partitions)
# dump_lines(lines, join_path('data', 'syntag_sents.txt'))

In [5]:
# Read the SynTagRus partitions file (the one the commented-out ETL cell
# above writes) and materialise the parsed partitions as a list.
syntag_path = join_path('data', 'syntag_sents.txt')
lines = load_lines(syntag_path)
syntag_sents = [*parse_partitions(lines)]

In [6]:
# !wget http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2 -P data/raw
# !bunzip2 -d data/raw/annot.opcorpora.xml.bz2

In [7]:
# path = join_path('data', 'raw', 'annot.opcorpora.xml')
# stream = load_xml(path)
# records = parse_corpora_sents(stream)
# records = log_progress(records)
# partitions = group_partitions(records)
# lines = format_partitions(partitions)
# dump_lines(lines, join_path('data', 'corpora_sents.txt'))

In [8]:
# Read the OpenCorpora partitions file (the one the commented-out ETL cell
# above writes) and materialise the parsed partitions as a list.
corpora_path = join_path('data', 'corpora_sents.txt')
lines = load_lines(corpora_path)
corpora_sents = [*parse_partitions(lines)]

In [9]:
# !wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/GICRYA_texts.zip -P data/raw
# !unzip data/raw/GICRYA_texts.zip -d data/raw
# !rm data/raw/GICRYA_texts.zip

In [11]:
# lines = load_lines(join_path('data', 'raw', 'gikrya_fixed.txt'))
# records = parse_gicrya_sents(lines)
# records = log_progress(records)
# partitions = group_partitions(records)
# lines = format_partitions(partitions)
# dump_lines(lines, join_path('data', 'gicrya_sents.txt'))

In [12]:
# Read the GICRYA partitions file (the one the commented-out ETL cell
# above writes) and materialise the parsed partitions as a list.
gicrya_path = join_path('data', 'gicrya_sents.txt')
lines = load_lines(gicrya_path)
gicrya_sents = [*parse_partitions(lines)]

In [13]:
# !wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/RNC_texts.rar -P data/raw
# !unrar x data/raw/RNC_texts.rar 
# !mv RNCgoldInUD_Morpho.conll data/raw
# !rm data/raw/RNC_texts.rar

In [15]:
# lines = load_lines(join_path('data', 'raw', 'RNCgoldInUD_Morpho.conll'))
# records = parse_rnc_sents(lines)
# records = log_progress(records)
# partitions = group_partitions(records)
# lines = format_partitions(partitions)
# dump_lines(lines, join_path('data', 'rnc_sents.txt'))

In [16]:
# Read the RNC partitions file (the one the commented-out ETL cell
# above writes) and materialise the parsed partitions as a list.
rnc_path = join_path('data', 'rnc_sents.txt')
lines = load_lines(rnc_path)
rnc_sents = [*parse_partitions(lines)]

# Eval

In [17]:
from razdel.eval.zoo import dot_sentenize

# pip install rusenttokenize
from razdel.eval.zoo import deepmipt_sentenize

# pip install nltk
# nltk.download('punkt')
# wget https://raw.githubusercontent.com/mhq/train_punkt/master/russian.pickle
#      -O ~/nltk_data/tokenizers/punkt/PY3/russian.pickle
from razdel.eval.zoo import nltk_sentenize

# pip install segtok
from razdel.eval.zoo import segtok_sentenize

from razdel import sentenize


# Texterra
# One could also compare against https://texterra.ispras.ru/products, but
# 1. it is slow, at the very least because of the HTTP overhead
# 2. it sometimes raises an error (possibly caused by English sentences)
# 3. its quality is only slightly higher than segtok's


# Polyglot
# implements http://www.unicode.org/reports/tr29/

# Sorry, could not get it installed. Serious trouble with ICU
# brew install icu4c
# export ICU_VERSION=62.1
# export BASE=/usr/local/Cellar/icu4c/
# export PATH=$PATH:$BASE/$ICU_VERSION/bin
# export PYICU_INCLUDES=$BASE/$ICU_VERSION/include
# export PYICU_LFLAGS=-L$BASE/$ICU_VERSION/lib
# pip install pyicu polyglot

# It seemed to install, but
# > from polyglot.text import Text
# > Text('...')
# Symbol not found: __ZNK6icu_6214Transliterator12getTargetSetERNS_10UnicodeSetE

In [35]:
%%time
from joblib import Parallel, delayed

from razdel.eval.tests import (
    generate_precision_tests,
    generate_recall_tests,
    correct_precision,
    correct_recall
)
from razdel.eval.report import (
    Dataset,
    Model,
    Metric,
    report_tasks,
    run_task
)

# Evaluation corpora: display name paired with the partitions loaded above.
datasets = [
    Dataset(*spec)
    for spec in (
        ('corpora', corpora_sents),
        ('syntag', syntag_sents),
        ('gicrya', gicrya_sents),
        ('rnc', rnc_sents),
    )
]

# Sentence splitters under comparison: display name paired with the callable.
models = [
    Model(*spec)
    for spec in (
        ('re.split([.?!…])', dot_sentenize),
        ('nltk.sent_tokenize', nltk_sentenize),
        ('segtok.split_single', segtok_sentenize),
        ('deepmipt', deepmipt_sentenize),
        ('razdel.sentenize', sentenize),
    )
]

# Each metric bundles a name, a test generator and a correctness check.
metrics = [
    Metric(*spec)
    for spec in (
        ('precision', generate_precision_tests, correct_precision),
        ('recall', generate_recall_tests, correct_recall),
    )
]

# Materialise the tasks built from datasets, models and metrics so the
# progress bar knows the total up front.
tasks = list(report_tasks(datasets, models, metrics))

# Run the tasks on 4 joblib workers. The progress bar wraps the task iterable,
# so it advances as tasks are pulled for dispatch, not as they complete.
runner = Parallel(n_jobs=4)
results = runner(
    delayed(run_task)(task)
    for task in log_progress(tasks)
)

100%|██████████| 40/40 [01:57<00:00,  2.95s/it]


CPU times: user 9.68 s, sys: 5.07 s, total: 14.7 s
Wall time: 2min 23s


In [43]:
import pandas as pd

from razdel.eval.report import (
    result_rows,
    show_results
)

# Collect the per-task results into a flat frame.
table = pd.DataFrame(result_rows(results))

# Build the rounded report once and reuse it, so the printed HTML and the
# displayed table come from the same object and show_results is not run twice.
# NOTE(review): assumes show_results returns a pandas DataFrame (it is used
# with .round and .to_html) — confirm against razdel.eval.report.
report = show_results(table).round(5)

# Print the raw HTML markup (handy for pasting into docs) ...
print(report.to_html())
# ... and leave the frame as the last expression for the rich notebook display.
report

<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th colspan="2" halign="left">corpora</th>
      <th colspan="2" halign="left">syntag</th>
      <th colspan="2" halign="left">gicrya</th>
      <th colspan="2" halign="left">rnc</th>
    </tr>
    <tr>
      <th></th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>re.split([.?!…])</th>
      <td>0.94223</td>
      <td>0.97420</td>
      <td>0.97146</td>
      <td>0.99301</td>
      <td>0.96976</td>
      <td>0.97880</td>
      <td>0.93066</td>
      <td>0.98311</td>
    </tr>
    <tr>
      <th>nltk.sent_tokenize</th>
      <td>0.96190</td>
      <td>0.95668</td>
      <td>0.98459</td>
      <td>0.98435</td>
      <td>0.98961</td>
      <td>0.98258</td>
      <td>0.93457</td>
      <td>0.94552</td>
    </tr>
    <tr>
   

Unnamed: 0_level_0,corpora,corpora,syntag,syntag,gicrya,gicrya,rnc,rnc
Unnamed: 0_level_1,precision,recall,precision,recall,precision,recall,precision,recall
re.split([.?!…]),0.94223,0.9742,0.97146,0.99301,0.96976,0.9788,0.93066,0.98311
nltk.sent_tokenize,0.9619,0.95668,0.98459,0.98435,0.98961,0.98258,0.93457,0.94552
segtok.split_single,0.97628,0.92859,0.98234,0.98558,0.98855,0.01466,0.9873,0.10452
deepmipt,0.99048,0.96145,0.99815,0.99239,0.97936,0.97663,0.93721,0.98397
razdel.sentenize,0.99189,0.97895,0.99462,0.99547,0.98396,0.9434,0.9756,0.97054


<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th colspan="2" halign="left">corpora</th>
      <th colspan="2" halign="left">syntag</th>
      <th colspan="2" halign="left">gicrya</th>
      <th colspan="2" halign="left">rnc</th>
    </tr>
    <tr>
      <th></th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
      <th>precision</th>
      <th>recall</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>re.split([.?!…])</th>
      <td>0.94223</td>
      <td>0.97420</td>
      <td>0.97146</td>
      <td>0.99301</td>
      <td>0.96976</td>
      <td>0.97880</td>
      <td>0.93066</td>
      <td>0.98311</td>
    </tr>
    <tr>
      <th>nltk.sent_tokenize</th>
      <td>0.96190</td>
      <td>0.95668</td>
      <td>0.98459</td>
      <td>0.98435</td>
      <td>0.98961</td>
      <td>0.98258</td>
      <td>0.93457</td>
      <td>0.94552</td>
    </tr>
    <tr>
      <th>segtok.split_single</th>
      <td>0.97628</td>
      <td>0.92859</td>
      <td>0.98234</td>
      <td>0.98558</td>
      <td>0.98855</td>
      <td>0.01466</td>
      <td>0.98730</td>
      <td>0.10452</td>
    </tr>
    <tr>
      <th>deepmipt</th>
      <td>0.99048</td>
      <td>0.96145</td>
      <td>0.99815</td>
      <td>0.99239</td>
      <td>0.97936</td>
      <td>0.97663</td>
      <td>0.93721</td>
      <td>0.98397</td>
    </tr>
    <tr>
      <th>razdel.sentenize</th>
      <td>0.99189</td>
      <td>0.97895</td>
      <td>0.99462</td>
      <td>0.99547</td>
      <td>0.98396</td>
      <td>0.94340</td>
      <td>0.97560</td>
      <td>0.97054</td>
    </tr>
  </tbody>
</table>