Ссылка на данные: https://github.com/vdobrovolskii/rucoco/releases/

In [1]:
from tg.grammar_ru import *
from tg.grammar_ru import Separator
from yo_fluq_ds import *
from typing import *
import json
import os
import re

In [508]:
# Root folder for the RuCoCo files inside the data cache.
# NOTE(review): Loc presumably comes from the tg.grammar_ru star-import — confirm.
rucoco_dir = Loc.data_cache_path/'rucoco'
# Folder that is scanned for *.json markup files further down
# (presumably the unpacked v1.0.0 release from the link at the top).
data_dir = rucoco_dir/'v1.0.0'

In [26]:
def get_relative_paths(path: str) -> Iterator[str]:
    """Lazily yield the path of every ``.json`` file found below *path*.

    The directory tree is walked with ``recursive_scandir``. Each yielded
    string is the ``DirEntry.path`` of a matching file, i.e. it is prefixed
    with *path* and is relative only when *path* itself is relative.
    """
    return (entry.path
            for entry in recursive_scandir(path)
            if entry.name.endswith(".json"))

def recursive_scandir(path: str) -> Iterator[os.DirEntry]:
    """Yield an ``os.DirEntry`` for every non-directory entry below *path*.

    Sub-directories are descended into as soon as they are encountered and
    are not yielded themselves. Entries arrive in whatever order
    ``os.scandir`` reports them, which is not guaranteed to be sorted.

    Raises whatever ``os.scandir`` raises (e.g. ``FileNotFoundError``) when a
    directory cannot be opened; the error surfaces on first iteration.
    """
    # The context manager closes the directory handle deterministically, even
    # when the consumer abandons the generator before it is exhausted.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir():
                yield from recursive_scandir(entry.path)
            else:
                yield entry
            
def read_markup_dict(path: str) -> dict:
    """Load a UTF-8 JSON markup file and turn every entity span into a tuple.

    The file must contain an ``"entities"`` key holding a list of entities,
    each being a list of spans. The spans are converted from JSON arrays to
    tuples; every other key is returned exactly as parsed.

    Raises ``KeyError`` if the ``"entities"`` key is missing.
    """
    with open(path, mode="r", encoding="utf8") as handle:
        markup = json.load(handle)

    converted = []
    for entity in markup["entities"]:
        converted.append(list(map(tuple, entity)))
    markup["entities"] = converted
    return markup

In [517]:
# Paths of all *.json markup files found under data_dir, materialized once.
json_paths = [*get_relative_paths(data_dir)]
# Word regex used to locate character offsets. Alternatives, in order:
#   1. digits optionally followed by groups of ',' or '-' plus word characters,
#   2. a run of word characters and hyphens,
#   3. a single symbol from the set % & ≈ + ╧ '.
pattern = r'\d+([,-][\w\d]+)*|[-\w]+|[%&≈+╧\']'
# Counters maintained by the conversion loop: documents skipped / frames written.
skip, frame_counter = 0, 0

def get_rucoco_and_tg_dataframe(text):
    """Tokenize *text* with ``Separator`` and attach character offsets to its words.

    Words are located a second time with the module-level ``pattern`` regex,
    which provides start and end character positions. The two word sequences
    are aligned one-to-one over the non-punctuation rows of the ``Separator``
    frame.

    Returns:
        ``None`` when the number of non-punctuation tokens differs from the
        number of regex matches (lone '-' matches excluded). Otherwise the
        ``Separator`` frame left-joined with ``word`` (suffixed ``word_x`` /
        ``word_y``), ``start_letter`` and ``end_letter``. The frame is
        truncated before the first row whose two words disagree.
    """
    df = Separator.separate_string(text)
    tg_words = df[df.word_type != 'punct']

    # A lone hyphen is matched by the regex but is not counted as a word here.
    bounded_words = [
        [match.group(0), match.start(), match.end()]
        for match in re.finditer(pattern, text)
        if match.group(0) != '-'
    ]
    if len(tg_words) != len(bounded_words):
        return None

    # Reuse the index of the non-punctuation rows so the join below is positional
    # within the word sequence; punctuation rows end up with NaN offsets.
    df_bounds = pd.DataFrame(
        bounded_words, columns=['word', 'start_letter', 'end_letter']
    ).set_index(tg_words.index)
    merged = df.merge(df_bounds, left_index=True, right_index=True, how='left')

    disagrees = (merged.word_type != 'punct') & (merged.word_x != merged.word_y)
    fail_index = merged[disagrees].index
    if len(fail_index) == 0:
        return merged
    # NOTE(review): fail_index holds index labels while iloc slices by position;
    # the two coincide only if the Separator frame has a default 0..n-1 index — confirm.
    return merged.iloc[:fail_index[0]]

def get_category_df(entities, text):
    """Build a lookup table of mention boundaries labelled with an entity number.

    Entities are numbered from 1 in the order given. For every span of an
    entity one row ``[-1, end, category]`` is emitted. If the covered text
    contains a space, a second row ``[start, -1, category]`` follows it.
    ``-1`` marks the boundary that the row does not describe.

    Args:
        entities: iterable of entities, each an iterable of spans indexable
            as ``span[0]`` (start offset) and ``span[1]`` (end offset).
        text: the string the offsets refer to.

    Returns:
        DataFrame with columns ``start_letter``, ``end_letter``, ``category``.
    """
    rows = []
    for category, mentions in enumerate(entities, start=1):
        for span in mentions:
            begin, finish = span[0], span[1]
            # The end boundary is recorded for every mention.
            rows.append([-1, finish, category])
            # Only mentions containing a space also get their start recorded.
            if ' ' in text[begin:finish]:
                rows.append([begin, -1, category])
    return pd.DataFrame(rows, columns=['start_letter', 'end_letter', 'category'])

# Convert every discovered document into a token-level frame carrying a
# `category` column and save it as '<frame_counter>.csv' under rucoco_dir/'frames'.
# Documents that cannot be aligned are counted in `skip`.

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(rucoco_dir/'frames', exist_ok=True)

for file_path in json_paths:
    ref_dict = read_markup_dict(file_path)
    # Keep only the part before the 'Источник:' ("Source:") marker.
    # str.partition returns the whole text when the marker is absent, whereas
    # slicing with find()'s -1 would silently cut off the last character.
    text = ref_dict['text'].partition('Источник:')[0]
    merged = get_rucoco_and_tg_dataframe(text)
    if merged is None:
        skip += 1
        continue
    annotated = get_category_df(ref_dict['entities'], text)

    # First join labels tokens that end a mention (-> category_x), the second
    # join labels tokens that start one (-> category_y); unmatched cells become 0.
    # NOTE(review): when several annotated rows share an offset, the left join
    # emits one output row per match, so a token can be duplicated — confirm
    # that this is intended.
    frame = (merged
             .merge(annotated[['end_letter', 'category']], on='end_letter', how='left')
             .merge(annotated[['start_letter', 'category']], on='start_letter', how='left')
             .fillna(0))
    # A token's final label is the sum of its end label and its start label.
    frame['category'] = frame['category_x'] + frame['category_y']
    frame = (frame
             .drop(columns=['start_letter', 'end_letter', 'category_x', 'category_y', 'word_y'])
             .rename(columns={'word_x': 'word'}))
    frame.to_csv(rucoco_dir/'frames'/(str(frame_counter) + '.csv'))
    frame_counter += 1

In [519]:
# Display how many documents were skipped and how many frames were written.
skip, frame_counter

(486, 2589)

In [532]:
def get_frames():
    """Return a ``Queryable`` over the frames saved in ``rucoco_dir/'frames'``.

    Files are read lazily, one ``pandas.read_csv`` call per directory entry,
    in the order ``os.scandir`` reports them. The result wraps a generator.
    """
    def iter_frames():
        # The context manager closes the directory handle even when the
        # consumer stops after a few frames.
        with os.scandir(rucoco_dir/'frames') as entries:
            for entry in entries:
                # 'Unnamed: 0' is the name read_csv gives to the index column
                # that to_csv wrote without a header.
                yield pd.read_csv(entry).drop(columns=['Unnamed: 0'])
    return Queryable(iter_frames())

In [533]:
# Smoke check: print only the first frame produced by get_frames(), then stop.
for i in get_frames():
    print(i)
    break

     word_id  sentence_id  word_index  paragraph_id  word_tail  \
0          0            0           0             0          1   
1          1            0           1             0          1   
2          2            0           2             0          1   
3          3            0           3             0          1   
4          4            0           4             0          1   
..       ...          ...         ...           ...        ...   
363      359           15          24            12          1   
364      360           15          25            12          1   
365      361           15          26            12          1   
366      362           15          27            12          0   
367      363           15          28            12          0   

               word word_type  word_length  category  
0        Компромисс        ru           10       0.0  
1             между        ru            5       0.0  
2        депутатами        ru           10

In [485]:
# Hard-coded absolute path to one markup file on a developer machine;
# no visible cell reads `s`.
s = r'c:\users\alexandra\desktop\grammar_ru\venv\lib\site-packages\grammar_ru_dev-0.0.0-py3.9.egg\data-cache\rucoco\v1.0.0\2000_russia_communist.json'
# Load the first discovered document for the exploratory cells below.
a = read_markup_dict(json_paths[0])

In [486]:
# Keep only the part before the 'Источник:' ("Source:") marker.
# str.partition returns the whole text when the marker is absent, whereas
# slicing with find()'s -1 would silently cut off the last character.
text = a['text'].partition('Источник:')[0]
# Tokenize the trimmed text into a word-level frame.
df = Separator.separate_string(text)

In [489]:
# Word regex: digits with optional ','/'-' continuations, runs of word
# characters and hyphens, or one of the symbols % & ≈ + ╧ '.
pattern = r'\d+([,-][\w\d]+)*|[-\w]+|[%&≈+╧\']'
# [word, start offset, end offset] for every match except a lone hyphen.
words = [
    [m.group(0), m.start(), m.end()]
    for m in re.finditer(pattern, text)
    if m.group(0) != '-'
]
# Give the offsets the index of the non-punctuation rows of `df`, so the two
# tables can be joined by index.
df_bounds = pd.DataFrame(
    words, columns=['word', 'start_letter', 'end_letter']
).set_index(df[df.word_type != 'punct'].index)

In [491]:
# Left-join the character offsets onto the token table by index; punctuation
# rows have no partner in df_bounds and therefore get NaN offsets.
merged = df.merge(df_bounds, left_index=True, right_index=True, how='left')
# Index of the non-punctuation rows where the tokenizer word and the regex word differ.
fail_index = merged[(merged.word_type != 'punct') & (merged.word_x != merged.word_y)].index
if len(fail_index) > 0:
    # Debug marker showing that at least one mismatch was found.
    print('yuy')
    # Keep only the rows before the first mismatch.
    # NOTE(review): fail_index holds index labels while iloc slices by position;
    # they coincide only for a default 0..n-1 index — confirm.
    merged = merged.iloc[:fail_index[0]]

In [493]:
# Inspect the last 20 rows of the aligned table.
merged.tail(20)

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word_x,word_type,word_length,word_y,start_letter,end_letter
344,344,15,9,12,1,депутатов,ru,9,депутатов,2033.0,2042.0
345,345,15,10,12,1,Думы,ru,4,Думы,2043.0,2047.0
346,346,15,11,12,1,и,ru,1,и,2048.0,2049.0
347,347,15,12,12,1,членов,ru,6,членов,2050.0,2056.0
348,348,15,13,12,1,Совета,ru,6,Совета,2057.0,2063.0
349,349,15,14,12,1,Федерации,ru,9,Федерации,2064.0,2073.0
350,350,15,15,12,1,принять,ru,7,принять,2074.0,2081.0
351,351,15,16,12,1,бюджет,ru,6,бюджет,2082.0,2088.0
352,352,15,17,12,1,в,ru,1,в,2089.0,2090.0
353,353,15,18,12,1,том,ru,3,том,2091.0,2094.0


In [118]:
import pandas as pd

# Preview words with their character offsets for the full, untrimmed text.
# NOTE(review): words_and_bounds is not defined in any visible cell —
# presumably a helper from an earlier notebook state; confirm before re-running.
pd.DataFrame(words_and_bounds(a['text']),
                   columns=['word', 'start_letter', 'end_letter'])

Unnamed: 0,word,start_letter,end_letter
0,Компромисс,0,10
1,между,11,16
2,депутатами,17,27
3,и,28,29
4,правительством,30,44
...,...,...,...
301,он,2111,2113
302,был,2114,2117
303,внесен,2118,2124
304,кабинетом,2125,2134


In [91]:
# Character offset of the 'Источник:' marker in the raw text; str.find gives -1 when it is absent.
a['text'].find('Источник:')

2147

In [100]:
# Distinct non-punctuation words from the tokenized frame `df`.
w1 = set(df[df.word_type != 'punct'].word)

In [113]:
# Words that are in w2 but not in w1.
# NOTE(review): w2 is not defined in any visible cell — presumably the word set
# of the untrimmed text; confirm.
w2.difference(w1)

{'https://www.newsru.com/finance/13Sep2000/bujet.html', 'Источник'}

-----
(17, 27) депутатами
(202, 212) депутатами
(1297, 1306) депутатов
(1538, 1548) депутатами
(2033, 2047) депутатов Думы
-----
(30, 44) правительством
(185, 199) правительством
(1551, 1565) правительством
(1629, 1642) правительства
(1886, 1899) правительство
(2002, 2015) правительство
(2125, 2144) кабинетом министров
-----
(78, 127) Глава бюджетного комитета Госдумы Александр Жуков
(233, 240) Депутат
(398, 404) Жукова
(709, 715) Жукова
(1034, 1090) председатель бюджетного комитета Госдумы Александр Жуков
(1166, 1191) главы бюджетного комитета
(1476, 1491) Александр Жуков
(1591, 1593) Он
(1702, 1704) Он
-----
(84, 111) бюджетного комитета Госдумы
(1047, 1074) бюджетного комитета Госдумы
(1172, 1191) бюджетного комитета
-----
(104, 111) Госдумы
(965, 972) Госдуму
(1067, 1074) Госдумы
(1108, 1112) Думе
(2043, 2047) Думы
-----
(918, 926) том виде
(930, 935) каком
-----
(936, 950) проект бюджета
(974, 976) он
(2082, 2088) бюджет
(2111, 2113) он
-----
(1193, 1264) председатель думского ком

In [507]:
# Label tokens that end a mention (category_x after the second join) ...
m = merged.merge(annotated[['end_letter', 'category']], on='end_letter', how='left')
# ... then tokens that start one (category_y); cells without a match become 0.
m = m.merge(annotated[['start_letter', 'category']], on='start_letter', how='left')
m = m.fillna(0)
# A token's final label is the sum of its end label and its start label.
m['category'] = m['category_x'] + m['category_y']
# Drop the helper columns and restore the plain `word` column name.
m = m.drop(columns=['start_letter', 'end_letter', 'category_x', 'category_y', 'word_y'])
m = m.rename(columns={'word_x': 'word'})
m.head(20)

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,category
0,0,0,0,0,1,Компромисс,ru,10,0.0
1,1,0,1,0,1,между,ru,5,0.0
2,2,0,2,0,1,депутатами,ru,10,1.0
3,3,0,3,0,1,и,ru,1,0.0
4,4,0,4,0,1,правительством,ru,14,2.0
5,5,0,5,0,1,по,ru,2,0.0
6,6,0,6,0,1,бюджету,ru,7,0.0
7,7,0,7,0,1,на,ru,2,0.0
8,8,0,8,0,1,2001,unk,4,0.0
9,9,0,9,0,1,год,ru,3,0.0
