In [3]:
from pymystem3 import Mystem
import logging
import math
import pandas as pd
import re

from random import random
from pandas.core.frame import DataFrame

## Имена файлов

In [16]:
# Input: raw text that is fed line by line to the morphological analyzer.
text_file = "data/text.txt"
# Intermediate: one "grammar description <TAB> lemma <TAB> word form" record per line.
pairs_with_gr_file = "data/pairs_with_grammar.tsv"
# Intermediate: pairs of (lemma, form) records that share a grammar description.
relations_pairs_file = "data/relations_pairs.tsv"
# Outputs: training and testing parts of the relation pairs (comma-separated).
train_file = "data/train.csv"
test_file = "data/test.csv"
# Intermediate: randomly thinned copy of the relation pairs, used when factor != 1.
disperse_file = "data/disperse_data.tsv"

## Настройка логгера

In [4]:
# Root logger: every record at DEBUG level and above is written to a log file.
logging.basicConfig(filename='preparation_results.log',
                    format='[%(asctime)s] [%(levelname)s] %(message)s',
                    level=logging.DEBUG)

# Notebook logger "L": additionally echoes records to the console / cell output.
lg = logging.getLogger("L")
lg.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

# logging.getLogger returns the same logger object on every call, so blindly
# calling addHandler each time this cell is re-run stacks another console
# handler and every message gets printed several times. Reuse the console
# handler if the logger already has one. The exact type check is deliberate:
# FileHandler is a subclass of StreamHandler and must not be mistaken for it.
ch = next((h for h in lg.handlers if type(h) is logging.StreamHandler), None)
if ch is None:
    ch = logging.StreamHandler()
    lg.addHandler(ch)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

## Загрузка стеммера

In [24]:
# Create the Mystem analyzer object. The log records before and after the
# constructor call make the time spent on it visible in the log.
lg.info("Loading mystem")
m = Mystem()
lg.info("Loaded mystem")

2016-10-30 22:50:54,851 [INFO] Loading mystem
2016-10-30 22:50:54,851 [INFO] Loading mystem
2016-10-30 22:50:54,857 [INFO] Loaded mystem
2016-10-30 22:50:54,857 [INFO] Loaded mystem


## Вспомогательные функции

In [25]:
def parse_gr(gr):
    """Expand the alternatives inside a grammar description string.

    If ``gr`` contains a parenthesised group, the text inside the parentheses
    is split on ``|`` and one string is yielded per alternative, with the
    group (parentheses included) replaced by that alternative. Only the first
    group found is expanded. A string without such a group is yielded as is.

    Example: ``"S=(nom|acc)"`` yields ``"S=nom"`` and then ``"S=acc"``.

    :param gr: grammar description string.
    :return: generator of expanded grammar description strings.
    """
    # Raw string literal: in a plain literal "\(" and "\)" are invalid escape
    # sequences, which newer Python versions report with a warning. The
    # pattern contains no letters, so no case-insensitivity flag is needed.
    options = re.search(r'\(([^\)]*)\)', gr)

    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr

## Запись пар с их грамматической информацией

Создаем множество строк вида  
«грамматическое описание, начальная форма, слово» (поля разделены табуляцией)

In [27]:
# Unique records "grammar description <TAB> lemma <TAB> lower-cased word form",
# each already terminated by a newline so it can be written to a file as is.
lines = set()

# The encoding is given explicitly instead of relying on the platform default;
# the final stage of this notebook reads the derived data as UTF-8.
# NOTE(review): assumes the source text itself is UTF-8 -- confirm.
with open(text_file, "rt", encoding="utf-8") as input_file:
    lg.info("file opened")

    for line in input_file:
        for w in m.analyze(line):
            # Tokens for which the analyzer returned no 'analysis' key are skipped.
            if 'analysis' not in w:
                continue
            for item in w['analysis']:
                word = w['text'].lower()
                # One record per alternative of the grammar description.
                for gramm_info in parse_gr(item['gr']):
                    lines.add("\t".join([gramm_info, item['lex'], word]) + "\n")


2016-10-30 22:50:58,414 [INFO] file opened
2016-10-30 22:50:58,414 [INFO] file opened


Записываем их в файл pairs_with_grammar

In [29]:
# Persist the collected records. Every element of `lines` already ends with a
# newline, so the whole set can be handed to writelines() directly.
# UTF-8 is set explicitly so the file does not depend on the platform default
# encoding (the last stage of the notebook reads its data as UTF-8).
with open(pairs_with_gr_file, "wt", encoding="utf-8") as f:
    f.writelines(lines)

## Запись данных для обучения

Читаем файл

In [31]:
# Mapping: grammar description -> list of (lemma, word form) tuples.
# NOTE: the name `dict` shadows the built-in type; it is kept unchanged because
# the following cell reads the mapping under this name.
dict = {}

# The context manager closes the file deterministically (the bare open() in a
# for-loop header never closed it); the encoding is explicit for the same
# reason as everywhere else in the pipeline.
with open(pairs_with_gr_file, "rt", encoding="utf-8") as pairs_file:
    for line in pairs_file:
        record = line.strip()
        if record:  # skip blank lines
            desc, normal, form = record.split("\t")
            dict.setdefault(desc, []).append((normal, form))

lg.info("Pairs acquired")

2016-10-30 22:56:07,484 [INFO] Pairs acquired
2016-10-30 22:56:07,484 [INFO] Pairs acquired


Затем для каждого грамматического описания строим декартово произведение его пар (нач. форма, слово), пропуская сочетания пары с самой собой

In [32]:
# For every grammar description write all ordered combinations of two
# different (lemma, form) pairs as "lemma1 <TAB> form1 <TAB> lemma2 <TAB> form2".
# The context manager guarantees the file is closed even if writing fails;
# plain "w" is enough because nothing is read back from the file here.
with open(relations_pairs_file, "w", encoding="utf-8") as writer:
    for pairs in dict.values():
        for p0 in pairs:
            for p1 in pairs:
                if p0 != p1:
                    # p0 and p1 are 2-tuples, so p0 + p1 is the 4 output fields.
                    writer.write("\t".join(p0 + p1) + "\n")

lg.info("Relations pairs acquired")

2016-10-30 22:57:31,722 [INFO] Relations pairs acquired
2016-10-30 22:57:31,722 [INFO] Relations pairs acquired


Прореживаем данные, если нужно (уменьшаем размер примерно в factor раз)

In [17]:
# Thinning factor: each line is kept with probability 1 / factor, so the
# output is about `factor` times smaller. factor == 1 disables thinning.
factor = 1000

if factor != 1:
    keep_probability = 1.0 / factor
    # NOTE(review): the random generator is not seeded in this cell, so the
    # thinned file differs from run to run.
    # Explicit UTF-8 on both sides: the thinned file is later read as UTF-8.
    with open(relations_pairs_file, "rt", encoding="utf-8") as input_, \
            open(disperse_file, "wt", encoding="utf-8") as output:
        for line in input_:
            if random() < keep_probability:
                output.write(line)
    lg.info("Use Disperse data")

2016-10-30 23:43:20,097 [INFO] Use Disperse data


Делим данные на данные для обучения и для тестирования (было в функции prepare_relations)

In [18]:
# Fraction of the rows that goes to the training set (replaced below by the
# corresponding row index).
splitting = 0.9

all_data_list = pd.read_csv(relations_pairs_file if factor == 1 else disperse_file,
                            header=None, encoding="utf-8", sep="\t")
# dropna() returns a new frame instead of modifying the original, so the
# result has to be assigned; a bare call leaves rows with missing values in.
all_data_list = all_data_list.dropna()

# NOTE(review): the rows are not shuffled, so the split follows the order of
# the input file; if that file is grouped (e.g. by grammar description), the
# test part is not a random sample. Confirm whether shuffling is wanted.

splitting = int(math.floor(splitting * len(all_data_list)))
# Positional slicing: the first `splitting` rows train, the rest test.
train_ds = all_data_list.iloc[:splitting]
test_ds = all_data_list.iloc[splitting:]

train_ds.to_csv(train_file, encoding="utf-8", index=False, header=False, sep=",", quotechar='"')
test_ds.to_csv(test_file, encoding="utf-8", index=False, header=False, sep=",", quotechar='"')
lg.info("Data acquired")

2016-10-30 23:44:05,907 [INFO] Data acquired
