## SummaRuNNer

Загрузка данных:

In [None]:
!wget https://www.dropbox.com/s/mbj3sb6jaw3d9s3/judgements_test.json
!wget https://www.dropbox.com/s/hbh7ioaiet16fcw/judgements_train.json

Установка необходимых пакетов:

In [None]:
!pip install youtokentome rouge

Collecting youtokentome
[?25l  Downloading https://files.pythonhosted.org/packages/c8/1c/224cdc3d9a32ed706c8fb1f30b491be6ea5da114ff4edc174014cc24fa43/youtokentome-1.0.6-cp37-cp37m-manylinux2010_x86_64.whl (1.7MB)
[K     |▏                               | 10kB 3.3MB/s eta 0:00:01[K     |▍                               | 20kB 5.7MB/s eta 0:00:01[K     |▋                               | 30kB 7.8MB/s eta 0:00:01[K     |▊                               | 40kB 9.4MB/s eta 0:00:01[K     |█                               | 51kB 10.2MB/s eta 0:00:01[K     |█▏                              | 61kB 11.5MB/s eta 0:00:01[K     |█▍                              | 71kB 11.5MB/s eta 0:00:01[K     |█▌                              | 81kB 7.8MB/s eta 0:00:01[K     |█▊                              | 92kB 8.5MB/s eta 0:00:01[K     |██                              | 102kB 9.1MB/s eta 0:00:01[K     |██                              | 112kB 9.1MB/s eta 0:00:01[K     |██▎                    

In [None]:
import json
from rouge import Rouge
import youtokentome as yttm
from tqdm.notebook import tqdm

In [None]:
# Load the train / validation / test splits from UTF-8 JSON files in the working directory.
# NOTE(review): the download cell fetches judgements_*.json, while the paths below are
# summa_data_*.json -- confirm where the summa_data_* files come from.
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return the decoded object."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


data_train = _read_json('summa_data_train.json')
data_val = _read_json('summa_data_val.json')
data_test = _read_json('summa_data_test.json')

Функция, которая записывает тексты и аннотации во временный файл и обучает на нём BPE-токенизатор:

In [None]:
def train_bpe(records, model_path, model_type="bpe", vocab_size=10000):
    """Train a YouTokenToMe BPE model on the texts and summaries in *records*.

    Each record's ``'text'`` and ``'summary'`` are written, one per line, to
    ``temp.txt`` in the working directory; that file is then used as the
    training corpus. The file is left in place afterwards.

    Args:
        records: mapping from record id to a dict with ``'text'`` and
            ``'summary'`` string entries.
        model_path: path the trained model is written to.
        model_type: accepted for backward compatibility; currently unused.
        vocab_size: vocabulary size passed to ``yttm.BPE.train``.
    """
    temp_file_name = "temp.txt"
    # Write the corpus explicitly as UTF-8: the data files are read as UTF-8,
    # and the platform default encoding may not be able to encode the texts.
    with open(temp_file_name, "w", encoding="utf-8") as temp:
        for record in tqdm(records.values()):
            temp.write(record['text'] + "\n")
            temp.write(record['summary'] + "\n")
    yttm.BPE.train(data=temp_file_name, vocab_size=vocab_size, model=model_path)


train_bpe(data_train, "BPE_model.bin")

HBox(children=(FloatProgress(value=0.0, max=12925.0), HTML(value='')))




Создание объекта с BPE токенизатором (он будет нужен для получения предсказаний) и словаря

In [None]:
# Load the BPE model trained above; it is used later to encode sentences into token ids.
bpe_processor = yttm.BPE('BPE_model.bin')
# Result of the model's vocab() call; its length is later passed to the network as the vocabulary size.
vocabulary = bpe_processor.vocab()

Добавление валидационной выборки:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training ids (fixed seed, so the split is reproducible) for validation.
_, val_keys = train_test_split(list(data_train), test_size=0.15, random_state=10)

# Move (not copy) every held-out record from data_train into a fresh data_val.
# This replaces the data_val that was loaded from summa_data_val.json earlier.
data_val = {key: data_train.pop(key) for key in val_keys}

In [None]:
import math
import nltk
import numpy as np
from nltk import sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

За основу были взяты материалы семинаров Школы глубокого обучения на базе ФПМИ МФТИ (https://www.dlschool.org)

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

В качестве верных меток алгоритм берет индексы предложений, входящих в итоговое summary. Для авторских аннотаций получить такие индексы напрямую невозможно: они написаны с нуля и не содержат исходных предложений судебного документа. Поэтому нужно сгенерировать такие extractive summary (oracle summary), которые максимально похожи на образцовые, т.е. для которых метрика ROUGE максимально высока

In [None]:
import copy

def build_oracle_summary_greedy(text, gold_summary, calc_score, max_sentences=100):
    """Greedily build an extractive "oracle" summary of *text*.

    The text is split with ``sent_tokenize``; sentences of length <= 1 are
    dropped and only the first *max_sentences* remaining ones are considered.
    Each round tries adding every not-yet-selected sentence to the current
    selection, scores the resulting summary against *gold_summary*, and keeps
    the best candidate. The search stops as soon as a round fails to strictly
    improve the score.

    Args:
        text: source document.
        gold_summary: reference summary the candidates are scored against.
        calc_score: callable ``(candidate_summary, gold_summary) -> score``;
            higher is better.
        max_sentences: number of leading sentences eligible for selection.

    Returns:
        A tuple ``(oracle_summary, oracle_summary_sentences)``: the selected
        sentences joined by spaces in document order, and the set of their
        indices into the filtered sentence list.
    """
    sentences = [s for s in sent_tokenize(text) if len(s) > 1][:max_sentences]
    n_sentences = len(sentences)
    selected = set()
    best_score = -1.0

    # Every round adds at most one sentence, so n_sentences rounds are enough.
    for _ in range(n_sentences):
        round_best_score = None
        round_best_selection = None
        for index in range(n_sentences):
            if index in selected:
                continue
            # Candidate = current selection plus one more sentence.
            candidate = selected | {index}
            candidate_summary = " ".join(sentences[i] for i in sorted(candidate))
            candidate_score = calc_score(candidate_summary, gold_summary)
            # Strict comparison: on equal scores the earliest sentence wins.
            if round_best_score is None or candidate_score > round_best_score:
                round_best_score = candidate_score
                round_best_selection = candidate

        # Stop when no candidate strictly improves on the current selection.
        if round_best_score is None or round_best_score <= best_score:
            break
        selected = round_best_selection
        best_score = round_best_score

    oracle_summary = " ".join(sentences[i] for i in sorted(selected))
    return oracle_summary, selected

def calc_single_score(pred_summary, gold_summary, rouge):
    """Return the ROUGE-2 F-score of *pred_summary* against *gold_summary*.

    Args:
        pred_summary: candidate summary text.
        gold_summary: reference summary text.
        rouge: scorer exposing ``get_scores(hyps, refs, avg=True)`` whose
            result contains ``['rouge-2']['f']``.

    Returns:
        The ROUGE-2 F-score, or ``0.0`` when the prediction is empty or
        whitespace only (the scorer is not called in that case).
    """
    # The previous guard tested len([pred_summary]), which is always 1 and so
    # never fired; check the prediction text itself instead.
    if not pred_summary.strip():
        return 0.0
    return rouge.get_scores([pred_summary], [gold_summary], avg=True)['rouge-2']['f']

Сохраним эти сгенерированные аннотации в общий словарь с данными

In [None]:
rouge = Rouge()
for record in tqdm(data_train.values()):
    text = record["text"]
    summary = record["summary"]

    # The filter and the 100-sentence cap must mirror build_oracle_summary_greedy
    # (len(s) > 1, default max_sentences=100); otherwise the oracle indices
    # would refer to different positions than the stored sentence list.
    sentences = [s for s in sent_tokenize(text) if len(s) > 1][:100]
    oracle_summary, sentences_indicies = build_oracle_summary_greedy(
        text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge))
    record["sentences"] = sentences
    record["oracle_sentences"] = sorted(sentences_indicies)
    record["oracle_summary"] = oracle_summary

HBox(children=(FloatProgress(value=0.0, max=12925.0), HTML(value='')))




То же самое для валидационной и тестовой выборок

In [None]:
for record in tqdm(data_val.values()):
    text = record["text"]
    summary = record["summary"]

    # The filter and the 100-sentence cap must mirror build_oracle_summary_greedy
    # (len(s) > 1, default max_sentences=100); otherwise the oracle indices
    # would refer to different positions than the stored sentence list.
    sentences = [s for s in sent_tokenize(text) if len(s) > 1][:100]
    oracle_summary, sentences_indicies = build_oracle_summary_greedy(
        text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge))
    record["sentences"] = sentences
    record["oracle_sentences"] = sorted(sentences_indicies)
    record["oracle_summary"] = oracle_summary

HBox(children=(FloatProgress(value=0.0, max=2282.0), HTML(value='')))




In [None]:
for record in tqdm(data_test.values()):
    text = record["text"]
    summary = record["summary"]

    # The filter and the 100-sentence cap must mirror build_oracle_summary_greedy
    # (len(s) > 1, default max_sentences=100); otherwise the oracle indices
    # would refer to different positions than the stored sentence list.
    sentences = [s for s in sent_tokenize(text) if len(s) > 1][:100]
    oracle_summary, sentences_indicies = build_oracle_summary_greedy(
        text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge))
    record["sentences"] = sentences
    record["oracle_sentences"] = sorted(sentences_indicies)
    record["oracle_summary"] = oracle_summary

HBox(children=(FloatProgress(value=0.0, max=2684.0), HTML(value='')))




Класс, который будет позволять итерироваться батчами по данным:

In [None]:
class BatchIterator():
    """Iterate over records in mini-batches of zero-padded tensors.

    Each record is a dict that must provide ``'sentences'`` (list of strings)
    and ``'oracle_sentences'`` (indices of the sentences labelled positive).

    Every yielded batch is a dict with:
        ``'inputs'``  - long tensor ``[n_records, n_sentences, n_tokens]`` of
                        BPE token ids, zero-padded;
        ``'outputs'`` - float32 tensor ``[n_records, n_sentences]`` with 1.0
                        for oracle sentences and 0.0 elsewhere (padding too);
        ``'records'`` - the source records, in the same order as the rows.

    ``n_records`` is the real number of records in the batch, so the last
    batch may be smaller than ``batch_size``.
    """

    def __init__(self, records, vocabulary, batch_size, bpe_processor, shuffle=True, max_sentences=100, max_sentence_length=50, device=torch.device('cpu')):
        """Store the configuration; no tensors are built until iteration.

        Args:
            records: mapping from record id to record dict.
            vocabulary: stored on the instance; not used by the iterator itself.
            batch_size: maximum number of records per batch.
            bpe_processor: object with ``encode(sentence)`` returning token ids.
            shuffle: reshuffle the record order at the start of every pass.
            max_sentences: keep at most this many leading sentences per record.
            max_sentence_length: keep at most this many leading tokens per sentence.
            device: device the yielded tensors are moved to.
        """
        self.records = records
        self.num_samples = len(records)
        self.batch_size = batch_size
        self.bpe_processor = bpe_processor
        self.shuffle = shuffle
        self.batches_count = int(math.ceil(self.num_samples / batch_size))
        self.rouge = Rouge()
        self.vocabulary = vocabulary
        self.max_sentences = max_sentences
        self.max_sentence_length = max_sentence_length
        self.device = device

    def __len__(self):
        """Return the number of batches in one pass over the data."""
        return self.batches_count

    def __iter__(self):
        """Yield one batch dict per ``batch_size`` records (see class docstring)."""
        indices = list(self.records.keys())
        if self.shuffle:
            np.random.shuffle(indices)

        for start in range(0, self.num_samples, self.batch_size):
            batch_indices = indices[start:start + self.batch_size]
            batch_records = [self.records[key] for key in batch_indices]

            batch_inputs = []
            batch_outputs = []
            for record in batch_records:
                sentences = record['sentences'][:self.max_sentences]
                oracle_indices = set(record['oracle_sentences'])

                # Token ids of every sentence, truncated to the length limit.
                # The instance's own tokenizer is used, not a module-level global.
                batch_inputs.append([
                    self.bpe_processor.encode(sentence)[:self.max_sentence_length]
                    for sentence in sentences
                ])
                # Binary label per sentence: 1.0 if it belongs to the oracle summary.
                batch_outputs.append([float(i in oracle_indices) for i in range(len(sentences))])

            # default=0 keeps records without sentences from raising on max().
            max_sentences = max((len(inputs) for inputs in batch_inputs), default=0)
            max_sentence_length = max(
                (len(tokens) for inputs in batch_inputs for tokens in inputs),
                default=0,
            )

            # Size the tensors by the real number of records so that a short
            # last batch does not contain phantom all-zero documents. They are
            # filled on the CPU and moved to the target device once.
            n_records = len(batch_records)
            tensor_inputs = torch.zeros((n_records, max_sentences, max_sentence_length), dtype=torch.long)
            tensor_outputs = torch.zeros((n_records, max_sentences), dtype=torch.float32)

            for i, inputs in enumerate(batch_inputs):
                for j, sentence_tokens in enumerate(inputs):
                    if sentence_tokens:
                        tensor_inputs[i, j, :len(sentence_tokens)] = torch.tensor(sentence_tokens, dtype=torch.long)

            for i, outputs in enumerate(batch_outputs):
                if outputs:
                    tensor_outputs[i, :len(outputs)] = torch.tensor(outputs, dtype=torch.float32)

            yield {
                'inputs': tensor_inputs.to(self.device),
                'outputs': tensor_outputs.to(self.device),
                'records': batch_records
            }

In [None]:
# Only the training data needs reshuffling on every pass; validation and test
# batches keep a fixed order so evaluation runs are reproducible.
train_iterator = BatchIterator(data_train, vocabulary, 10, bpe_processor, device=device)
val_iterator = BatchIterator(data_val, vocabulary, 10, bpe_processor, shuffle=False, device=device)
test_iterator = BatchIterator(data_test, vocabulary, 10, bpe_processor, shuffle=False, device=device)

Главный цикл обучения модели:

In [None]:
import torch.nn as nn
import torch.optim as optim
import time

def train_model(model, train_iterator, val_iterator, vocabulary, bpe_processor,
                epochs_count=2, loss_every_nsteps=16, lr=0.001, device_name="cuda"):
    """Train *model* with Adam and ``BCEWithLogitsLoss``, printing progress.

    Whenever ``step % loss_every_nsteps == 0`` (except at step 0) the full
    validation iterator is evaluated and one progress line is printed with the
    mean training loss since the previous report and the mean validation loss.

    Args:
        model: module mapping ``batch["inputs"]`` to logits shaped like
            ``batch["outputs"]``.
        train_iterator: iterable of batch dicts with ``"inputs"`` and ``"outputs"``.
        val_iterator: iterable of batch dicts used for validation.
        vocabulary: unused; kept for interface compatibility.
        bpe_processor: unused; kept for interface compatibility.
        epochs_count: number of passes over *train_iterator*.
        loss_every_nsteps: reporting period, in training steps.
        lr: Adam learning rate.
        device_name: device the model is moved to. Batches are NOT moved here,
            so the iterators must already yield tensors on this device.
    """
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))

    device = torch.device(device_name)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.BCEWithLogitsLoss().to(device)

    for epoch in range(epochs_count):
        # The running statistics are restarted at the beginning of every epoch.
        total_loss = 0.0
        steps_in_window = 0
        start_time = time.time()

        for step, batch in enumerate(train_iterator):
            model.train()  # validation below switches the model to eval mode
            logits = model(batch["inputs"])  # forward pass

            loss = loss_function(logits, batch["outputs"])

            optimizer.zero_grad()  # clear gradients left from the previous step
            loss.backward()  # compute dL/dw
            optimizer.step()  # Adam update

            total_loss += float(loss)
            steps_in_window += 1

            if step % loss_every_nsteps == 0 and step != 0:
                avg_val_loss = _mean_validation_loss(model, val_iterator, loss_function)
                # Divide by the number of steps actually accumulated: the first
                # window of an epoch covers steps 0..loss_every_nsteps, i.e. one
                # step more than the following windows.
                avg_train_loss = total_loss / steps_in_window
                print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, avg_train_loss, avg_val_loss, time.time() - start_time))
                total_loss = 0.0
                steps_in_window = 0
                start_time = time.time()


def _mean_validation_loss(model, val_iterator, loss_function):
    """Return the mean per-batch loss over *val_iterator* (NaN if it yields no batches)."""
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    with torch.no_grad():
        for val_batch in val_iterator:
            logits = model(val_batch["inputs"])
            # Accumulate a Python float rather than a tensor.
            loss_sum += float(loss_function(logits, val_batch["outputs"]))
            batch_count += 1
    return loss_sum / batch_count if batch_count else float("nan")

Инициализация самой модели:

In [None]:
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class SentenceEncoderRNN(nn.Module):
    """Encode each token-id sequence into one vector with an LSTM.

    The vector is the mean of the LSTM outputs over the token axis; its size
    equals the ``hidden_size`` passed to the constructor (the per-direction
    size is ``hidden_size // num_directions``).
    """

    def __init__(self, input_size, embedding_dim, hidden_size, n_layers=3, dropout=0.3, bidirectional=True):
        """
        Args:
            input_size: number of rows in the embedding table.
            embedding_dim: token embedding size.
            hidden_size: size of the produced sentence vector; must be
                divisible by the number of directions.
            n_layers: number of stacked LSTM layers.
            dropout: dropout between LSTM layers (ignored for a single layer).
            bidirectional: whether the LSTM is bidirectional.
        """
        super().__init__()

        num_directions = 2 if bidirectional else 1
        assert hidden_size % num_directions == 0
        hidden_size = hidden_size // num_directions

        self.embedding_dim = embedding_dim
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.bidirectional = bidirectional

        self.embedding_layer = nn.Embedding(input_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers and warns when it
        # is non-zero for a single layer, so pass 0 in that case.
        self.rnn_layer = nn.LSTM(embedding_dim, hidden_size, n_layers,
                                 dropout=dropout if n_layers > 1 else 0.0,
                                 bidirectional=bidirectional, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, inputs, hidden=None):
        """Return the mean LSTM output over dim 1 for *inputs* of token ids."""
        embedded = self.embedding_layer(inputs)
        outputs, _ = self.rnn_layer(embedded, hidden)
        # Average over the token axis (padding positions are included).
        sentences_embeddings = torch.mean(outputs, 1)
        # [n_sequences, hidden_size * num_directions]
        return sentences_embeddings


class SentenceTaggerRNN(nn.Module):
    """Score every sentence of a document for inclusion in the summary.

    Sentences are encoded by ``SentenceEncoderRNN``, passed through a
    document-level LSTM, and each one receives a logit equal to the sum of a
    content term and a salience term computed against a document embedding.
    """

    def __init__(self,
                 vocabulary_size,
                 token_embedding_dim=256,
                 sentence_encoder_hidden_size=256,
                 hidden_size=256,
                 bidirectional=True,
                 sentence_encoder_n_layers=2,
                 sentence_encoder_dropout=0.3,
                 sentence_encoder_bidirectional=True,
                 n_layers=1,
                 dropout=0.3):
        """
        Args:
            vocabulary_size: number of token ids.
            token_embedding_dim: token embedding size.
            sentence_encoder_hidden_size: size of a sentence vector.
            hidden_size: size of the document-level LSTM output; must be
                divisible by the number of directions.
            bidirectional: whether the document-level LSTM is bidirectional.
            sentence_encoder_n_layers: LSTM layers in the sentence encoder.
            sentence_encoder_dropout: dropout inside the sentence encoder.
            sentence_encoder_bidirectional: direction setting of the encoder.
            n_layers: layers of the document-level LSTM.
            dropout: dropout applied to the document-level LSTM outputs (and
                between its layers when ``n_layers > 1``).
        """
        super().__init__()

        num_directions = 2 if bidirectional else 1
        assert hidden_size % num_directions == 0
        hidden_size = hidden_size // num_directions

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.bidirectional = bidirectional

        self.sentence_encoder = SentenceEncoderRNN(vocabulary_size, token_embedding_dim,
                                                   sentence_encoder_hidden_size, sentence_encoder_n_layers,
                                                   sentence_encoder_dropout, sentence_encoder_bidirectional)

        # Same single-layer guard as in the encoder: avoids the PyTorch warning
        # about non-zero dropout with num_layers=1 without changing the result.
        self.rnn_layer = nn.LSTM(sentence_encoder_hidden_size, hidden_size, n_layers,
                                 dropout=dropout if n_layers > 1 else 0.0,
                                 bidirectional=bidirectional, batch_first=True)

        # Feature size of the LSTM output; derived from the direction count
        # instead of a hard-coded "* 2" so bidirectional=False also works.
        output_size = hidden_size * num_directions

        self.dropout_layer = nn.Dropout(dropout)
        self.content_linear_layer = nn.Linear(output_size, 1)
        self.document_linear_layer = nn.Linear(output_size, output_size)
        self.salience_linear_layer = nn.Linear(output_size, output_size)
        self.tanh_layer = nn.Tanh()

    def forward(self, inputs, hidden=None):
        """Return one logit per sentence.

        Args:
            inputs: 3-D tensor of token ids, ``[batch_size, sentences, tokens]``.
            hidden: optional initial state for the document-level LSTM.

        Returns:
            Tensor ``[batch_size, sentences]`` of unnormalised scores.
        """
        batch_size = inputs.size(0)
        sentences_count = inputs.size(1)
        tokens_count = inputs.size(2)
        # Flatten documents so every sentence is encoded independently:
        # [batch_size * sentences, tokens]
        inputs = inputs.reshape(-1, tokens_count)

        embedded_sentences = self.sentence_encoder(inputs)
        # [batch_size * sentences, sent_dim] -> [batch_size, sentences, sent_dim]
        embedded_sentences = embedded_sentences.reshape(batch_size, sentences_count, -1)

        outputs, _ = self.rnn_layer(embedded_sentences, hidden)
        outputs = self.dropout_layer(outputs)
        # [batch_size, sentences, output_size]

        # Document embedding: tanh(W_d * mean of the sentence states).
        document_embedding = self.tanh_layer(self.document_linear_layer(torch.mean(outputs, 1)))
        # [batch_size, output_size]

        # Content term: W_c * h
        content = self.content_linear_layer(outputs).squeeze(2)
        # [batch_size, sentences]

        # Salience term: h^T * W_s * d
        # [batch_size, sentences, output_size] x [batch_size, output_size, 1] -> [batch_size, sentences]
        salience = torch.bmm(outputs, self.salience_linear_layer(document_embedding).unsqueeze(2)).squeeze(2)

        return content + salience

Процесс обучения:

In [None]:
model = SentenceTaggerRNN(len(vocabulary))
# Train on the device selected earlier (the batch iterators were built with the
# same `device`), instead of hard-coding "cuda", so the cell also runs on CPU.
train_model(model, train_iterator, val_iterator, vocabulary, bpe_processor, device_name=device.type)

  "num_layers={}".format(dropout, num_layers))


Trainable params: 3877633
Epoch = 0, Avg Train Loss = 0.2063, Avg val loss = 0.1289, Time = 8.51s
Epoch = 0, Avg Train Loss = 0.1254, Avg val loss = 0.1183, Time = 8.29s
Epoch = 0, Avg Train Loss = 0.1234, Avg val loss = 0.1122, Time = 8.35s
Epoch = 0, Avg Train Loss = 0.1073, Avg val loss = 0.1079, Time = 8.43s
Epoch = 0, Avg Train Loss = 0.1086, Avg val loss = 0.0994, Time = 8.63s
Epoch = 0, Avg Train Loss = 0.1045, Avg val loss = 0.0981, Time = 8.56s
Epoch = 0, Avg Train Loss = 0.0958, Avg val loss = 0.0928, Time = 8.65s
Epoch = 0, Avg Train Loss = 0.1013, Avg val loss = 0.0931, Time = 8.52s
Epoch = 0, Avg Train Loss = 0.0959, Avg val loss = 0.0914, Time = 8.68s
Epoch = 0, Avg Train Loss = 0.1016, Avg val loss = 0.0932, Time = 8.73s
Epoch = 0, Avg Train Loss = 0.0934, Avg val loss = 0.0894, Time = 8.72s
Epoch = 0, Avg Train Loss = 0.0902, Avg val loss = 0.0875, Time = 8.77s
Epoch = 0, Avg Train Loss = 0.0929, Avg val loss = 0.0878, Time = 8.75s
Epoch = 0, Avg Train Loss = 0.0920, Av

Наглядно видно, что ошибка уменьшается как на тренировочной, так и на валидационной выборке: модель обучается и не проявляет признаков переобучения

Получаем для тестовых данных итоговые summary, состоящие из трех предложений

In [None]:
references = []
predictions = []
# Indices of all selected sentences, over all documents (was used without being defined).
sum_sents_nums = []

top_k = 3

model.eval()
with torch.no_grad():
    for batch in test_iterator:
        logits = model(batch['inputs'])

        # Iterate over the records rather than the tensor rows, so rows that
        # have no matching record are never indexed.
        for i, record in enumerate(batch['records']):
            sentences = record['sentences']
            # Rank only positions that correspond to real sentences; padded
            # positions must not be selected.
            n_real = min(len(sentences), logits.size(1))
            k = min(top_k, n_real)
            if k > 0:
                # Highest-scoring sentences, restored to document order.
                chosen = sorted(torch.topk(logits[i, :n_real], k).indices.tolist())
            else:
                chosen = []

            sum_sents_nums.extend(chosen)

            references.append(record['summary'])
            predictions.append(' '.join(sentences[ind] for ind in chosen))

Подсчет метрики:

In [None]:
# Score all predicted summaries against the reference summaries;
# avg=True requests a single aggregated result instead of one entry per pair.
rouge = Rouge()

rouge.get_scores(predictions, references, avg=True)

{'rouge-1': {'f': 0.2896457342004031,
  'p': 0.2299175410082272,
  'r': 0.4460687432557202},
 'rouge-2': {'f': 0.17573891196548805,
  'p': 0.13870930676188106,
  'r': 0.2727606395552116},
 'rouge-l': {'f': 0.2874684291334634,
  'p': 0.23389792348556737,
  'r': 0.41193972563214903}}

Для случайных 10 примеров распечатаны пары образцовых и сгенерированных аннотаций:

In [None]:
import random 

# Print up to 10 random reference / prediction pairs. The sample size is capped
# by the number of available summaries, because random.sample raises
# ValueError when asked for more items than the population holds.
ind_to_test = random.sample(range(len(references)), min(10, len(references)))
for ind in ind_to_test:
  print(references[ind])
  print('\n')
  print(predictions[ind])
  print('--------------------------')

рассматривается дело o взыскании неустойки по государственному контракту.  истец ссылается на неисполнение ответчиком обязательств по своевременной поставке товара.  требование удовлетворено частично, поскольку установлен факт просрочки поставки товара ответчиком, однако истцом неверно определена начальная дата просрочки исполнения обязательств по контракту, а также неустойка неправомерно начислена от цены всего контракта, а не от этапа поставки.


Обращаясь в суд с настоящими требованиями, истец казал на нарушение ответчиком сроков поставки, в связи с чем Министерством начислена неустойка на основании п. Удовлетворяя заявленные требования в части, суд первой инстанции, руководствуясь положениями , , , , ,  ГК РФ, Федеральным  от 05.04.2013 N 44-ФЗ "О контрактной системе в сфере закупок товаров, работ, услуг для обеспечения государственных и муниципальных нужд", проанализировав условия договора, исходил из того, что истцом неправильно определена начальная дата просрочки исполнения обяз