## TextRank

In [35]:
import os
import glob
import re
# for 'sentance_collection' method
%run ../rfej_preprocessing/sentence_collection.ipynb
# for 'red_txt' method
%run ../rfej_preprocessing/find_shortenings.ipynb

In [36]:
# Путь к файлам с текстами
PATH = os.path.abspath('..\\rfej_parser\\articles\\') + '\\'

In [37]:
from natasha import(
    Doc, 
    Segmenter, 
    NewsNERTagger, 
    NewsEmbedding, 
    MorphVocab, 
    DatesExtractor, 
    MoneyExtractor, 
    AddrExtractor,
)
segmenter = Segmenter()
emb = NewsEmbedding()
morph_vocab = MorphVocab()
ner_tagger = NewsNERTagger(emb)
dates_extractor = DatesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)
addr_extractor = AddrExtractor(morph_vocab)

In [38]:
def articles_to_sentances(path: str) -> dict:
    """
    Split all .txt files to sentances.
    
    path: path to .txt files
    artcls_sentncs: number of text -> {number of sentance -> 'sentance', ...}
    
    """
    files = [f for f in glob.glob(PATH + '*.txt') if re.findall(r'_a.txt', f)]
    artcls_sentncs = {}
    for f in files:
        text = read_txt(f)
        artcls_sentncs[int(re.findall(r'([\d]*)_a', f)[0])] = {
            key: sentance for key, sentance in enumerate(sentence_collection(text))}
    return artcls_sentncs

In [39]:
import logging
# supress 'WARNING:root:Something went wrong while tokenizing'
logging.root.level = logging.ERROR

# Разбиваем текст на предложения:
s = articles_to_sentances(PATH)

logging.root.level = logging.WARNING

In [40]:
def is_entity_ignored(word: str) -> bool:
    """Defines do we have to use this entity in algorithm. 
    If `yes` -> returns True, else -> False.
    """
    entities_ignored = ['LOC', 'ORG', 'PER']
    doc = Doc(word)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    if doc.spans and doc.spans[0].type in entities_ignored:
        return True
    return False

In [41]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def sentances_to_words(sent_dict: dict) -> dict:
    """
    sent_dict: number of text -> {number of sentance -> 'sentance', ...}
    artcls_words: number of text -> {number of sentance -> ['word_0', 'word_1', ...], ...}
    
    """
    candidate_pos = ['NOUN', 'ADJF', 'VERB', 'INFN', 'PRTS', 'PRTF']
    selected_words = []
    artcls_words = {}
    for artcl in sent_dict:
#         if int(artcl) < 10:  # ТЕСТ НА 10 ПЕРВЫХ СТАТЬЯХ!
            artcls_words[artcl] = {}
            for sentance in sent_dict[artcl]:
                words_of_sentance = []
                doc = Doc(sent_dict[artcl][sentance])
                doc.segment(segmenter)
                words_raw = [word.text for word in doc.tokens]
                for w in words_raw:
                    if morph.parse(w)[0].tag.POS in candidate_pos and not is_entity_ignored(w):
                        final_word = morph.parse(w)[0].normal_form  # привести к начальной форме
                        # Убрать мусор в 1 символ
                        if len(final_word) != 1:
                            words_of_sentance.append(final_word) 
                words_of_sentance = set(words_of_sentance)  # Убираем поторяющиеся слова
                if len(words_of_sentance) > 2:  # Порог для длины предложений  - 3 слова
                    artcls_words[artcl][sentance] = [word for word in words_of_sentance]
    return artcls_words

In [42]:
# Разбиваем предложения на слова
w = sentances_to_words(s)

In [43]:
from itertools import combinations
from textdistance import Sorensen

# Вычисление схожести предложений по коэффициенту Сёренсена.
res_scores = {}
srnsn = Sorensen()
for artcl in w:
    nbrs = []
    for sent_nb in range(len(w[artcl])):
        if sent_nb in w[artcl].keys():
            nbrs.append(sent_nb)
    pairs = combinations(nbrs, 2)
    scores = [(i, j, srnsn.similarity(w[artcl][i], w[artcl][j])) for i, j in pairs]
    # Убираем не связанные пары (0.0)
    res_scores[artcl] = [el for el in filter(lambda x: x[2], scores)]

In [44]:
import networkx as nx
# Создаем граф: вершина - это предложение, ребро - это вес, 
# равный похожести вершин, которым оно инцидентно 
def scores_to_graph(w: dict, res_scores: dict) -> dict:
    """Create graphs from sentances.
    """
    result = {}
    for article in res_scores:
        g = nx.Graph()
        g.add_weighted_edges_from(res_scores[article])
        # pr: 'sentance number' -> PageRank
        pr = nx.pagerank(g)
        # Каждый score делим на длину предложения, чтобы она не влияла на ранк
        for score in pr:
            pr[score] = pr[score] / len(w[article][score])
        # Сортируем предложения по PageRank
        result[article] = sorted(((i, pr[i]) for i in w[article].keys() if i in pr), 
                key=lambda x: pr[x[0]], reverse=True)
    return result

In [45]:
ranks = scores_to_graph(w, res_scores)

In [46]:
def find_average_score(pr):
    sum_values = 0
    for sentance in pr:
        sum_values += sentance[1]
    average = sum_values / len(pr)
    return average

In [48]:
# Считаем порог ранка для каждого текста (средний по тексту)
# threshold = {}
# for artcl in ranks:
#     threshold[artcl] = find_average_score(ranks[artcl])

In [125]:
from math import ceil

res_sentances_nb = {}
for artcl in ranks:
#     number_of_sentances = [sent[0] for sent in filter(lambda x: x[1] >= threshold[artcl], ranks[artcl])]
    number_of_sentances = [sent[0] for sent in ranks[artcl]]
    NB = ceil(len(number_of_sentances) * 16 / 100) 
    number_of_sentances = number_of_sentances[:NB]  # БЕРЕМ 20% ПРЕДЛ.
    number_of_sentances.sort()
    res_sentances_nb[artcl] = number_of_sentances

## Result (Summary)

In [126]:
# Словарь с суммари: 'number of article' -> ['sentance1, sentance1', ...]
sum_textrank = {}

for article in w:
    sum_textrank[article] = []
    for sentance in s[article]:
        if sentance in res_sentances_nb[article]:
            sum_textrank[article].append(s[article][sentance])

#### Example of TextRank summary:

In [140]:
# print(sum_textrank[9])

### ROUGE metrics

In [124]:
metrics_textrank # 16% of sentances

{'rouge-1': {'f': 0.13971197701954277,
  'p': 0.10240924542658952,
  'r': 0.2197598565972747},
 'rouge-2': {'f': 0.02415363094347205,
  'p': 0.017704073728404116,
  'r': 0.037995205082793325},
 'rouge-l': {'f': 0.12965111751697095,
  'p': 0.09499136299179528,
  'r': 0.20413406304525372}}

In [61]:
metrics_textrank_8_sentance  # 8 sentances

{'rouge-1': {'f': 0.14106136813929335,
  'p': 0.1232744008653751,
  'r': 0.1648467030676705},
 'rouge-2': {'f': 0.02208817713994442,
  'p': 0.018807195458214607,
  'r': 0.026755825390976602},
 'rouge-l': {'f': 0.13044844876831158,
  'p': 0.11427454976768742,
  'r': 0.15195554396767527}}