"""
Extraction of raw text from lxml tree and text summarization
"""
from itertools import combinations
import re

import networkx as nx
import snowballstemmer
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

from wanish import lang_identifier

# two-letter ISO 639-1 codes mapped to snowballstemmer language names
LANG_CODES = {
    'da': 'danish',
    'de': 'german',
    'en': 'english',
    'es': 'spanish',
    'fi': 'finnish',
    'fr': 'french',
    'hu': 'hungarian',
    'it': 'italian',
    'nl': 'dutch',
    'no': 'norwegian',
    'pt': 'portuguese',
    'ru': 'russian',
    'sv': 'swedish',
    'tr': 'turkish',
}

# regexp to detect dialog sentences (those starting with a dash) so they can be skipped
dialog_re = re.compile(r"^\s*[-—]\s*", re.U)
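# e.g. it matches the leading dash in '— Hello, he said.' so that sentence is dropped (illustrative)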


def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes the text of an html element.

    :param cleaned_html_node: html node to extract text sentences from
    :param summary_sentences_qty: desired number of sentences in the summary
    :return: summarized text, two-letter language code
    """
clean_text = ""
# assembling text only with complete sentences, ended with respective punctuations.
for node in cleaned_html_node.iter('p'):
if node.text is not None:
for sentence in split_multi(node.text):
if len(sentence) > 0 and sentence[-1:] in ['.', '!', '?', '…'] and \
not sentence.strip(' .!?…').isdigit() and not dialog_re.match(sentence):
clean_text = clean_text + ' ' + sentence
# creating summary, obtaining language code and total sentences quantity
final_result, lang_code, sent_qty = create_referat(clean_text, '', summary_sentences_qty)
return final_result, lang_code


def similarity(s1, s2):
    """
    Word-set similarity of two sentences: the intersection size divided by
    the sum of the set sizes (half of the Dice coefficient).
    """
    if not s1 or not s2:
        return 0.0
    return len(s1.intersection(s2)) / (1.0 * (len(s1) + len(s2)))
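
# For example (illustrative values, not part of the module):
#   similarity({'cat', 'dog'}, {'dog', 'fox'}) == 1 / (2 + 2) == 0.25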


def textrank(text, hdr):
    """
    Ranks sentences with TextRank: sentences are graph nodes, word-set
    similarities are edge weights, and PageRank scores the nodes.
    Returns (index, score, sentence) tuples sorted by score, descending,
    plus the detected two-letter language code.
    """
    # detect the most likely language of the header and text combined
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # split into sentences, then reduce each one to a set of stemmed words;
    # unknown languages fall back to the english stemmer
    sentences = list(split_multi(text))
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    # score every unordered sentence pair, keeping only pairs with a
    # non-zero similarity so unrelated sentences stay unconnected
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = [score for score in scores if score[2]]

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    # sentences with no similar peers never enter the graph, so they get no rank
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
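
# Direct use would look roughly like this (a sketch; the scores are made up):
#   ranked, lang = textrank("Cats purr. Cats and dogs play. Dogs bark.", "")
#   # ranked -> e.g. [(1, 0.4, 'Cats and dogs play.'), (0, 0.3, 'Cats purr.'), ...]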


def create_referat(text, hdr, n=5):
    """
    Builds a summary from at most n top-ranked sentences, restored to their
    original order; returns the summary, the language code and the number
    of sentences actually used.
    """
    tr, lang_code = textrank(text, hdr)
    n = min(n, len(tr))
    # sorting by sentence index puts the chosen sentences back in document order
    top_n = sorted(tr[:n])
    return ' '.join(x[2] for x in top_n), lang_code, len(top_n)
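

# A minimal end-to-end sketch (not part of the module): assumes lxml is
# installed and uses a made-up HTML snippet in place of a real cleaned node.
if __name__ == '__main__':
    import lxml.html

    sample = lxml.html.fromstring(
        "<div>"
        "<p>The quick brown fox jumps over the lazy dog.</p>"
        "<p>Foxes are small omnivorous mammals. Dogs often chase foxes.</p>"
        "</div>"
    )
    summary, lang = get_plain_text(sample, summary_sentences_qty=2)
    print(lang, summary)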