-
Notifications
You must be signed in to change notification settings - Fork 79
/
paragraph.py
69 lines (50 loc) · 1.62 KB
/
paragraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import re
from .utils import normalize_whitespace
class Paragraph(object):
    """Object representing one block of text in HTML.

    A paragraph collects the text nodes found under one DOM path and
    exposes simple statistics about them (length, word count, stopword
    density, link density).
    """

    def __init__(self, path):
        """Create an empty paragraph located at *path*.

        :param path: object exposing ``dom`` and ``xpath`` attributes that
            describe where the paragraph lives in the document.
        """
        self.dom_path = path.dom
        self.xpath = path.xpath
        self.text_nodes = []
        # Both counters start at zero; nothing in this class updates them,
        # so they are presumably maintained by the code that builds
        # paragraphs -- verify against the caller.
        self.chars_count_in_links = 0
        self.tags_count = 0
        # Initialised here so that ``is_boilerplate`` can be read before a
        # class has been assigned, instead of raising AttributeError.
        # An empty string is not "good", so such a paragraph counts as
        # boilerplate.
        self.class_type = ""

    @property
    def is_heading(self):
        """Return True if the DOM path contains a heading tag (``h<digit>``)."""
        return bool(re.search(r"\bh\d\b", self.dom_path))

    @property
    def is_boilerplate(self):
        """Return True unless the paragraph's ``class_type`` is ``"good"``."""
        return self.class_type != "good"

    @property
    def text(self):
        """Return the text nodes joined, stripped and whitespace-normalized."""
        text = "".join(self.text_nodes)
        return normalize_whitespace(text.strip())

    def __len__(self):
        """Return the number of characters in :attr:`text`."""
        return len(self.text)

    @property
    def words_count(self):
        """Return the number of whitespace-separated words in :attr:`text`."""
        return len(self.text.split())

    def contains_text(self):
        """Return True if at least one text node has been appended."""
        return bool(self.text_nodes)

    def append_text(self, text):
        """Normalize *text*, store it as a new text node and return it.

        :param text: raw text to add to the paragraph.
        :returns: the whitespace-normalized text that was stored.
        """
        text = normalize_whitespace(text)
        self.text_nodes.append(text)
        return text

    def stopwords_count(self, stopwords):
        """Return how many words of :attr:`text` are in *stopwords*.

        Words are lower-cased before the membership test, so *stopwords*
        is expected to hold lower-case entries.

        :param stopwords: container supporting ``in`` (a set is fastest).
        """
        return sum(1 for word in self.text.split() if word.lower() in stopwords)

    def stopwords_density(self, stopwords):
        """Return the share of words that are stopwords.

        :param stopwords: container supporting ``in``.
        :returns: ``0`` for a paragraph without words, otherwise
            ``stopwords_count / words_count``.
        """
        words_count = self.words_count
        # Guard against division by zero for empty paragraphs.
        if words_count == 0:
            return 0

        return self.stopwords_count(stopwords) / words_count

    def links_density(self):
        """Return the share of the text's characters that lie inside links.

        :returns: ``0`` for a paragraph without text, otherwise
            ``chars_count_in_links / len(text)``.
        """
        text_length = len(self.text)
        # Guard against division by zero for empty paragraphs.
        if text_length == 0:
            return 0

        return self.chars_count_in_links / text_length