-
-
Notifications
You must be signed in to change notification settings - Fork 395
/
rs_porto_alegre.py
54 lines (44 loc) · 1.77 KB
/
rs_porto_alegre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
from .base_parser import BaseParser
class RsPortoAlegre(BaseParser):
    """Extract bidding-exemption notices from Porto Alegre gazette text.

    The parser reads ``self.text`` (presumably populated by ``BaseParser`` --
    confirm against the base class), cuts it into pages, removes the lines
    around each page, and then looks for sections that mention
    "dispensa de licitação".
    """

    # Text that closes every page. ``pages`` splits on it and discards
    # whatever follows its last occurrence.
    END_OF_PAGE_MARKER = '\n\n\nhttp://www.portoalegre.rs.gov.br/dopa'

    # An attribute label: at the start of a line, one or more spaces, then a
    # run of capitals, spaces or hyphens, then a colon. The capture group
    # makes ``re.split`` keep the labels in its result.
    # NOTE(review): the range ``À-Ÿ`` (U+00C0-U+0178) also admits lowercase
    # accented letters such as ``ç``; confirm whether that is intended.
    EXEMPTIONS_ATTR_REGEX = r'^( +[A-ZÀ-Ÿ \-]+:)'

    def pages(self):
        """Return the text of each page, without the end-of-page marker.

        The chunk after the last marker is dropped, so text containing no
        marker at all yields an empty list.
        """
        return self.text.split(self.END_OF_PAGE_MARKER)[:-1]

    def text_sections(self):
        """Return the cleaned source text split on runs of 3+ newlines."""
        return re.split(r'\n{3,}', self._source_text())

    def bidding_exemption_sections(self):
        """Return the sections mentioning 'dispensa de licitação'.

        The comparison is made against the lower-cased section text.
        """
        return [
            section
            for section in self.text_sections()
            if 'dispensa de licitação' in section.lower()
        ]

    def bidding_exemptions(self):
        """Return one dict per bidding-exemption section.

        Each dict has the parsed attributes under ``'data'`` and the
        untouched section text under ``'source_text'``.
        """
        return [
            {'data': self.bidding_exemption(section), 'source_text': section}
            for section in self.bidding_exemption_sections()
        ]

    def bidding_exemption(self, section):
        """Parse the ``LABEL: value`` attributes of one section into a dict.

        Args:
            section: Text of a single bidding-exemption section.

        Returns:
            A dict mapping each label (trailing colon removed) to its value,
            both with surrounding whitespace stripped and inner whitespace
            runs collapsed to one space. Empty when the section contains no
            label. When a label repeats, the last value wins.
        """
        # Splitting on a pattern with a capture group always yields
        # [preamble, label, value, label, value, ...], so the role of each
        # piece is given by its position. Re-testing every piece against the
        # label pattern would misread a value such as ' ABC: xyz' as a label
        # and drop its final character.
        parts = re.split(
            self.EXEMPTIONS_ATTR_REGEX, section, flags=re.MULTILINE
        )
        labels = parts[1::2]
        values = parts[2::2]

        if values:
            # Anything after the first blank line of the last value is
            # treated as footer text and discarded.
            values[-1] = re.split(r'\n{2,}', values[-1])[0]

        return {
            # label[:-1] removes the colon that ends every captured label.
            self._collapse_whitespace(label[:-1]):
                self._collapse_whitespace(value)
            for label, value in zip(labels, values)
        }

    @staticmethod
    def _collapse_whitespace(text):
        """Strip *text* and squeeze each run of 2+ whitespace chars to one space."""
        return re.sub(r'\s{2,}', ' ', text.strip())

    def _source_text(self):
        """Return all pages concatenated, minus their surrounding lines.

        The first three and the last two lines of every page are dropped
        (presumably the page header and footer -- confirm against a sample
        gazette).
        """
        # NOTE(review): pages are concatenated with no separator, so the last
        # kept line of one page runs straight into the first kept line of the
        # next. Kept as is; confirm this is intended.
        return ''.join(
            '\n'.join(page.split('\n')[3:-2]) for page in self.pages()
        )