-
-
Notifications
You must be signed in to change notification settings - Fork 382
/
am_manaus.py
65 lines (53 loc) · 2.03 KB
/
am_manaus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import datetime
import re
import scrapy
from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
class AmManausSpider(BaseGazetteSpider):
    """Spider for the Manaus (AM) official gazette listing.

    Walks the paginated table at ``start_urls`` and yields one ``Gazette``
    item per table row, stopping as soon as a row older than ``start_date``
    is found.
    """

    # NOTE(review): presumably consumed by the Zyte Smart Proxy middleware
    # configured in the project settings -- confirm.
    zyte_smartproxy_enabled = True
    name = "am_manaus"
    allowed_domains = ["dom.manaus.am.gov.br"]
    start_date = datetime.date(2000, 4, 3)
    start_urls = ["http://dom.manaus.am.gov.br/diario-oficial-de-manaus"]
    # NOTE(review): presumably the IBGE municipality code -- confirm.
    TERRITORY_ID = "1302603"

    def parse(self, response):
        """Yield a ``Gazette`` per listing row, then a request for the next page.

        Args:
            response: Scrapy response for one page of the gazette listing.

        Yields:
            ``Gazette`` items for rows dated on or after ``start_date``, and
            a ``scrapy.Request`` for the next listing page when one exists
            and no row older than ``start_date`` was seen on this page.
        """
        follow_next_page = True

        for gazette in response.css(".listing tbody tr"):
            gazette_date_raw = gazette.xpath("./td[1]//text()").re_first(
                r"\d{2}\/\d{2}\/\d{4}"
            )
            if gazette_date_raw is None:
                # Row without a dd/mm/yyyy date: nothing to index, and
                # strptime(None, ...) would abort the whole page.
                continue

            gazette_date = datetime.datetime.strptime(
                gazette_date_raw, "%d/%m/%Y"
            ).date()
            if gazette_date < self.start_date:
                # Stop paginating once a row predates start_date.
                # NOTE(review): assumes the listing is sorted newest first.
                follow_next_page = False
                break

            gazette_url = gazette.css("a::attr(href)").get()
            if not gazette_url:
                # No download link in this row; a Gazette without a file
                # URL is useless downstream.
                continue

            title = "".join(gazette.xpath("./td[2]//text()").getall()).strip()
            edition_number = self._extract_edition_number(title, gazette_date)
            is_extra_edition = re.search(r"eex|ext", title.lower()) is not None

            yield Gazette(
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
                # urljoin is a no-op for absolute hrefs and resolves
                # relative ones against the page URL.
                file_urls=[response.urljoin(gazette_url)],
                power="executive",
            )

        if follow_next_page:
            next_page_url = response.css(".next a::attr(href)").get()
            # The last page has no ".next" link; Request(None) would raise.
            if next_page_url:
                yield scrapy.Request(response.urljoin(next_page_url))

    def _extract_edition_number(self, text, gazette_date):
        """Return the edition number found in a gazette title, or ``""``.

        The patterns are tried as one alternation; the text of whichever
        capture group matched is returned.

        Args:
            text: Title text of the listing row.
            gazette_date: Date of the gazette; its year is used to recognise
                titles of the form ``dom<year><4 digits>``.

        Returns:
            The captured digits as a string, or an empty string when no
            pattern matches.
        """
        year = gazette_date.year
        pattern = r"|".join(
            [
                # "Edicao" spelled with c-cedilla and a-tilde, written with
                # \u escapes (understood by the re module) so the pattern
                # survives any file re-encoding.
                r"Edi\u00e7\u00e3o (\d+)\s",
                rf"dom{year}(\d{{4}})[A-Za-z.]",
                r"dom(\d{4})[A-Za-z.]",
                r"(\d+)\s",
            ]
        )
        matches = re.search(pattern, text)
        if matches:
            # Exactly one alternative matched; lastindex points at its group.
            return matches.group(matches.lastindex)
        return ""