-
-
Notifications
You must be signed in to change notification settings - Fork 382
/
ba_barreiras.py
60 lines (47 loc) · 1.92 KB
/
ba_barreiras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import datetime as dt
import re
import scrapy
from dateutil.rrule import YEARLY, rrule
from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
class BaBarreirasSpider(BaseGazetteSpider):
    """Spider that collects gazettes published on barreiras.ba.gov.br.

    The site exposes one listing page per year: the current year is served
    at ``base_url`` itself, every other year at ``<base_url>-<year>/``.
    """

    zyte_smartproxy_enabled = True

    name = "ba_barreiras"
    TERRITORY_ID = "2903201"
    allowed_domains = ["barreiras.ba.gov.br"]
    base_url = "https://barreiras.ba.gov.br/diario-oficial"
    start_date = dt.date(2008, 1, 2)

    # Compiled once; both are applied to the text of every listed edition.
    _DATE_PATTERN = re.compile(r"\d{2}/\d{2}/\d{4}")
    _EDITION_PATTERN = re.compile(r"Edição (\d+)")

    def start_requests(self):
        """Yield one request per yearly listing page in the crawl window.

        The yearly recurrence is anchored on January 1st of the first year
        rather than on ``start_date`` itself. Anchoring on ``start_date``
        would drop the last year whenever ``end_date`` falls earlier in the
        calendar year than ``start_date``'s month/day, and would skip
        non-leap years for a February 29th ``start_date``.
        """
        # NOTE(review): end_date is not defined in this class; presumably it
        # is provided by BaseGazetteSpider -- confirm.
        first_day_of_start_year = self.start_date.replace(month=1, day=1)
        current_year = dt.date.today().year

        for date_of_interest in rrule(
            freq=YEARLY, dtstart=first_day_of_start_year, until=self.end_date
        ):
            if date_of_interest.year == current_year:
                url = f"{self.base_url}/"
            else:
                url = f"{self.base_url}-{date_of_interest.year}/"
            yield scrapy.Request(url=url)

    def parse(self, response):
        """Extract ``Gazette`` items from a yearly listing page.

        Elements that carry no link or no ``dd/mm/yyyy`` date are skipped
        instead of aborting the whole page. Editions dated after
        ``end_date`` are skipped; the first edition dated before
        ``start_date`` stops the processing of the page.
        """
        for edition in response.css("div.content .style16"):
            link = edition.xpath(".//@href").get()
            metadata_str = "".join(edition.xpath(".//text()").getall())

            date_match = self._DATE_PATTERN.search(metadata_str)
            if link is None or date_match is None:
                # Without both a file link and a date there is nothing
                # usable to emit for this element.
                continue

            gazette_date = dt.datetime.strptime(
                date_match.group(), "%d/%m/%Y"
            ).date()

            if gazette_date > self.end_date:
                continue
            if gazette_date < self.start_date:
                # NOTE(review): stopping here assumes the page lists
                # editions newest-first -- confirm against the site.
                return

            edition_match = self._EDITION_PATTERN.search(metadata_str)
            edition_number = edition_match.group(1) if edition_match else ""
            is_extra_edition = "extra" in metadata_str.lower()

            yield Gazette(
                power="executive",
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the page URL.
                file_urls=[response.urljoin(link)],
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
            )