From b178d134ad3a51f306f2d78ad245d784aeccc504 Mon Sep 17 00:00:00 2001 From: rodps Date: Sat, 24 Apr 2021 13:21:47 -0300 Subject: [PATCH 01/40] Spider for Campo Mourao/PR --- .../gazette/spiders/pr_campo_mourao.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 data_collection/gazette/spiders/pr_campo_mourao.py diff --git a/data_collection/gazette/spiders/pr_campo_mourao.py b/data_collection/gazette/spiders/pr_campo_mourao.py new file mode 100644 index 000000000..84ca607fa --- /dev/null +++ b/data_collection/gazette/spiders/pr_campo_mourao.py @@ -0,0 +1,43 @@ +import scrapy +from dateparser import parse +from gazette.items import Gazette +from gazette.spiders.base import BaseGazetteSpider + +class PrCampoMouraoSpider(BaseGazetteSpider): + TERRITORY_ID = "4104303" + name = "pr_campo_mourao" + start_urls = ["https://campomourao.atende.net/?pg=diariooficial&pagina=1"] + + def parse(self, response): + url_base = response.url[0:56] + page = int(response.url[56:]) + + list = response.xpath("//div[@class='nova_listagem ']/div[@class='linha']") + + if not list: + return + + for row in list: + date = row.xpath("//div[@class='info']/div[@class='data']/text()").get() + date = parse(date, languages=["pt"]).date() + + edition_type = row.xpath("//div[@class='info']/div[@class='tipo']/text()").get() + edition_number = row.xpath("//div[@class='info']/div[@class='titulo']/text()").get() + edition_number = edition_number.split(' ')[1] + + code = row.xpath("//button[@data-acao='download']/@data-codigo").get() + id = row.xpath("//button[@data-acao='download']/@data-id").get() + + is_extra = True if edition_type == "Extraordinária" else False + + url = f"https://campomourao.atende.net/atende.php?rot=54002&aca=737&processo=download&codigo={code}&hash={id}" + + yield Gazette( + date=date, + edition_number=edition_number, + file_urls=[url], + is_extra_edition = is_extra, + power='executive_legislative' + ) + + yield response.follow(url_base + str(page + 1)) \ 
No newline at end of file From efe5f052bcf9a4005a8dc1a8c8b5161652bce536 Mon Sep 17 00:00:00 2001 From: allison Date: Mon, 26 Apr 2021 13:21:52 -0300 Subject: [PATCH 02/40] automated code formatting --- .../gazette/spiders/pr_campo_mourao.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/data_collection/gazette/spiders/pr_campo_mourao.py b/data_collection/gazette/spiders/pr_campo_mourao.py index 84ca607fa..390ea4210 100644 --- a/data_collection/gazette/spiders/pr_campo_mourao.py +++ b/data_collection/gazette/spiders/pr_campo_mourao.py @@ -1,8 +1,9 @@ -import scrapy from dateparser import parse + from gazette.items import Gazette from gazette.spiders.base import BaseGazetteSpider + class PrCampoMouraoSpider(BaseGazetteSpider): TERRITORY_ID = "4104303" name = "pr_campo_mourao" @@ -20,24 +21,28 @@ def parse(self, response): for row in list: date = row.xpath("//div[@class='info']/div[@class='data']/text()").get() date = parse(date, languages=["pt"]).date() - - edition_type = row.xpath("//div[@class='info']/div[@class='tipo']/text()").get() - edition_number = row.xpath("//div[@class='info']/div[@class='titulo']/text()").get() - edition_number = edition_number.split(' ')[1] + + edition_type = row.xpath( + "//div[@class='info']/div[@class='tipo']/text()" + ).get() + edition_number = row.xpath( + "//div[@class='info']/div[@class='titulo']/text()" + ).get() + edition_number = edition_number.split(" ")[1] code = row.xpath("//button[@data-acao='download']/@data-codigo").get() id = row.xpath("//button[@data-acao='download']/@data-id").get() - + is_extra = True if edition_type == "Extraordinária" else False - + url = f"https://campomourao.atende.net/atende.php?rot=54002&aca=737&processo=download&codigo={code}&hash={id}" - + yield Gazette( date=date, edition_number=edition_number, file_urls=[url], - is_extra_edition = is_extra, - power='executive_legislative' + is_extra_edition=is_extra, + power="executive_legislative", ) - yield 
response.follow(url_base + str(page + 1)) \ No newline at end of file + yield response.follow(url_base + str(page + 1)) From 5ebd2ed9c7b8fb2d3b401e1e4d63f696fafb9047 Mon Sep 17 00:00:00 2001 From: rodps Date: Sun, 23 May 2021 09:31:25 -0300 Subject: [PATCH 03/40] Changes in Campo Mourao spider --- .../gazette/spiders/pr_campo_mourao.py | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/data_collection/gazette/spiders/pr_campo_mourao.py b/data_collection/gazette/spiders/pr_campo_mourao.py index 390ea4210..9f1840c7a 100644 --- a/data_collection/gazette/spiders/pr_campo_mourao.py +++ b/data_collection/gazette/spiders/pr_campo_mourao.py @@ -1,3 +1,5 @@ +import datetime + from dateparser import parse from gazette.items import Gazette @@ -7,31 +9,28 @@ class PrCampoMouraoSpider(BaseGazetteSpider): TERRITORY_ID = "4104303" name = "pr_campo_mourao" + start_date = datetime.date(2012, 3, 2) start_urls = ["https://campomourao.atende.net/?pg=diariooficial&pagina=1"] - def parse(self, response): - url_base = response.url[0:56] - page = int(response.url[56:]) - - list = response.xpath("//div[@class='nova_listagem ']/div[@class='linha']") + def parse(self, response, page=1): - if not list: - return + gazettes = response.xpath("//div[@class='nova_listagem ']/div[@class='linha']") + follow_next_page = False if not gazettes else True - for row in list: - date = row.xpath("//div[@class='info']/div[@class='data']/text()").get() + for gazette in gazettes: + date = gazette.xpath(".//div[@class='data']/text()").get() date = parse(date, languages=["pt"]).date() - edition_type = row.xpath( - "//div[@class='info']/div[@class='tipo']/text()" - ).get() - edition_number = row.xpath( - "//div[@class='info']/div[@class='titulo']/text()" - ).get() + if date < self.start_date: + follow_next_page = False + break + + edition_type = gazette.xpath(".//div[@class='tipo']/text()").get() + edition_number = gazette.xpath(".//div[@class='titulo']/text()").get() 
edition_number = edition_number.split(" ")[1] - code = row.xpath("//button[@data-acao='download']/@data-codigo").get() - id = row.xpath("//button[@data-acao='download']/@data-id").get() + code = gazette.xpath("//button[@data-acao='download']/@data-codigo").get() + id = gazette.xpath("//button[@data-acao='download']/@data-id").get() is_extra = True if edition_type == "Extraordinária" else False @@ -45,4 +44,9 @@ def parse(self, response): power="executive_legislative", ) - yield response.follow(url_base + str(page + 1)) + if follow_next_page: + next_page = page + 1 + yield response.follow( + f"https://campomourao.atende.net/?pg=diariooficial&pagina={next_page}", + cb_kwargs={"page": next_page}, + ) From 0b8a21071a8c4ad4a24daab30a78b77593fddb11 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 27 Nov 2023 13:23:50 -0300 Subject: [PATCH 04/40] =?UTF-8?q?Move=20pr=5Fcampo=5Fmourao=20criado=20em?= =?UTF-8?q?=20#438=20para=20servir=20de=20refer=C3=AAncia=20na=20cria?= =?UTF-8?q?=C3=A7=C3=A3o=20do=20spider=20base=20do=20sistema=20replic?= =?UTF-8?q?=C3=A1vel=20Atende.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/{ => pr}/pr_campo_mourao.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data_collection/gazette/spiders/{ => pr}/pr_campo_mourao.py (100%) diff --git a/data_collection/gazette/spiders/pr_campo_mourao.py b/data_collection/gazette/spiders/pr/pr_campo_mourao.py similarity index 100% rename from data_collection/gazette/spiders/pr_campo_mourao.py rename to data_collection/gazette/spiders/pr/pr_campo_mourao.py From 88f036c307a5989382b672a4c60ad81410b745af Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Wed, 29 Nov 2023 19:43:42 -0300 Subject: [PATCH 05/40] =?UTF-8?q?Adiciona=20spider=20base=20do=20sistema?= =?UTF-8?q?=20replic=C3=A1vel=20Atende.=20Essa=20vers=C3=A3o=20implementa?= =?UTF-8?q?=20a=20classe=20'BaseAtendeT2Spider'=20para=20buscar=20os=20dir?= 
=?UTF-8?q?=C3=A1rios=20nas=20p=C3=A1ginas=20com=20layout=20'Tipo=202',=20?= =?UTF-8?q?identificadas=20pelo=20Mapeador=20Atende=20desenvolvido=20em=20?= =?UTF-8?q?#1043.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/base/atende.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 data_collection/gazette/spiders/base/atende.py diff --git a/data_collection/gazette/spiders/base/atende.py b/data_collection/gazette/spiders/base/atende.py new file mode 100644 index 000000000..8e31c9a22 --- /dev/null +++ b/data_collection/gazette/spiders/base/atende.py @@ -0,0 +1,77 @@ +import dateparser +from scrapy import Request + +from gazette.items import Gazette +from gazette.spiders.base import BaseGazetteSpider + + +class BaseAtendeT2Spider(BaseGazetteSpider): + """ + Base spider for Gazzetes that are available from cities listed on https://{city_subdomain}.atende.net + This base class deals with layout 'Type 2' gazette pages, usually requested + from 'https://{city_subdomain}.atende.net/diariooficial'. 
+ """ + + # Must be defined into child classes + city_subdomain = "" + + power = "executive_legislative" + + start_page = 1 + end_page = 0 + extra_edition_options = ("suplementar", "retificação", "extraordinária", "extra") + allowed_domains = ["atende.net"] + + def start_requests(self): + yield Request(self.get_url(self.start_page)) + + def parse(self, response, page=start_page): + lines = response.css("div.nova_listagem div.linha") + for line in lines: + date_raw = line.css("div.data::text").get() + date_time = dateparser.parse(date_raw, languages=["pt"]) + if date_time is None: + self.logger.debug(f"Unable to parse date from text {date_raw}!") + continue + date = date_time.date() + + if date > self.end_date: + continue + if date < self.start_date: + return + + edition_type = line.css("div.tipo::text").get() + is_extra = ( + edition_type.lower() in self.extra_edition_options + if edition_type + else False + ) + edition_number = line.css("div.titulo::text").get() + # edition_number = lines.css("div.titulo::text").re_first(r"[^\s][\d.]+") + download_urls = line.css("button::attr(data-link)") + if len(download_urls) < 1: + self.logger.debug("Unable to find an url for download!") + continue + download_url = download_urls[-1].get() + + yield Gazette( + date=date, + edition_number=edition_number, + file_urls=[download_url], + is_extra_edition=is_extra, + power=self.power, + ) + + if self.end_page < 1: + pages = response.css("div#paginacao li.dst button::attr(value)").getall() + if len(pages) > 1: + self.end_page = int(pages[-1]) + else: + self.logger.debug("Unable to find the last page!") + + page += 1 + if page <= self.end_page: + yield response.follow(self.get_url(page), cb_kwargs={"page": page}) + + def get_url(self, page): + return f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php?rot=54015&aca=101&ajax=t&processo=loadPluginDiarioOficial&parametro=%7B%22codigoPlugin%22%3A1,%22filtroPlugin%22%3A%7B%22pagina%22%3A%22{page}%22%7D%7D" From 
5158d721482abbb7f00f36745e499435d1406f54 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Wed, 29 Nov 2023 19:58:45 -0300 Subject: [PATCH 06/40] =?UTF-8?q?Ajusta=20o=20spider=20'pr=5Fcampo=5Fmoura?= =?UTF-8?q?o'=20criado=20em=20#438=20para=20trabalhar=20com=20o=20spider?= =?UTF-8?q?=20base=20do=20sistema=20replic=C3=A1vel=20'Atende'.=20Resolve?= =?UTF-8?q?=20#430=20Adiciona=20spider=20para=20Campo=20Mour=C3=A3o=20-=20?= =?UTF-8?q?PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/pr/pr_campo_mourao.py | 52 ++----------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/data_collection/gazette/spiders/pr/pr_campo_mourao.py b/data_collection/gazette/spiders/pr/pr_campo_mourao.py index 9f1840c7a..a44b8e382 100644 --- a/data_collection/gazette/spiders/pr/pr_campo_mourao.py +++ b/data_collection/gazette/spiders/pr/pr_campo_mourao.py @@ -1,52 +1,10 @@ -import datetime +from datetime import date -from dateparser import parse +from gazette.spiders.base.atende import BaseAtendeT2Spider -from gazette.items import Gazette -from gazette.spiders.base import BaseGazetteSpider - -class PrCampoMouraoSpider(BaseGazetteSpider): +class PrCampoMouraoSpider(BaseAtendeT2Spider): TERRITORY_ID = "4104303" name = "pr_campo_mourao" - start_date = datetime.date(2012, 3, 2) - start_urls = ["https://campomourao.atende.net/?pg=diariooficial&pagina=1"] - - def parse(self, response, page=1): - - gazettes = response.xpath("//div[@class='nova_listagem ']/div[@class='linha']") - follow_next_page = False if not gazettes else True - - for gazette in gazettes: - date = gazette.xpath(".//div[@class='data']/text()").get() - date = parse(date, languages=["pt"]).date() - - if date < self.start_date: - follow_next_page = False - break - - edition_type = gazette.xpath(".//div[@class='tipo']/text()").get() - edition_number = gazette.xpath(".//div[@class='titulo']/text()").get() - edition_number = edition_number.split(" ")[1] 
- - code = gazette.xpath("//button[@data-acao='download']/@data-codigo").get() - id = gazette.xpath("//button[@data-acao='download']/@data-id").get() - - is_extra = True if edition_type == "Extraordinária" else False - - url = f"https://campomourao.atende.net/atende.php?rot=54002&aca=737&processo=download&codigo={code}&hash={id}" - - yield Gazette( - date=date, - edition_number=edition_number, - file_urls=[url], - is_extra_edition=is_extra, - power="executive_legislative", - ) - - if follow_next_page: - next_page = page + 1 - yield response.follow( - f"https://campomourao.atende.net/?pg=diariooficial&pagina={next_page}", - cb_kwargs={"page": next_page}, - ) + start_date = date(2012, 2, 3) # Edição 1511 + city_subdomain = "campomourao" From c4635643f2159ea045ee90e64a1eac5c88c339e7 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Sat, 2 Dec 2023 19:35:53 -0300 Subject: [PATCH 07/40] =?UTF-8?q?Ajusta=20o=20spider=20'rs=5Fgravatai'=20p?= =?UTF-8?q?ara=20trabalhar=20com=20o=20spider=20base=20do=20sistema=20repl?= =?UTF-8?q?ic=C3=A1vel=20'Atende'.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/rs/rs_gravatai.py | 61 ++----------------- 1 file changed, 6 insertions(+), 55 deletions(-) diff --git a/data_collection/gazette/spiders/rs/rs_gravatai.py b/data_collection/gazette/spiders/rs/rs_gravatai.py index e9eded723..fb394476a 100644 --- a/data_collection/gazette/spiders/rs/rs_gravatai.py +++ b/data_collection/gazette/spiders/rs/rs_gravatai.py @@ -1,60 +1,11 @@ -from dateparser import parse -from scrapy import Request +from datetime import date -from gazette.items import Gazette -from gazette.spiders.base import BaseGazetteSpider +from gazette.spiders.base.atende import BaseAtendeT2Spider -class RsGravataiSpider(BaseGazetteSpider): +class RsGravataiSpider(BaseAtendeT2Spider): TERRITORY_ID = "4309209" name = "rs_gravatai" - allowed_domains = ["gravatai.atende.net"] - start_urls = 
["https://gravatai.atende.net/?pg=diariooficial"] - - extra_editions_options = ("Suplementar", "Retificação") - - def parse(self, response): - """ - @url https://gravatai.atende.net/?pg=diariooficial - @returns requests 1 - """ - - last_page_number_css = "#paginacao > ul > li:nth-child(7) > button::attr(value)" - last_page_number = int(response.css(last_page_number_css).extract_first()) - - for page_number in range(1, last_page_number + 1): - yield Request( - f"https://gravatai.atende.net/?pg=diariooficial&pagina={page_number}", - callback=self.parse_gazette, - ) - - def parse_gazette(self, response): - """ - @url https://gravatai.atende.net/?pg=diariooficial&pagina=1 - @returns items 1 - @scrapes date file_urls is_extra_edition power - """ - - for element in response.css(".nova_listagem > .linha"): - info = element.css(".info") - - is_extra_edition = ( - info.css(".tipo::text").extract_first() in self.extra_editions_options - ) - - date = parse( - info.css(".data::text").extract_first(), languages=["pt"] - ).date() - - code = element.css(".opcoes > button::attr(data-codigo)").extract_first() - url = ( - "https://gravatai.atende.net/atende.php?rot=54002&aca=737" - f"&processo=download&codigo={code}" - ) - - yield Gazette( - date=date, - file_urls=[url], - is_extra_edition=is_extra_edition, - power="executive", - ) + start_date = date(2015, 5, 4) # Edição 1 + city_subdomain = "gravatai" + power = "executive" From 06e517f448d97e412fee1c447f7341333bb207db Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Sun, 3 Dec 2023 19:07:50 -0300 Subject: [PATCH 08/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Faraucaria'?= =?UTF-8?q?=20de=20Arauc=C3=A1ria=20PR.=20Sistema=20replic=C3=A1vel=20'Ate?= =?UTF-8?q?nde'.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/pr/pr_araucaria.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_araucaria.py diff 
--git a/data_collection/gazette/spiders/pr/pr_araucaria.py b/data_collection/gazette/spiders/pr/pr_araucaria.py new file mode 100644 index 000000000..9c4786d45 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_araucaria.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrCampoMouraoSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4101804" + name = "pr_araucaria" + start_date = date(2018, 12, 19) # Edição 246 + city_subdomain = "araucaria" From 8aa4867b5d83da100d91ca271b9e602a00e5a565 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Sun, 3 Dec 2023 19:21:34 -0300 Subject: [PATCH 09/40] =?UTF-8?q?Ajuste=20no=20spider=20base=20'atende'=20?= =?UTF-8?q?para=20buscar=20a=20url=20de=20download=20na=20p=C3=A1gina=20de?= =?UTF-8?q?=20'Detalhes=20da=20Edi=C3=A7=C3=A3o'=20quando=20n=C3=A3o=20enc?= =?UTF-8?q?ontrar=20na=20p=C3=A1gina=20padr=C3=A3o=20(layout=20Tipo=202).?= =?UTF-8?q?=20Caso=20conhecido:=20Arauc=C3=A1ria=20-=20PR=20(https://arauc?= =?UTF-8?q?aria.atende.net/diariooficial)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/base/atende.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/data_collection/gazette/spiders/base/atende.py b/data_collection/gazette/spiders/base/atende.py index 8e31c9a22..ffd13a794 100644 --- a/data_collection/gazette/spiders/base/atende.py +++ b/data_collection/gazette/spiders/base/atende.py @@ -48,19 +48,23 @@ def parse(self, response, page=start_page): ) edition_number = line.css("div.titulo::text").get() # edition_number = lines.css("div.titulo::text").re_first(r"[^\s][\d.]+") - download_urls = line.css("button::attr(data-link)") - if len(download_urls) < 1: - self.logger.debug("Unable to find an url for download!") - continue - download_url = download_urls[-1].get() - - yield Gazette( + gazette = Gazette( date=date, edition_number=edition_number, - 
file_urls=[download_url], is_extra_edition=is_extra, power=self.power, ) + download_urls = line.css("button::attr(data-link)") + if len(download_urls) > 0: + gazette["file_urls"] = [download_urls[-1].get()] + yield gazette + else: + # self.logger.debug("Unable to find an url for download! Trying edition details.") + edition_id = line.css("span.bt_detalhes::attr(data-id)").get() + edition_url = f"{self.get_base_url()}&parametro=%7B%22codigoPlugin%22%3A2,%22filtroPlugin%22%3A%7B%22codigoEdicao%22%3A%22{edition_id}%22%7D%7D" + yield Request( + edition_url, self.parse_edition, cb_kwargs={"gazette": gazette} + ) if self.end_page < 1: pages = response.css("div#paginacao li.dst button::attr(value)").getall() @@ -73,5 +77,15 @@ def parse(self, response, page=start_page): if page <= self.end_page: yield response.follow(self.get_url(page), cb_kwargs={"page": page}) + def parse_edition(self, response, gazette): + download_url = response.css( + "button.visualizacao_versao_completa::attr(data-link)" + ).get() + gazette["file_urls"] = [download_url] + yield gazette + + def get_base_url(self): + return f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php?rot=54015&aca=101&ajax=t&processo=loadPluginDiarioOficial" + def get_url(self, page): - return f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php?rot=54015&aca=101&ajax=t&processo=loadPluginDiarioOficial&parametro=%7B%22codigoPlugin%22%3A1,%22filtroPlugin%22%3A%7B%22pagina%22%3A%22{page}%22%7D%7D" + return f"{self.get_base_url()}&parametro=%7B%22codigoPlugin%22%3A1,%22filtroPlugin%22%3A%7B%22pagina%22%3A%22{page}%22%7D%7D" From 443b256b702340fd5e6c232df5a554f0ecbdccb5 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:01:23 -0300 Subject: [PATCH 10/40] Adiciona o spider 'pr_apucarana' de Apucarana - PR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_apucarana.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_apucarana.py diff --git a/data_collection/gazette/spiders/pr/pr_apucarana.py b/data_collection/gazette/spiders/pr/pr_apucarana.py new file mode 100644 index 000000000..c61eb9029 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_apucarana.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrApucaranaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4101408" + name = "pr_apucarana" + start_date = date(2022, 2, 23) # Edição 1 + city_subdomain = "apucarana" From 1e709d9691907efb74189a9eebad899e9f34cd64 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:01:36 -0300 Subject: [PATCH 11/40] Adiciona o spider 'sc_araquari' de Araquari - SC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/sc/sc_araquari.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/sc/sc_araquari.py diff --git a/data_collection/gazette/spiders/sc/sc_araquari.py b/data_collection/gazette/spiders/sc/sc_araquari.py new file mode 100644 index 000000000..02841d875 --- /dev/null +++ b/data_collection/gazette/spiders/sc/sc_araquari.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class ScAraquariSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4201307" + name = "sc_araquari" + start_date = date(2018, 1, 2) # Edição 1 + city_subdomain = "araquari" From 79d3e6e04306d8b7b7f40f38539855890bd3a5a3 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:01:47 -0300 Subject: [PATCH 12/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fbento=5Fgon?= =?UTF-8?q?calves'=20de=20Bento=20Gon=C3=A7alves=20-=20RS.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/rs/rs_bento_goncalves.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_bento_goncalves.py diff --git a/data_collection/gazette/spiders/rs/rs_bento_goncalves.py b/data_collection/gazette/spiders/rs/rs_bento_goncalves.py new file mode 100644 index 000000000..068a6cf8d --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_bento_goncalves.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsBentoGoncalvesSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4302105" + name = "rs_bento_goncalves" + start_date = date(2019, 4, 1) # Edição 1124 + city_subdomain = "bentogoncalves" From 9e287198584bb7d5f3ea9f6841a197180c264aa5 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:01:58 -0300 Subject: [PATCH 13/40] Adiciona o spider 'pr_campo_largo' de Campo Largo - PR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_campo_largo.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_campo_largo.py diff --git a/data_collection/gazette/spiders/pr/pr_campo_largo.py b/data_collection/gazette/spiders/pr/pr_campo_largo.py new file mode 100644 index 000000000..d240faaf5 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_campo_largo.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrCampoLargoSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4104204" + name = "pr_campo_largo" + start_date = date(2006, 1, 20) # Edição 1 + city_subdomain = "campolargo" From b5b88fc60172a49248c43c3247098a6f0dc7ffd1 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:02:12 -0300 Subject: [PATCH 14/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fcandelaria'?= =?UTF-8?q?=20de=20Candel=C3=A1ria=20-=20RS.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/rs/rs_candelaria.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_candelaria.py diff --git a/data_collection/gazette/spiders/rs/rs_candelaria.py b/data_collection/gazette/spiders/rs/rs_candelaria.py new file mode 100644 index 000000000..5d0f7e717 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_candelaria.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsCandelariaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4304200" + name = "rs_candelaria" + start_date = date(2023, 5, 7) # Edição 1 + city_subdomain = "candelaria" From d5563a21717fdd6758e16b8e06ef322463d07374 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:02:28 -0300 Subject: [PATCH 15/40] =?UTF-8?q?Adiciona=20o=20spider=20'mg=5Fcarmopolis?= =?UTF-8?q?=5Fde=5Fminas'=20de=20Carm=C3=B3polis=20de=20Minas=20-=20MG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/mg/mg_carmopolis_de_minas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/mg/mg_carmopolis_de_minas.py diff --git a/data_collection/gazette/spiders/mg/mg_carmopolis_de_minas.py b/data_collection/gazette/spiders/mg/mg_carmopolis_de_minas.py new file mode 100644 index 000000000..0c8986b3d --- /dev/null +++ b/data_collection/gazette/spiders/mg/mg_carmopolis_de_minas.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class MgCarmopolisDeMinasSpider(BaseAtendeT2Spider): + TERRITORY_ID = "3114501" + name = "mg_carmopolis_de_minas" + start_date = date(2013, 1, 24) # Edição 64 + city_subdomain = "carmopolisdeminas" From 7320399bafcdf11bf4a9c549d3cce048dc9b2b30 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:02:44 -0300 Subject: [PATCH 16/40] Adiciona o spider 'pr_castro' de Castro - PR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_castro.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_castro.py diff --git a/data_collection/gazette/spiders/pr/pr_castro.py b/data_collection/gazette/spiders/pr/pr_castro.py new file mode 100644 index 000000000..6025759cd --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_castro.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrCastroSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4104907" + name = "pr_castro" + start_date = date(2010, 6, 4) # Edição 222 + city_subdomain = "castro" From 544aaee7571c27738c24b81e7c76946d4a787638 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:03:00 -0300 Subject: [PATCH 17/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Fclevelandia?= =?UTF-8?q?'=20de=20Clevel=C3=A2ndia=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_clevelandia.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_clevelandia.py diff --git a/data_collection/gazette/spiders/pr/pr_clevelandia.py b/data_collection/gazette/spiders/pr/pr_clevelandia.py new file mode 100644 index 000000000..a55c06d03 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_clevelandia.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrClevelandiaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4105706" + name = "pr_clevelandia" + start_date = date(2012, 3, 26) # Edição 60 + city_subdomain = "clevelandia" From 1cacfd82cb43f07469cd859f0c7bfc1101a8890f Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:03:15 -0300 Subject: [PATCH 18/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Fcorbelia'?= =?UTF-8?q?=20de=20Corb=C3=A9lia=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_corbelia.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_corbelia.py diff --git a/data_collection/gazette/spiders/pr/pr_corbelia.py b/data_collection/gazette/spiders/pr/pr_corbelia.py new file mode 100644 index 000000000..13482d9db --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_corbelia.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrCorbeliaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4106308" + name = "pr_corbelia" + start_date = date(2015, 11, 20) # Edição 1 + city_subdomain = "corbelia" From 240da152409446d2904d3c21e9fba695cb9dd2ec Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:03:32 -0300 Subject: [PATCH 19/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fdois=5Firma?= =?UTF-8?q?os'=20de=20Dois=20Irm=C3=A3os=20-=20RS.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/rs/rs_dois_irmaos.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_dois_irmaos.py diff --git a/data_collection/gazette/spiders/rs/rs_dois_irmaos.py b/data_collection/gazette/spiders/rs/rs_dois_irmaos.py new file mode 100644 index 000000000..2b1ca5436 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_dois_irmaos.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsDoisIrmaosSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4306403" + name = "rs_dois_irmaos" + start_date = date(2020, 1, 7) # Edição 1 + city_subdomain = "doisirmaos" From b56b8c31575dab481ff8b0a73b2c3ad92171acb6 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:03:47 -0300 Subject: [PATCH 20/40] Adiciona o spider 'rs_estrela' de Estrela - RS. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/rs/rs_estrela.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_estrela.py diff --git a/data_collection/gazette/spiders/rs/rs_estrela.py b/data_collection/gazette/spiders/rs/rs_estrela.py new file mode 100644 index 000000000..391ee6ea1 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_estrela.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsEstrelaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4307807" + name = "rs_estrela" + start_date = date(2021, 3, 29) # Edição 1 + city_subdomain = "estrela" From a7c1884e3c781161340875c9b932ee663c9dcd41 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:04:03 -0300 Subject: [PATCH 21/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Fguaraniacu'?= =?UTF-8?q?=20de=20Guarania=C3=A7u=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/pr/pr_guaraniacu.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_guaraniacu.py diff --git a/data_collection/gazette/spiders/pr/pr_guaraniacu.py b/data_collection/gazette/spiders/pr/pr_guaraniacu.py new file mode 100644 index 000000000..712777b9e --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_guaraniacu.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrGuaraniacuSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4109302" + name = "pr_guaraniacu" + start_date = date(2023, 5, 3) # Edição 1 + city_subdomain = "guaraniacu" From 31a2cf2bd182e7c89d82ebfc3d6d7486eccc09f3 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:04:20 -0300 Subject: [PATCH 22/40] Adiciona o spider 'rs_horizontina' de Horizontina - RS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/rs/rs_horizontina.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_horizontina.py diff --git a/data_collection/gazette/spiders/rs/rs_horizontina.py b/data_collection/gazette/spiders/rs/rs_horizontina.py new file mode 100644 index 000000000..262e8b11b --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_horizontina.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsHorizontinaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4309605" + name = "rs_horizontina" + start_date = date(2016, 6, 15) # Edição 1 + city_subdomain = "horizontina" From f5c30009bc720e8d1b90d3e086a99ac37bd7c456 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:04:35 -0300 Subject: [PATCH 23/40] Adiciona o spider 'pr_juranda' de Juranda - PR. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/pr/pr_juranda.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_juranda.py diff --git a/data_collection/gazette/spiders/pr/pr_juranda.py b/data_collection/gazette/spiders/pr/pr_juranda.py new file mode 100644 index 000000000..dd699120b --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_juranda.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrJurandaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4112959" + name = "pr_juranda" + start_date = date(2021, 3, 24) # Edição 1 + city_subdomain = "juranda" From de9f280b639e9edcef72a2df1f4435f9fb9c2d91 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:04:50 -0300 Subject: [PATCH 24/40] Adiciona o spider 'sc_laurentino' de Laurentino - SC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/sc/sc_laurentino.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/sc/sc_laurentino.py diff --git a/data_collection/gazette/spiders/sc/sc_laurentino.py b/data_collection/gazette/spiders/sc/sc_laurentino.py new file mode 100644 index 000000000..76144582f --- /dev/null +++ b/data_collection/gazette/spiders/sc/sc_laurentino.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class ScLaurentinoSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4209508" + name = "sc_laurentino" + start_date = date(2021, 7, 1) # Edição 1 + city_subdomain = "laurentino" From 59a3565fb26d737ec02b77481b3c2bd1d4c3ca5e Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:05:02 -0300 Subject: [PATCH 25/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Fmambore'=20?= =?UTF-8?q?de=20Mambor=C3=AA=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/pr/pr_mambore.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_mambore.py diff --git a/data_collection/gazette/spiders/pr/pr_mambore.py b/data_collection/gazette/spiders/pr/pr_mambore.py new file mode 100644 index 000000000..d2b301667 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_mambore.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrMamboreSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4114005" + name = "pr_mambore" + start_date = date(2020, 5, 25) # Edição 1 + city_subdomain = "mambore" From 238dbef30509efb929b04b18779f2c98c6620000 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:05:13 -0300 Subject: [PATCH 26/40] Adiciona o spider 'mg_oliveira' de Oliveira - MG. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/mg/mg_oliveira.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/mg/mg_oliveira.py diff --git a/data_collection/gazette/spiders/mg/mg_oliveira.py b/data_collection/gazette/spiders/mg/mg_oliveira.py new file mode 100644 index 000000000..afb2d12d6 --- /dev/null +++ b/data_collection/gazette/spiders/mg/mg_oliveira.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class MgOliveiraSpider(BaseAtendeT2Spider): + TERRITORY_ID = "3145604" + name = "mg_oliveira" + start_date = date(2014, 10, 22) # Edição 1 + city_subdomain = "oliveira" From 1b56af0b928b1c296b77fc86cb0eaf4569353237 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:05:27 -0300 Subject: [PATCH 27/40] Adiciona o spider 'pr_ouro_verde_do_oeste' de Ouro Verde do Oeste - PR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/pr/pr_ouro_verde_do_oeste.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_ouro_verde_do_oeste.py diff --git a/data_collection/gazette/spiders/pr/pr_ouro_verde_do_oeste.py b/data_collection/gazette/spiders/pr/pr_ouro_verde_do_oeste.py new file mode 100644 index 000000000..278346250 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_ouro_verde_do_oeste.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrOuroVerdeDoOesteSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4117453" + name = "pr_ouro_verde_do_oeste" + start_date = date(2021, 3, 31) # Edição 1 + city_subdomain = "ouroverdedooeste" From b1c76b671414fe380bf418e17253ae935dffc9db Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:05:44 -0300 Subject: [PATCH 28/40] Adiciona o spider 'rs_panambi' de Panambi - RS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/rs/rs_panambi.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_panambi.py diff --git a/data_collection/gazette/spiders/rs/rs_panambi.py b/data_collection/gazette/spiders/rs/rs_panambi.py new file mode 100644 index 000000000..797286525 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_panambi.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsPanambiSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4313904" + name = "rs_panambi" + start_date = date(2021, 4, 14) # Edição 1 + city_subdomain = "panambi" From a3f834a964f61a2a26f126eb16ee79bee99ff2c3 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:06:04 -0300 Subject: [PATCH 29/40] Adiciona o spider 'pr_pinhais' de Pinhais - PR. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/pr/pr_pinhais.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_pinhais.py diff --git a/data_collection/gazette/spiders/pr/pr_pinhais.py b/data_collection/gazette/spiders/pr/pr_pinhais.py new file mode 100644 index 000000000..d302b9ba1 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_pinhais.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrPinhaisSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4119152" + name = "pr_pinhais" + start_date = date(2017, 5, 26) # Edição 1 + city_subdomain = "pinhais" From 5bedc3c95c5cf48cb4bfba9908bc3abd8ac42197 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:06:21 -0300 Subject: [PATCH 30/40] Adiciona o spider 'pr_rio_branco_do_sul' de Rio Branco do Sul - PR. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/pr/pr_rio_branco_do_sul.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_rio_branco_do_sul.py diff --git a/data_collection/gazette/spiders/pr/pr_rio_branco_do_sul.py b/data_collection/gazette/spiders/pr/pr_rio_branco_do_sul.py new file mode 100644 index 000000000..2e9212ece --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_rio_branco_do_sul.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrRioBrancoDoSulSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4122206" + name = "pr_rio_branco_do_sul" + start_date = date(2022, 11, 20) # Edição 2608 + city_subdomain = "riobrancodosul" From 7a04b8689e48cf41f7478fcbfdfe8878b36b6071 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:06:38 -0300 Subject: [PATCH 31/40] Adiciona o spider 'rs_santa_rosa' de Santa Rosa - RS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/rs/rs_santa_rosa.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_santa_rosa.py diff --git a/data_collection/gazette/spiders/rs/rs_santa_rosa.py b/data_collection/gazette/spiders/rs/rs_santa_rosa.py new file mode 100644 index 000000000..881beb010 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_santa_rosa.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsSantaRosaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4317202" + name = "rs_santa_rosa" + start_date = date(2022, 8, 23) # Edição 1 + city_subdomain = "santarosa" From 93a47bc0b000e1f2a88dc0f669d833622b928d79 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:06:52 -0300 Subject: [PATCH 32/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Fsanto=5Fant?= =?UTF-8?q?onio=5Fda=5Fplatina'=20de=20Santo=20Ant=C3=B4nio=20da=20Platina?= =?UTF-8?q?=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/pr/pr_santo_antonio_da_platina.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_santo_antonio_da_platina.py diff --git a/data_collection/gazette/spiders/pr/pr_santo_antonio_da_platina.py b/data_collection/gazette/spiders/pr/pr_santo_antonio_da_platina.py new file mode 100644 index 000000000..038715e6b --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_santo_antonio_da_platina.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrSantoAntonioDaPlatinaSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4124103" + name = "pr_santo_antonio_da_platina" + start_date = date(2012, 11, 27) # Edição 1 + city_subdomain = "santoantoniodaplatina" From 2ed19e341bcff464c2841cf1445c319e70008948 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:07:10 -0300 Subject: [PATCH 33/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fsao=5Fjoao?= =?UTF-8?q?=5Fdo=5Fpolesine'=20de=20S=C3=A3o=20Jo=C3=A3o=20do=20Pol=C3=AAs?= =?UTF-8?q?ine=20-=20RS.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- .../gazette/spiders/rs/rs_sao_joao_do_polesine.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_sao_joao_do_polesine.py diff --git a/data_collection/gazette/spiders/rs/rs_sao_joao_do_polesine.py b/data_collection/gazette/spiders/rs/rs_sao_joao_do_polesine.py new file mode 100644 index 000000000..d93e213ec --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_sao_joao_do_polesine.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsSaoJoaoDoPolesineSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4318432" + name = "rs_sao_joao_do_polesine" + start_date = date(2021, 5, 28) # Edição 1 + city_subdomain = "saojoaodopolesine" From 9ad4d0cda0bb28a1a21f78908cb7c120ea9323c4 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:07:27 -0300 Subject: [PATCH 34/40] Adiciona o spider 'rs_sobradinho' de Sobradinho - RS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). 
--- data_collection/gazette/spiders/rs/rs_sobradinho.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_sobradinho.py diff --git a/data_collection/gazette/spiders/rs/rs_sobradinho.py b/data_collection/gazette/spiders/rs/rs_sobradinho.py new file mode 100644 index 000000000..74511995f --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_sobradinho.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class RsSobradinhoSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4320701" + name = "rs_sobradinho" + start_date = date(2020, 3, 5) # Edição 1 + city_subdomain = "sobradinho" From 8479713fd504062c85eb10e92439097e6cfa8085 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:07:46 -0300 Subject: [PATCH 35/40] =?UTF-8?q?Adiciona=20o=20spider=20'pr=5Ftupassi'=20?= =?UTF-8?q?de=20Tup=C3=A3ssi=20-=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/pr/pr_tupassi.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_tupassi.py diff --git a/data_collection/gazette/spiders/pr/pr_tupassi.py b/data_collection/gazette/spiders/pr/pr_tupassi.py new file mode 100644 index 000000000..0ca393fa9 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_tupassi.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrTupassiSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4127957" + name = "pr_tupassi" + start_date = date(2016, 6, 27) # Edição 14 + city_subdomain = "tupassi" From cf144d78a6edc23cca3ba423c3ba87b250c7e3a9 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Mon, 4 Dec 2023 13:08:05 -0300 Subject: [PATCH 36/40] Adiciona o spider 'pr_pato_bragado' de Pato Bragado - PR. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sistema replicável 'Atende' (layout Tipo 2). --- data_collection/gazette/spiders/pr/pr_pato_bragado.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 data_collection/gazette/spiders/pr/pr_pato_bragado.py diff --git a/data_collection/gazette/spiders/pr/pr_pato_bragado.py b/data_collection/gazette/spiders/pr/pr_pato_bragado.py new file mode 100644 index 000000000..97f43577f --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_pato_bragado.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT2Spider + + +class PrPatoBragadoSpider(BaseAtendeT2Spider): + TERRITORY_ID = "4118451" + name = "pr_pato_bragado" + start_date = date(2012, 5, 30) # Edição 1 + city_subdomain = "patobragado" From 9d362ff169bc60714a20a1e90227b1df38d8d921 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Tue, 5 Dec 2023 22:37:06 -0300 Subject: [PATCH 37/40] =?UTF-8?q?Adiciona=20a=20classe=20'BaseAtendeT1Spid?= =?UTF-8?q?er'=20para=20buscar=20os=20di=C3=A1rios=20nas=20p=C3=A1ginas=20?= =?UTF-8?q?do=20sistema=20replic=C3=A1vel=20'atende'=20com=20layout=20'Tip?= =?UTF-8?q?o=201'.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/base/atende.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/data_collection/gazette/spiders/base/atende.py b/data_collection/gazette/spiders/base/atende.py index ffd13a794..1328112f6 100644 --- a/data_collection/gazette/spiders/base/atende.py +++ b/data_collection/gazette/spiders/base/atende.py @@ -1,10 +1,108 @@ +import datetime as dt +import locale + import dateparser +from dateutil.relativedelta import relativedelta +from dateutil.rrule import MONTHLY, rrule from scrapy import Request from gazette.items import Gazette from gazette.spiders.base import BaseGazetteSpider +class BaseAtendeT1Spider(BaseGazetteSpider): + """ + Base 
spider for Gazettes that are available from cities listed on https://{city_subdomain}.atende.net + This base class deals with layout 'Type 1' gazette pages, usually requested + from 'https://{city_subdomain}.atende.net/cidadao/pagina/diario-oficial'. + """ + + # Must be defined in child classes + city_subdomain = "" + perm_url = "" + + start_edition = 0 + power = "executive_legislative" + extra_edition_options = ("extra", "anexo", "segunda", "2ª", "especial", "decreto") + allowed_domains = ["atende.net"] + + def start_requests(self): + locale.setlocale(locale.LC_ALL, "pt_BR.utf8") + yield Request(self.get_url(self.perm_url)) + + def parse(self, response): + # cachoeirinha files aren't available in the first page. They are in month pages. + month_pages = response.css("a.itemPaginasRelacionadas::attr(href)") + if len(month_pages) > 0: + # There is no link for 05/2016 in the first page. + # for month_page in month_pages: + # yield Request(self.get_url(month_page.split("/")[-1]), self.parse) + monthly_rec = rrule( + MONTHLY, + dtstart=self.start_date + relativedelta(day=1), + until=self.end_date, + )[::-1] + month_urls = [ + self.get_url(f"diario-oficial-{rec.strftime('%B-%Y')}") + for rec in monthly_rec + ] + for month_url in month_urls: + yield Request(month_url, self.parse) + else: + click_links = response.css("div.arquivos a::attr(onclick)") + title_list = response.css("div.arquivos li::attr(data-original-title)") + if len(click_links) != len(title_list): + self.logger.warning( + f"Size of titles ({len(title_list)}) not equal to size of links ({len(click_links)})!" 
+ ) + for link, title in zip(click_links, title_list): + edition_number = title.re_first(r"[^\s/-][\d]+") + date_raw = title.re( + r"(\d{1,4})?[_ -]*([\dº]{2})[/._-](\d{2})[/._-](\d{2,4})" + ) + date_time = dateparser.parse( + ("/".join(date_raw[1:])).replace("º", ""), languages=["pt"] + ) + if date_time is None: + # self.logger.debug(f"Unable to parse date from {date_raw[1:]}!") + if not edition_number: + self.logger.debug( + f"Unable to parse edition number from '{title.get()}'!" + ) + continue + # date will be extracted from gazette in data processing stage. + gazette_date = dt.date.max + else: + gazette_date = date_time.date() + if not (self.start_date <= gazette_date <= self.end_date): + continue + if edition_number and int(edition_number) < self.start_edition: + continue + edition = title.get().strip() + is_extra = self.is_extra_edition(edition.lower()) + file_id = link.re_first(r"arquivo\('(\w+)") + download_url = f"{self.get_base_url()}?rot=1&aca=571&ajax=t&processo=downloadFile&file={file_id}&sistema=WPO&classe=UploadMidia" + yield Gazette( + date=gazette_date, + edition_number=edition, + file_urls=[download_url], + is_extra_edition=is_extra, + power=self.power, + ) + + def is_extra_edition(self, edition_text): + for option in self.extra_edition_options: + if option in edition_text: + return True + return False + + def get_base_url(self): + return f"https://{self.city_subdomain}.atende.net/cidadao/pagina/atende.php" + + def get_url(self, p_url): + return f"{self.get_base_url()}?rot=49094&aca=101&ajax=t&processo=loadPluginPortal&parametro=%7B%22codigoPlugin%22%3A65,%22parametroPlugin%22%3A%7B%22loadDetalhes%22%3Atrue%7D,%22filtroPlugin%22%3A%7B%22paginaUrlPermanente%22%3A%22{p_url}%22%7D%7D" + + class BaseAtendeT2Spider(BaseGazetteSpider): """ Base spider for Gazzetes that are available from cities listed on https://{city_subdomain}.atende.net From 6e952ce4c423ae7e27c6344a26441dca31d7b4b0 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Tue, 5 Dec 2023 22:42:01 
-0300 Subject: [PATCH 38/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fcachoeirinh?= =?UTF-8?q?a'=20de=20Cachoeirinha=20-=20RS.=20Sistema=20replic=C3=A1vel=20?= =?UTF-8?q?'Atende'=20(layout=20Tipo=201).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/rs/rs_cachoeirinha.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_cachoeirinha.py diff --git a/data_collection/gazette/spiders/rs/rs_cachoeirinha.py b/data_collection/gazette/spiders/rs/rs_cachoeirinha.py new file mode 100644 index 000000000..c09a911b1 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_cachoeirinha.py @@ -0,0 +1,11 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT1Spider + + +class RsCachoeirinhaSpider(BaseAtendeT1Spider): + TERRITORY_ID = "4303103" + name = "rs_cachoeirinha" + start_date = date(2013, 5, 15) # Edição 1 + city_subdomain = "cachoeirinha" + perm_url = "diario-oficial-de-cachoeirinha" From a944881b4dcc3070870bced7e6fd3f9625c6e589 Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Tue, 5 Dec 2023 22:58:21 -0300 Subject: [PATCH 39/40] =?UTF-8?q?Adiciona=20o=20spider=20'rs=5Fcamaqua=5F2?= =?UTF-8?q?023'=20de=20Camaqu=C3=A3=20-=20RS.=20Sistema=20replic=C3=A1vel?= =?UTF-8?q?=20'Atende'=20(layout=20Tipo=201).=20Resolve=20#1038=20Conforme?= =?UTF-8?q?=20comentado=20em=20https://github.com/okfn-brasil/querido-diar?= =?UTF-8?q?io/issues/1038#issuecomment-1821603875=20esse=20novo=20spider?= =?UTF-8?q?=20coleta=20apenas=20da=20edi=C3=A7=C3=A3o=20333=20em=20diante.?= =?UTF-8?q?=20Como=20falta=20a=20informa=C3=A7=C3=A3o=20de=20data=20para?= =?UTF-8?q?=20muitas=20edi=C3=A7=C3=B5es=20na=20p=C3=A1gina,=20a=20estrat?= =?UTF-8?q?=C3=A9gia=20foi=20incluir=20a=20verifica=C3=A7=C3=A3o=20do=20n?= =?UTF-8?q?=C3=BAmero=20da=20edi=C3=A7=C3=A3o=20para=20limitar=20o=20downl?= 
=?UTF-8?q?oad=20dos=20arquivos=20e=20salvar=20a=20data=20`datetime.date.m?= =?UTF-8?q?ax`=20nos=20metadados=20para=20que=20a=20etapa=20de=20'data=20p?= =?UTF-8?q?rocessing'=20realize=20a=20extra=C3=A7=C3=A3o=20dessa=20informa?= =?UTF-8?q?=C3=A7=C3=A3o.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/rs/rs_camaqua_2023.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 data_collection/gazette/spiders/rs/rs_camaqua_2023.py diff --git a/data_collection/gazette/spiders/rs/rs_camaqua_2023.py b/data_collection/gazette/spiders/rs/rs_camaqua_2023.py new file mode 100644 index 000000000..922e8d397 --- /dev/null +++ b/data_collection/gazette/spiders/rs/rs_camaqua_2023.py @@ -0,0 +1,12 @@ +from datetime import date + +from gazette.spiders.base.atende import BaseAtendeT1Spider + + +class RsCamaqua2023Spider(BaseAtendeT1Spider): + TERRITORY_ID = "4303509" + name = "rs_camaqua_2023" + start_date = date(2023, 7, 20) # Edição 333 + city_subdomain = "camaqua" + perm_url = "diario-oficial" + start_edition = 333 From ded3a1efbbb5ad89540b1ae385a4b26bb629fc4e Mon Sep 17 00:00:00 2001 From: Alex Silva Date: Tue, 5 Dec 2023 23:06:23 -0300 Subject: [PATCH 40/40] =?UTF-8?q?Inclui=20a=20informa=C3=A7=C3=A3o=20de=20?= =?UTF-8?q?'end=5Fdate'=20para=20o=20spider=20original=20de=20Camaqu=C3=A3?= =?UTF-8?q?,=20para=20refletir=20o=20=C3=BAltimo=20dia=20antes=20da=20publ?= =?UTF-8?q?ica=C3=A7=C3=A3o=20da=20edi=C3=A7=C3=A3o=20333=20no=20novo=20si?= =?UTF-8?q?te=20atende.net=20Ajuste=20associado=20=C3=A0=20issue=20#1038?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/rs/rs_camaqua.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data_collection/gazette/spiders/rs/rs_camaqua.py b/data_collection/gazette/spiders/rs/rs_camaqua.py index d3523e92b..5e002a9f1 100644 --- a/data_collection/gazette/spiders/rs/rs_camaqua.py +++ 
b/data_collection/gazette/spiders/rs/rs_camaqua.py @@ -9,3 +9,4 @@ class RsCamaquaSpider(BaseInstarSpider): allowed_domains = ["camaqua.rs.gov.br"] base_url = "https://www.camaqua.rs.gov.br/portal/diario-oficial" start_date = date(2019, 7, 25) + end_date = date(2023, 7, 19)