Adds a base spider for the replicable Atende system, and derived spiders for the cities whose Official Gazettes (D.O.s) use that system. #1046

Draft: wants to merge 40 commits into base: main

Commits (40)
b178d13
Spider for Campo Mourao/PR
rodps Apr 24, 2021
efe5f05
automated code formatting
Apr 26, 2021
5ebd2ed
Changes in Campo Mourao spider
rodps May 23, 2021
0b8a210
Moves pr_campo_mourao created in #438 to serve as a reference for the crea…
AlexJBSilva Nov 27, 2023
88f036c
Adds the base spider for the replicable Atende system.
AlexJBSilva Nov 29, 2023
5158d72
Adjusts the 'pr_campo_mourao' spider created in #438
AlexJBSilva Nov 29, 2023
c463564
Adjusts the 'rs_gravatai' spider to work with the
AlexJBSilva Dec 2, 2023
06e517f
Adds the 'pr_araucaria' spider for Araucária - PR.
AlexJBSilva Dec 3, 2023
8aa4867
Adjusts the 'atende' base spider to fetch the url
AlexJBSilva Dec 3, 2023
443b256
Adds the 'pr_apucarana' spider for Apucarana - PR.
AlexJBSilva Dec 4, 2023
1e709d9
Adds the 'sc_araquari' spider for Araquari - SC.
AlexJBSilva Dec 4, 2023
79d3e6e
Adds the 'rs_bento_goncalves' spider for Bento Gonçalves - RS.
AlexJBSilva Dec 4, 2023
9e28719
Adds the 'pr_campo_largo' spider for Campo Largo - PR.
AlexJBSilva Dec 4, 2023
b5b88fc
Adds the 'rs_candelaria' spider for Candelária - RS.
AlexJBSilva Dec 4, 2023
d5563a2
Adds the 'mg_carmopolis_de_minas' spider for Carmópolis de Minas - MG.
AlexJBSilva Dec 4, 2023
7320399
Adds the 'pr_castro' spider for Castro - PR.
AlexJBSilva Dec 4, 2023
544aaee
Adds the 'pr_clevelandia' spider for Clevelândia - PR.
AlexJBSilva Dec 4, 2023
1cacfd8
Adds the 'pr_corbelia' spider for Corbélia - PR.
AlexJBSilva Dec 4, 2023
240da15
Adds the 'rs_dois_irmaos' spider for Dois Irmãos - RS.
AlexJBSilva Dec 4, 2023
b56b8c3
Adds the 'rs_estrela' spider for Estrela - RS.
AlexJBSilva Dec 4, 2023
a7c1884
Adds the 'pr_guaraniacu' spider for Guaraniaçu - PR.
AlexJBSilva Dec 4, 2023
31a2cf2
Adds the 'rs_horizontina' spider for Horizontina - RS.
AlexJBSilva Dec 4, 2023
f5c3000
Adds the 'pr_juranda' spider for Juranda - PR.
AlexJBSilva Dec 4, 2023
de9f280
Adds the 'sc_laurentino' spider for Laurentino - SC.
AlexJBSilva Dec 4, 2023
59a3565
Adds the 'pr_mambore' spider for Mamborê - PR.
AlexJBSilva Dec 4, 2023
238dbef
Adds the 'mg_oliveira' spider for Oliveira - MG.
AlexJBSilva Dec 4, 2023
1b56af0
Adds the 'pr_ouro_verde_do_oeste' spider for Ouro Verde do Oeste - PR.
AlexJBSilva Dec 4, 2023
b1c76b6
Adds the 'rs_panambi' spider for Panambi - RS.
AlexJBSilva Dec 4, 2023
a3f834a
Adds the 'pr_pinhais' spider for Pinhais - PR.
AlexJBSilva Dec 4, 2023
5bedc3c
Adds the 'pr_rio_branco_do_sul' spider for Rio Branco do Sul - PR.
AlexJBSilva Dec 4, 2023
7a04b86
Adds the 'rs_santa_rosa' spider for Santa Rosa - RS.
AlexJBSilva Dec 4, 2023
93a47bc
Adds the 'pr_santo_antonio_da_platina' spider for Santo Antônio da P…
AlexJBSilva Dec 4, 2023
2ed19e3
Adds the 'rs_sao_joao_do_polesine' spider for São João do Polêsine -…
AlexJBSilva Dec 4, 2023
9ad4d0c
Adds the 'rs_sobradinho' spider for Sobradinho - RS.
AlexJBSilva Dec 4, 2023
8479713
Adds the 'pr_tupassi' spider for Tupãssi - PR.
AlexJBSilva Dec 4, 2023
cf144d7
Adds the 'pr_pato_bragado' spider for Pato Bragado - PR.
AlexJBSilva Dec 4, 2023
9d362ff
Adds the 'BaseAtendeT1Spider' class to fetch
AlexJBSilva Dec 6, 2023
6e952ce
Adds the 'rs_cachoeirinha' spider for Cachoeirinha - RS.
AlexJBSilva Dec 6, 2023
a944881
Adds the 'rs_camaqua_2023' spider for Camaquã - RS.
AlexJBSilva Dec 6, 2023
ded3a1e
Includes the 'end_date' information for the spider
AlexJBSilva Dec 6, 2023
189 changes: 189 additions & 0 deletions data_collection/gazette/spiders/base/atende.py
@@ -0,0 +1,189 @@
import datetime as dt
import locale

import dateparser
from dateutil.relativedelta import relativedelta
from dateutil.rrule import MONTHLY, rrule
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAtendeT1Spider(BaseGazetteSpider):
"""
Base spider for Gazzetes that are available from cities listed on https://{city_subdomain}.atende.net
This base class deals with layout 'Type 1' gazette pages, usually requested
from 'https://{city_subdomain}.atende.net/cidadao/pagina/diario-oficial'.
"""

    # Must be defined in child classes
city_subdomain = ""
perm_url = ""

start_edition = 0
power = "executive_legislative"
extra_edition_options = ("extra", "anexo", "segunda", "2ª", "especial", "decreto")
allowed_domains = ["atende.net"]

    def start_requests(self):
        # pt_BR locale so strftime("%B") yields the Portuguese month names used in month-page URLs
        locale.setlocale(locale.LC_ALL, "pt_BR.utf8")
        yield Request(self.get_url(self.perm_url))

def parse(self, response):
        # Cachoeirinha files aren't available on the first page; they are on per-month pages.
month_pages = response.css("a.itemPaginasRelacionadas::attr(href)")
if len(month_pages) > 0:
            # The first page has no link for 05/2016, so month-page URLs are
            # built from the date range instead of following the listed links:
            # for month_page in month_pages:
            #     yield Request(self.get_url(month_page.split("/")[-1]), self.parse)
monthly_rec = rrule(
MONTHLY,
dtstart=self.start_date + relativedelta(day=1),
until=self.end_date,
)[::-1]
month_urls = [
self.get_url(f"diario-oficial-{rec.strftime('%B-%Y')}")
for rec in monthly_rec
]
for month_url in month_urls:
yield Request(month_url, self.parse)
else:
click_links = response.css("div.arquivos a::attr(onclick)")
title_list = response.css("div.arquivos li::attr(data-original-title)")
if len(click_links) != len(title_list):
self.logger.warning(
f"Size of titles ({len(title_list)}) not equal to size of links ({len(click_links)})!"
)
for link, title in zip(click_links, title_list):
                # Both the edition number and the date are embedded in the item title
                edition_number = title.re_first(r"[^\s/-][\d]+")
date_raw = title.re(
r"(\d{1,4})?[_ -]*([\dº]{2})[/._-](\d{2})[/._-](\d{2,4})"
)
date_time = dateparser.parse(
("/".join(date_raw[1:])).replace("º", ""), languages=["pt"]
)
if date_time is None:
# self.logger.debug(f"Unable to parse date from {date_raw[1:]}!")
if not edition_number:
self.logger.debug(
f"Unable to parse edition number from '{title.get()}'!"
)
continue
                    # The date will be extracted from the gazette file at the data processing stage
                    gazette_date = dt.date.max
else:
gazette_date = date_time.date()
if not (self.start_date <= gazette_date <= self.end_date):
continue
if edition_number and int(edition_number) < self.start_edition:
continue
edition = title.get().strip()
is_extra = self.is_extra_edition(edition.lower())
file_id = link.re_first(r"arquivo\('(\w+)")
download_url = f"{self.get_base_url()}?rot=1&aca=571&ajax=t&processo=downloadFile&file={file_id}&sistema=WPO&classe=UploadMidia"
yield Gazette(
date=gazette_date,
edition_number=edition,
file_urls=[download_url],
is_extra_edition=is_extra,
power=self.power,
)

    def is_extra_edition(self, edition_text):
        return any(option in edition_text for option in self.extra_edition_options)

def get_base_url(self):
return f"https://{self.city_subdomain}.atende.net/cidadao/pagina/atende.php"

    def get_url(self, p_url):
        # 'parametro' is URL-encoded JSON: {"codigoPlugin":65,"parametroPlugin":{"loadDetalhes":true},
        # "filtroPlugin":{"paginaUrlPermanente":"<p_url>"}}
        return f"{self.get_base_url()}?rot=49094&aca=101&ajax=t&processo=loadPluginPortal&parametro=%7B%22codigoPlugin%22%3A65,%22parametroPlugin%22%3A%7B%22loadDetalhes%22%3Atrue%7D,%22filtroPlugin%22%3A%7B%22paginaUrlPermanente%22%3A%22{p_url}%22%7D%7D"


class BaseAtendeT2Spider(BaseGazetteSpider):
"""
Base spider for Gazzetes that are available from cities listed on https://{city_subdomain}.atende.net
This base class deals with layout 'Type 2' gazette pages, usually requested
from 'https://{city_subdomain}.atende.net/diariooficial'.
"""

    # Must be defined in child classes
city_subdomain = ""

power = "executive_legislative"

start_page = 1
end_page = 0
extra_edition_options = ("suplementar", "retificação", "extraordinária", "extra")
allowed_domains = ["atende.net"]

def start_requests(self):
yield Request(self.get_url(self.start_page))

    # 'page' defaults to the class attribute start_page, bound at class-definition time
    def parse(self, response, page=start_page):
lines = response.css("div.nova_listagem div.linha")
for line in lines:
date_raw = line.css("div.data::text").get()
date_time = dateparser.parse(date_raw, languages=["pt"])
if date_time is None:
self.logger.debug(f"Unable to parse date from text {date_raw}!")
continue
date = date_time.date()

            if date > self.end_date:
                continue
            if date < self.start_date:
                # The listing is newest first, so every remaining line is older; stop here
                return

edition_type = line.css("div.tipo::text").get()
is_extra = (
edition_type.lower() in self.extra_edition_options
if edition_type
else False
)
edition_number = line.css("div.titulo::text").get()
# edition_number = lines.css("div.titulo::text").re_first(r"[^\s][\d.]+")
gazette = Gazette(
date=date,
edition_number=edition_number,
is_extra_edition=is_extra,
power=self.power,
)
download_urls = line.css("button::attr(data-link)")
if len(download_urls) > 0:
gazette["file_urls"] = [download_urls[-1].get()]
yield gazette
            else:
                # No download link in the listing row; fetch the edition details page instead
                edition_id = line.css("span.bt_detalhes::attr(data-id)").get()
edition_url = f"{self.get_base_url()}&parametro=%7B%22codigoPlugin%22%3A2,%22filtroPlugin%22%3A%7B%22codigoEdicao%22%3A%22{edition_id}%22%7D%7D"
yield Request(
edition_url, self.parse_edition, cb_kwargs={"gazette": gazette}
)

if self.end_page < 1:
pages = response.css("div#paginacao li.dst button::attr(value)").getall()
if len(pages) > 1:
self.end_page = int(pages[-1])
else:
self.logger.debug("Unable to find the last page!")

page += 1
if page <= self.end_page:
yield response.follow(self.get_url(page), cb_kwargs={"page": page})

def parse_edition(self, response, gazette):
download_url = response.css(
"button.visualizacao_versao_completa::attr(data-link)"
).get()
gazette["file_urls"] = [download_url]
yield gazette

def get_base_url(self):
return f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php?rot=54015&aca=101&ajax=t&processo=loadPluginDiarioOficial"

def get_url(self, page):
return f"{self.get_base_url()}&parametro=%7B%22codigoPlugin%22%3A1,%22filtroPlugin%22%3A%7B%22pagina%22%3A%22{page}%22%7D%7D"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/mg/mg_carmopolis_de_minas.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class MgCarmopolisDeMinasSpider(BaseAtendeT2Spider):
TERRITORY_ID = "3114501"
name = "mg_carmopolis_de_minas"
start_date = date(2013, 1, 24) # Edição 64
city_subdomain = "carmopolisdeminas"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/mg/mg_oliveira.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class MgOliveiraSpider(BaseAtendeT2Spider):
TERRITORY_ID = "3145604"
name = "mg_oliveira"
start_date = date(2014, 10, 22) # Edição 1
city_subdomain = "oliveira"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_apucarana.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrApucaranaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4101408"
name = "pr_apucarana"
start_date = date(2022, 2, 23) # Edição 1
city_subdomain = "apucarana"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_araucaria.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrAraucariaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4101804"
name = "pr_araucaria"
start_date = date(2018, 12, 19) # Edição 246
city_subdomain = "araucaria"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_campo_largo.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrCampoLargoSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4104204"
name = "pr_campo_largo"
start_date = date(2006, 1, 20) # Edição 1
city_subdomain = "campolargo"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_campo_mourao.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrCampoMouraoSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4104303"
name = "pr_campo_mourao"
start_date = date(2012, 2, 3) # Edição 1511
city_subdomain = "campomourao"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_castro.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrCastroSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4104907"
name = "pr_castro"
start_date = date(2010, 6, 4) # Edição 222
city_subdomain = "castro"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_clevelandia.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrClevelandiaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4105706"
name = "pr_clevelandia"
start_date = date(2012, 3, 26) # Edição 60
city_subdomain = "clevelandia"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_corbelia.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrCorbeliaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4106308"
name = "pr_corbelia"
start_date = date(2015, 11, 20) # Edição 1
city_subdomain = "corbelia"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_guaraniacu.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrGuaraniacuSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4109302"
name = "pr_guaraniacu"
start_date = date(2023, 5, 3) # Edição 1
city_subdomain = "guaraniacu"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_juranda.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrJurandaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4112959"
name = "pr_juranda"
start_date = date(2021, 3, 24) # Edição 1
city_subdomain = "juranda"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_mambore.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrMamboreSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4114005"
name = "pr_mambore"
start_date = date(2020, 5, 25) # Edição 1
city_subdomain = "mambore"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_ouro_verde_do_oeste.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrOuroVerdeDoOesteSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4117453"
name = "pr_ouro_verde_do_oeste"
start_date = date(2021, 3, 31) # Edição 1
city_subdomain = "ouroverdedooeste"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_pato_bragado.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrPatoBragadoSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4118451"
name = "pr_pato_bragado"
start_date = date(2012, 5, 30) # Edição 1
city_subdomain = "patobragado"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_pinhais.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrPinhaisSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4119152"
name = "pr_pinhais"
start_date = date(2017, 5, 26) # Edição 1
city_subdomain = "pinhais"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_rio_branco_do_sul.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrRioBrancoDoSulSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4122206"
name = "pr_rio_branco_do_sul"
start_date = date(2022, 11, 20) # Edição 2608
city_subdomain = "riobrancodosul"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_santo_antonio_da_platina.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrSantoAntonioDaPlatinaSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4124103"
name = "pr_santo_antonio_da_platina"
start_date = date(2012, 11, 27) # Edição 1
city_subdomain = "santoantoniodaplatina"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_tupassi.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class PrTupassiSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4127957"
name = "pr_tupassi"
start_date = date(2016, 6, 27) # Edição 14
city_subdomain = "tupassi"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rs/rs_bento_goncalves.py
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.atende import BaseAtendeT2Spider


class RsBentoGoncalvesSpider(BaseAtendeT2Spider):
TERRITORY_ID = "4302105"
name = "rs_bento_goncalves"
start_date = date(2019, 4, 1) # Edição 1124
city_subdomain = "bentogoncalves"