Remove Scrapy Contracts from existing spiders
The initial idea behind using Contracts was to have a quick
(for experienced developers) and beginner-friendly way of writing
Scrapy tests, sparing developers from having to write test cases,
create fixtures, and keep them up to date.
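
For reference, a Scrapy contract is just a set of annotations in a callback's
docstring; scrapy check fetches the @url and validates the returned items and
requests. A minimal sketch of the syntax, using a hypothetical spider name and
URL rather than one from this repository:

    import scrapy


    class ExampleGazetteSpider(scrapy.Spider):
        # Hypothetical spider, used only to illustrate the contract syntax.
        name = "example_gazette"

        def parse(self, response):
            """Contract annotations live in the callback's docstring.

            @url http://www.example.com/diario-oficial/
            @returns items 1 10
            @returns requests 0 5
            @scrapes date file_urls is_extra_edition territory_id power scraped_at
            """
            # Normal parsing logic would go here.
            ...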

The downside is that these contracts commonly raise false positives:
the scraper is working correctly, but the contract fails, temporarily
or not, for other reasons (for example, the live page being unavailable
or having changed its layout).

Since the number of false positives has grown over the past months
without maintenance, I am removing the contracts now. We should think
about alternatives in the future.
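
One possible alternative, sketched below only as a hypothesis and not as part
of this commit, is a plain pytest test that feeds a saved copy of a listing
page to the callback instead of hitting the live site (the fixture path and
the assertion are assumptions):

    from pathlib import Path

    from scrapy.http import HtmlResponse

    from gazette.spiders.al_maceio import AlMaceioSpider


    def test_parse_yields_results_from_saved_page():
        # Hypothetical fixture: a copy of the gazette listing page saved to disk.
        body = Path("tests/fixtures/al_maceio.html").read_bytes()
        response = HtmlResponse(
            url="http://www.maceio.al.gov.br/noticias/diario-oficial/",
            body=body,
            encoding="utf-8",
        )
        results = list(AlMaceioSpider().parse(response))
        assert results, "expected items or follow-up requests from the saved page"

This trades the network-dependent false positives for the fixture maintenance
that contracts were meant to avoid, so it is only one option to discuss.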

Following the discussion in #44.
Irio committed Jun 1, 2019
1 parent b575ed1 commit 3eaeba4
Showing 24 changed files with 8 additions and 168 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
@@ -14,5 +14,4 @@ before_install:
install:
- make setup
script:
- make unit_test
- docker-compose run --rm processing black . --check
- make test
17 changes: 7 additions & 10 deletions Makefile
@@ -1,12 +1,3 @@
test: unit_test integration_test

unit_test:
docker-compose run --rm processing pytest

integration_test:
docker-compose run --rm processing bash -c "cd data_collection && scrapy check"
docker-compose run --rm processing black . --check

setup:
cp .env.example .env
docker-compose pull
@@ -17,5 +8,11 @@ setup:

seed:
docker-compose up -d postgres
docker-compose run --rm processing python3 -c "import database; database.initialize()"
docker-compose run --rm processing python3 -c 'import database; database.initialize()'
docker-compose run --rm processing bash -c 'echo "\copy territories FROM /mnt/data/territories.csv CSV HEADER;" | psql $$DATABASE_URL'

test: unit_test
docker-compose run --rm processing black . --check

unit_test:
docker-compose run --rm processing pytest
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/al_maceio.py
@@ -15,12 +15,6 @@ class AlMaceioSpider(BaseGazetteSpider):
page_number = 1

def parse(self, response):
"""
@url http://www.maceio.al.gov.br/noticias/diario-oficial/
@returns items 0 9
@returns requests 1 10
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
gazettes = list(response.xpath("//article"))
for gazette in gazettes:
url = gazette.xpath("a/@href").extract_first()
@@ -52,12 +46,6 @@ def parse(self, response):
)

def parse_additional_page(self, response):
"""
@url http://www.maceio.al.gov.br/noticias/diario-oficial/
@returns items 1 9
@returns requests 1
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
url = response.xpath('//p[@class="attachment"]/a/@href').extract_first()
gazette = self.create_gazette(
response.meta["date"], url, response.meta["is_extra_edition"]
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/am_manaus.py
@@ -35,12 +35,6 @@ def start_requests(self):
yield Request(self.LEGISLATIVE_URL, self.parse_legislative)

def parse_executive(self, response):
"""
@url http://dom.manaus.am.gov.br/diario-oficial-de-manaus
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.EXECUTIVE_GAZETTE_ROW_CSS):
url = element.css(self.EXECUTIVE_PDF_HREF_CSS).extract_first()
date = element.css(self.EXECUTIVE_DATE_CSS).extract_first()
@@ -56,12 +50,6 @@ def parse_executive(self, response):
yield Request(url, self.parse_executive)

def parse_legislative(self, response):
"""
@url http://www.cmm.am.gov.br/diario-oficial/
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.LEGISLATIVE_GAZETTE_ROW_CSS):
if not element.css("td"):
continue
@@ -17,10 +17,6 @@ class BaFeiraDeSantanaSpider(BaseGazetteSpider):
last_page = 1

def parse(self, response):
"""
@url http://www.diariooficial.feiradesantana.ba.gov.br/?p=29
@returns requests 30
"""
gazette_table = response.css(".style166")
gazettes_links = gazette_table.xpath("a//@href").extract()
dates = gazette_table.css("a::text").extract()
6 changes: 0 additions & 6 deletions processing/data_collection/gazette/spiders/ce_fortaleza.py
@@ -27,12 +27,6 @@ def start_requests(self):
yield Request(year_url)

def parse(self, response):
"""
@url http://apps.fortaleza.ce.gov.br/diariooficial/
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.GAZETTE_ELEMENT_CSS):
url = response.urljoin(element.css("a::attr(href)").extract_first())
date = dateparser.parse(
@@ -14,12 +14,6 @@ class EsAssociacaoMunicipiosSpider(BaseGazetteSpider):
start_urls = ["https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1"]

def parse(self, response):
"""
@url https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1
@returns items 15 15
@returns requests 1 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for gazette_node in response.css(".items tbody tr"):
url = gazette_node.css("[download]::attr(href)").extract_first()
date = gazette_node.css("td::text")[1].extract()
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/go_goiania.py
@@ -18,21 +18,12 @@ class GoGoianiaSpider(BaseGazetteSpider):
)

def parse(self, response):
"""
@url http://www4.goiania.go.gov.br/portal/site.asp?s=775&m=2075
@returns requests 4
"""
current_year = dt.date.today().year
for year in range(current_year, 2014, -1):
url = self.gazettes_list_url.format(year)
yield scrapy.Request(url, self.parse_year)

def parse_year(self, response):
"""
@url http://www.goiania.go.gov.br/shtml//portal/casacivil/lista_diarios.asp?ano=2018
@returns items 75
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
# The page with the list of gazettes is simply a table with links
links = response.css("a")
items = []
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/ms_campo_grande.py
@@ -27,11 +27,6 @@ def start_requests(self):
)

def parse(self, response):
"""
@url http://portal.capital.ms.gov.br/diogrande/diarioOficial
@returns items 1
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
year = response.css("#leftToRight > h3").extract_first().split("/")[1]
docs = response.css(".arquivos li")
for doc in docs:
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/pr_cascavel.py
@@ -14,11 +14,6 @@ class PrCascavelSpider(BaseGazetteSpider):
download_url = "http://www.cascavel.pr.gov.br/anexos/{}"

def parse(self, response):
"""
@url http://www.cascavel.pr.gov.br/servicos/orgao_oficial.php
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for row in response.xpath("//table//tr[position()>1]"):
date = row.xpath(".//td[2]//font//text()").extract_first()
date = parse(date, languages=["pt"]).date()
6 changes: 0 additions & 6 deletions processing/data_collection/gazette/spiders/pr_curitiba.py
@@ -14,12 +14,6 @@ class PrCuritibaSpider(BaseGazetteSpider):
custom_settings = {"DEFAULT_REQUEST_HEADERS": {"user-agent": "Mozilla/5.0"}}

def start_requests(self):
"""
The Curitiba website is a statefull page, so we can't just build the
request from zero, we have to resend the viewstate with every request.
@url http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx
@returns requests 1
"""
for year in range(date.today().year, 2006, -1):
yield scrapy.FormRequest(
"http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx",
@@ -14,10 +14,6 @@ class PrFozDoIguacuSpider(BaseGazetteSpider):
start_urls = [f"{BASE_URL}/utilidades/diario/index.xhtml"]

def parse(self, response):
"""
@url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
@returns requests 1 1
"""
selector = '(//span[@class="ui-paginator-current"])[1]/text()'
paginator_text = response.xpath(selector)
quantity_of_documents = int(paginator_text.re_first("\([\d]+ de ([\d]+)\)")) + 1
@@ -33,10 +29,6 @@ def parse(self, response):
return FormRequest(response.url, formdata=data, callback=self.parse_items)

def parse_items(self, response):
"""
@url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
@returns items 10 10
"""
lines = response.xpath('//tr[@role="row"]')
for line in lines:
date, url, is_extra_edition = self.get_gazette_data(line)
4 changes: 0 additions & 4 deletions processing/data_collection/gazette/spiders/pr_ponta_grossa.py
@@ -16,10 +16,6 @@ class PrPontaGrossaSpider(BaseGazetteSpider):
starting_year = 2015

def parse(self, response):
"""
@url http://www.pontagrossa.pr.gov.br/diario-oficial/
@returns requests 1
"""
links = response.css(".view-content .field a")
smallest_year = min(
(p["date"].year for p in self.pdf_infos(links, self.starting_year)),
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/ro_porto_velho.py
@@ -26,11 +26,6 @@ def start_requests(self):
yield Request(f"{self.BASE_URL}{date.year}/{date.month}")

def parse(self, response):
"""
@url https://www.portovelho.ro.gov.br/dom/datatablearquivosmes/2017/1
@returns items 20 20
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
paragraphs = json.loads(response.body_as_unicode())["aaData"]
for paragraph, *_ in paragraphs:
selector = Selector(text=paragraph)
@@ -29,10 +29,6 @@ def start_requests(self):
yield scrapy.Request(url)

def parse(self, response):
"""
@url https://doe.caxias.rs.gov.br/site/index?PublicacoesSearch[dt_publicacao]=&PublicacoesSearch[dt_range]=01-01-15+até+31-12-18&PublicacoesSearch[palavra_chave]=&PublicacoesSearch[num_publicacao]=&page=1
@returns requests 11 11
"""
for gazette_node in response.css(".table tbody tr"):
item = self.gazette(response, gazette_node)
pdf_page_url = gazette_node.css("a::attr(href)").extract_first()
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/rs_porto_alegre.py
@@ -14,10 +14,6 @@ class RsPortoAlegreSpider(BaseGazetteSpider):
start_urls = ["http://www2.portoalegre.rs.gov.br/dopa/"]

def parse(self, response):
"""
@url http://www2.portoalegre.rs.gov.br/dopa/
@returns requests 48
"""
selector = (
'//ul[contains(@id, "menucss")]'
'/descendant::*[contains(text(), "Diário Oficial {}")]'
@@ -31,11 +27,6 @@ def parse(self, response):
yield scrapy.Request(url, self.parse_month_page)

def parse_month_page(self, response):
"""
@url http://www2.portoalegre.rs.gov.br/dopa/default.php?p_secao=1431
@returns items 58 58
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
links = response.css("#conteudo a")
items = []
for link in links:
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/sc_florianopolis.py
@@ -16,13 +16,6 @@ class ScFlorianopolisSpider(BaseGazetteSpider):
AVAILABLE_FROM = date(2015, 1, 1) # actually from June/2009

def start_requests(self):
"""The City Hall website publish the gazettes in a page with a form
that allow users to browse through different years and months. This
form sends requests via POST, so this method emulates a series of these
POSTs.
@url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
@returns requests 1
"""
target = date.today()
while target >= self.AVAILABLE_FROM:
year, month = str(target.year), str(target.month)
@@ -31,11 +24,6 @@ def start_requests(self):
target = target + relativedelta(months=1)

def parse(self, response):
"""Parse each page. Each list all gazettes for a given month.
@url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for link in response.css("ul.listagem li a"):
url = self.get_pdf_url(response, link)
if not url:
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/sp_campinas.py
@@ -19,10 +19,6 @@ class SpCampinasSpider(BaseGazetteSpider):
)

def parse(self, response):
"""
@url http://www.campinas.sp.gov.br/diario-oficial/index.php
@returns requests 4
"""
today = dt.date.today()
next_year = today.year + 1
for year in range(2015, next_year):
@@ -34,11 +30,6 @@ def parse(self, response):
yield scrapy.Request(url, self.parse_month_page)

def parse_month_page(self, response):
"""
@url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
@returns items 23 23
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
items = []
month_year = response.css(
".tabelaDiario:first-child tr th:nth-child(2)::text"
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/sp_franca.py
@@ -19,10 +19,6 @@ class SpFrancaSpider(BaseGazetteSpider):
documents_url = "http://www.franca.sp.gov.br/arquivos/diario-oficial/documentos/{}"

def parse(self, response):
"""
@url http://www.franca.sp.gov.br/pmf-diario/rest/diario/init
@returns requests 10
"""
dates = set(json.loads(response.body_as_unicode()))

start_date = dt.date(2015, 1, 1)
@@ -35,11 +31,6 @@ def parse(self, response):
start_date += delta

def parse_document(self, response):
"""
@url http://www.franca.sp.gov.br/pmf-diario/rest/diario/buscaPorArquivo/03-01-2018
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
items = []

document = json.loads(response.body_as_unicode())[0]
8 changes: 0 additions & 8 deletions processing/data_collection/gazette/spiders/sp_guaruja.py
@@ -14,19 +14,11 @@ class SpGuaruja(BaseGazetteSpider):
start_urls = ["http://www.guaruja.sp.gov.br/index.php/diario-oficial/"]

def parse(self, response):
"""
@url http://www.guaruja.sp.gov.br/index.php/diario-oficial/
@returns requests 26
"""
months = response.css("div.span12 a::attr(href)").extract()
for month_url in months:
yield scrapy.Request(month_url, self.parse_items)

def parse_items(self, response):
"""
@url http://www.guaruja.sp.gov.br/index.php/maio-2/maio2018/
@returns items 22 22
"""
gazettes = response.css("div.span12 p")
for gazette in gazettes:
date = gazette.css("a ::text").extract_first()
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_guarulhos.py
@@ -22,11 +22,6 @@ def start_requests(self):
)

def parse(self, response):
"""
@url http://www.guarulhos.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
@returns items 17 17
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
diarios = response.xpath('//div[contains(@id, "diario")]')
items = []
for diario in diarios:
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_jundiai.py
@@ -25,11 +25,6 @@ def parse_gazette(self, response):
yield response.follow(next_page_url, callback=self.parse)

def parse_gazette(self, response):
"""
@url https://imprensaoficial.jundiai.sp.gov.br/edicao-4403
@returns items 1 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
gazette_date = parse(
response.css(".edicao-data::text").extract_first(""), languages=["pt"]
).date()
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_santos.py
@@ -13,11 +13,6 @@ class SpSantosSpider(BaseGazetteSpider):
download_url = "https://diariooficial.santos.sp.gov.br/edicoes/inicio/download/{}"

def parse(self, response):
"""
@url https://diariooficial.santos.sp.gov.br/
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
# all of the dates with gazettes are available inside the following hidden textarea:
dates = response.css("#datas.hidden::text").extract_first()

