Remove Scrapy Contracts from existing spiders
The initial idea behind using Contracts was to have a quick
(for experienced developers) and beginner-friendly way of writing
Scrapy tests, sparing developers from having to write test cases,
create fixtures, and keep them up to date.
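
For reference, a Scrapy contract is just a set of annotations in a callback's
docstring; scrapy check fetches the @url and validates the returned items and
requests. A minimal sketch of the syntax, using a hypothetical spider name and
URL rather than one from this repository:

    import scrapy


    class ExampleGazetteSpider(scrapy.Spider):
        # Hypothetical spider, used only to illustrate the contract syntax.
        name = "example_gazette"

        def parse(self, response):
            """Contract annotations live in the callback's docstring.

            @url http://www.example.com/diario-oficial/
            @returns items 1 10
            @returns requests 0 5
            @scrapes date file_urls is_extra_edition territory_id power scraped_at
            """
            # Normal parsing logic would go here.
            ...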

The downside is that these contracts commonly raise false positives:
the scraper is working correctly, but the contract fails, temporarily
or not, for other reasons (for example, the live page being unavailable
or having changed its layout).

Since the number of false positives has grown over the past months
without maintenance, I am removing the contracts now. We should think
about alternatives in the future.
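
One possible alternative, sketched below only as a hypothesis and not as part
of this commit, is a plain pytest test that feeds a saved copy of a listing
page to the callback instead of hitting the live site (the fixture path and
the assertion are assumptions):

    from pathlib import Path

    from scrapy.http import HtmlResponse

    from gazette.spiders.al_maceio import AlMaceioSpider


    def test_parse_yields_results_from_saved_page():
        # Hypothetical fixture: a copy of the gazette listing page saved to disk.
        body = Path("tests/fixtures/al_maceio.html").read_bytes()
        response = HtmlResponse(
            url="http://www.maceio.al.gov.br/noticias/diario-oficial/",
            body=body,
            encoding="utf-8",
        )
        results = list(AlMaceioSpider().parse(response))
        assert results, "expected items or follow-up requests from the saved page"

This trades the network-dependent false positives for the fixture maintenance
that contracts were meant to avoid, so it is only one option to discuss.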

Following the discussion in #44.
Irio committed Jun 1, 2019
1 parent b575ed1 commit 3eaeba4
Showing 24 changed files with 8 additions and 168 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
@@ -14,5 +14,4 @@ before_install:
install:
- make setup
script:
- make unit_test
- docker-compose run --rm processing black . --check
- make test
17 changes: 7 additions & 10 deletions Makefile
@@ -1,12 +1,3 @@
test: unit_test integration_test

unit_test:
docker-compose run --rm processing pytest

integration_test:
docker-compose run --rm processing bash -c "cd data_collection && scrapy check"
docker-compose run --rm processing black . --check

setup:
cp .env.example .env
docker-compose pull
@@ -17,5 +8,11 @@ setup:

seed:
docker-compose up -d postgres
docker-compose run --rm processing python3 -c "import database; database.initialize()"
docker-compose run --rm processing python3 -c 'import database; database.initialize()'
docker-compose run --rm processing bash -c 'echo "\copy territories FROM /mnt/data/territories.csv CSV HEADER;" | psql $$DATABASE_URL'

test: unit_test
docker-compose run --rm processing black . --check

unit_test:
docker-compose run --rm processing pytest
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/al_maceio.py
@@ -15,12 +15,6 @@ class AlMaceioSpider(BaseGazetteSpider):
page_number = 1

def parse(self, response):
"""
@url http://www.maceio.al.gov.br/noticias/diario-oficial/
@returns items 0 9
@returns requests 1 10
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
gazettes = list(response.xpath("//article"))
for gazette in gazettes:
url = gazette.xpath("a/@href").extract_first()
@@ -52,12 +46,6 @@ def parse(self, response):
)

def parse_additional_page(self, response):
"""
@url http://www.maceio.al.gov.br/noticias/diario-oficial/
@returns items 1 9
@returns requests 1
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
url = response.xpath('//p[@class="attachment"]/a/@href').extract_first()
gazette = self.create_gazette(
response.meta["date"], url, response.meta["is_extra_edition"]
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/am_manaus.py
@@ -35,12 +35,6 @@ def start_requests(self):
yield Request(self.LEGISLATIVE_URL, self.parse_legislative)

def parse_executive(self, response):
"""
@url http://dom.manaus.am.gov.br/diario-oficial-de-manaus
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.EXECUTIVE_GAZETTE_ROW_CSS):
url = element.css(self.EXECUTIVE_PDF_HREF_CSS).extract_first()
date = element.css(self.EXECUTIVE_DATE_CSS).extract_first()
@@ -56,12 +50,6 @@ def parse_executive(self, response):
yield Request(url, self.parse_executive)

def parse_legislative(self, response):
"""
@url http://www.cmm.am.gov.br/diario-oficial/
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.LEGISLATIVE_GAZETTE_ROW_CSS):
if not element.css("td"):
continue
@@ -17,10 +17,6 @@ class BaFeiraDeSantanaSpider(BaseGazetteSpider):
last_page = 1

def parse(self, response):
"""
@url http://www.diariooficial.feiradesantana.ba.gov.br/?p=29
@returns requests 30
"""
gazette_table = response.css(".style166")
gazettes_links = gazette_table.xpath("a//@href").extract()
dates = gazette_table.css("a::text").extract()
6 changes: 0 additions & 6 deletions processing/data_collection/gazette/spiders/ce_fortaleza.py
@@ -27,12 +27,6 @@ def start_requests(self):
yield Request(year_url)

def parse(self, response):
"""
@url http://apps.fortaleza.ce.gov.br/diariooficial/
@returns requests 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""

for element in response.css(self.GAZETTE_ELEMENT_CSS):
url = response.urljoin(element.css("a::attr(href)").extract_first())
date = dateparser.parse(
@@ -14,12 +14,6 @@ class EsAssociacaoMunicipiosSpider(BaseGazetteSpider):
start_urls = ["https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1"]

def parse(self, response):
"""
@url https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1
@returns items 15 15
@returns requests 1 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for gazette_node in response.css(".items tbody tr"):
url = gazette_node.css("[download]::attr(href)").extract_first()
date = gazette_node.css("td::text")[1].extract()
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/go_goiania.py
@@ -18,21 +18,12 @@ class GoGoianiaSpider(BaseGazetteSpider):
)

def parse(self, response):
"""
@url http://www4.goiania.go.gov.br/portal/site.asp?s=775&m=2075
@returns requests 4
"""
current_year = dt.date.today().year
for year in range(current_year, 2014, -1):
url = self.gazettes_list_url.format(year)
yield scrapy.Request(url, self.parse_year)

def parse_year(self, response):
"""
@url http://www.goiania.go.gov.br/shtml//portal/casacivil/lista_diarios.asp?ano=2018
@returns items 75
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
# The page with the list of gazettes is simply a table with links
links = response.css("a")
items = []
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/ms_campo_grande.py
@@ -27,11 +27,6 @@ def start_requests(self):
)

def parse(self, response):
"""
@url http://portal.capital.ms.gov.br/diogrande/diarioOficial
@returns items 1
@scrapes date file_urls is_extra_edition municipality_id power scraped_at
"""
year = response.css("#leftToRight > h3").extract_first().split("/")[1]
docs = response.css(".arquivos li")
for doc in docs:
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/pr_cascavel.py
@@ -14,11 +14,6 @@ class PrCascavelSpider(BaseGazetteSpider):
download_url = "http://www.cascavel.pr.gov.br/anexos/{}"

def parse(self, response):
"""
@url http://www.cascavel.pr.gov.br/servicos/orgao_oficial.php
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for row in response.xpath("//table//tr[position()>1]"):
date = row.xpath(".//td[2]//font//text()").extract_first()
date = parse(date, languages=["pt"]).date()
6 changes: 0 additions & 6 deletions processing/data_collection/gazette/spiders/pr_curitiba.py
@@ -14,12 +14,6 @@ class PrCuritibaSpider(BaseGazetteSpider):
custom_settings = {"DEFAULT_REQUEST_HEADERS": {"user-agent": "Mozilla/5.0"}}

def start_requests(self):
"""
The Curitiba website is a statefull page, so we can't just build the
request from zero, we have to resend the viewstate with every request.
@url http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx
@returns requests 1
"""
for year in range(date.today().year, 2006, -1):
yield scrapy.FormRequest(
"http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx",
@@ -14,10 +14,6 @@ class PrFozDoIguacuSpider(BaseGazetteSpider):
start_urls = [f"{BASE_URL}/utilidades/diario/index.xhtml"]

def parse(self, response):
"""
@url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
@returns requests 1 1
"""
selector = '(//span[@class="ui-paginator-current"])[1]/text()'
paginator_text = response.xpath(selector)
quantity_of_documents = int(paginator_text.re_first("\([\d]+ de ([\d]+)\)")) + 1
@@ -33,10 +29,6 @@ def parse(self, response):
return FormRequest(response.url, formdata=data, callback=self.parse_items)

def parse_items(self, response):
"""
@url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
@returns items 10 10
"""
lines = response.xpath('//tr[@role="row"]')
for line in lines:
date, url, is_extra_edition = self.get_gazette_data(line)
4 changes: 0 additions & 4 deletions processing/data_collection/gazette/spiders/pr_ponta_grossa.py
@@ -16,10 +16,6 @@ class PrPontaGrossaSpider(BaseGazetteSpider):
starting_year = 2015

def parse(self, response):
"""
@url http://www.pontagrossa.pr.gov.br/diario-oficial/
@returns requests 1
"""
links = response.css(".view-content .field a")
smallest_year = min(
(p["date"].year for p in self.pdf_infos(links, self.starting_year)),
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/ro_porto_velho.py
@@ -26,11 +26,6 @@ def start_requests(self):
yield Request(f"{self.BASE_URL}{date.year}/{date.month}")

def parse(self, response):
"""
@url https://www.portovelho.ro.gov.br/dom/datatablearquivosmes/2017/1
@returns items 20 20
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
paragraphs = json.loads(response.body_as_unicode())["aaData"]
for paragraph, *_ in paragraphs:
selector = Selector(text=paragraph)
@@ -29,10 +29,6 @@ def start_requests(self):
yield scrapy.Request(url)

def parse(self, response):
"""
@url https://doe.caxias.rs.gov.br/site/index?PublicacoesSearch[dt_publicacao]=&PublicacoesSearch[dt_range]=01-01-15+até+31-12-18&PublicacoesSearch[palavra_chave]=&PublicacoesSearch[num_publicacao]=&page=1
@returns requests 11 11
"""
for gazette_node in response.css(".table tbody tr"):
item = self.gazette(response, gazette_node)
pdf_page_url = gazette_node.css("a::attr(href)").extract_first()
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/rs_porto_alegre.py
@@ -14,10 +14,6 @@ class RsPortoAlegreSpider(BaseGazetteSpider):
start_urls = ["http://www2.portoalegre.rs.gov.br/dopa/"]

def parse(self, response):
"""
@url http://www2.portoalegre.rs.gov.br/dopa/
@returns requests 48
"""
selector = (
'//ul[contains(@id, "menucss")]'
'/descendant::*[contains(text(), "Diário Oficial {}")]'
@@ -31,11 +27,6 @@ def parse(self, response):
yield scrapy.Request(url, self.parse_month_page)

def parse_month_page(self, response):
"""
@url http://www2.portoalegre.rs.gov.br/dopa/default.php?p_secao=1431
@returns items 58 58
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
links = response.css("#conteudo a")
items = []
for link in links:
12 changes: 0 additions & 12 deletions processing/data_collection/gazette/spiders/sc_florianopolis.py
@@ -16,13 +16,6 @@ class ScFlorianopolisSpider(BaseGazetteSpider):
AVAILABLE_FROM = date(2015, 1, 1) # actually from June/2009

def start_requests(self):
"""The City Hall website publish the gazettes in a page with a form
that allow users to browse through different years and months. This
form sends requests via POST, so this method emulates a series of these
POSTs.
@url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
@returns requests 1
"""
target = date.today()
while target >= self.AVAILABLE_FROM:
year, month = str(target.year), str(target.month)
@@ -31,11 +24,6 @@ def start_requests(self):
target = target + relativedelta(months=1)

def parse(self, response):
"""Parse each page. Each list all gazettes for a given month.
@url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
for link in response.css("ul.listagem li a"):
url = self.get_pdf_url(response, link)
if not url:
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/sp_campinas.py
@@ -19,10 +19,6 @@ class SpCampinasSpider(BaseGazetteSpider):
)

def parse(self, response):
"""
@url http://www.campinas.sp.gov.br/diario-oficial/index.php
@returns requests 4
"""
today = dt.date.today()
next_year = today.year + 1
for year in range(2015, next_year):
@@ -34,11 +30,6 @@ def parse(self, response):
yield scrapy.Request(url, self.parse_month_page)

def parse_month_page(self, response):
"""
@url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
@returns items 23 23
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
items = []
month_year = response.css(
".tabelaDiario:first-child tr th:nth-child(2)::text"
9 changes: 0 additions & 9 deletions processing/data_collection/gazette/spiders/sp_franca.py
@@ -19,10 +19,6 @@ class SpFrancaSpider(BaseGazetteSpider):
documents_url = "http://www.franca.sp.gov.br/arquivos/diario-oficial/documentos/{}"

def parse(self, response):
"""
@url http://www.franca.sp.gov.br/pmf-diario/rest/diario/init
@returns requests 10
"""
dates = set(json.loads(response.body_as_unicode()))

start_date = dt.date(2015, 1, 1)
@@ -35,11 +31,6 @@ def parse(self, response):
start_date += delta

def parse_document(self, response):
"""
@url http://www.franca.sp.gov.br/pmf-diario/rest/diario/buscaPorArquivo/03-01-2018
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
items = []

document = json.loads(response.body_as_unicode())[0]
8 changes: 0 additions & 8 deletions processing/data_collection/gazette/spiders/sp_guaruja.py
@@ -14,19 +14,11 @@ class SpGuaruja(BaseGazetteSpider):
start_urls = ["http://www.guaruja.sp.gov.br/index.php/diario-oficial/"]

def parse(self, response):
"""
@url http://www.guaruja.sp.gov.br/index.php/diario-oficial/
@returns requests 26
"""
months = response.css("div.span12 a::attr(href)").extract()
for month_url in months:
yield scrapy.Request(month_url, self.parse_items)

def parse_items(self, response):
"""
@url http://www.guaruja.sp.gov.br/index.php/maio-2/maio2018/
@returns items 22 22
"""
gazettes = response.css("div.span12 p")
for gazette in gazettes:
date = gazette.css("a ::text").extract_first()
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_guarulhos.py
@@ -22,11 +22,6 @@ def start_requests(self):
)

def parse(self, response):
"""
@url http://www.guarulhos.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
@returns items 17 17
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
diarios = response.xpath('//div[contains(@id, "diario")]')
items = []
for diario in diarios:
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_jundiai.py
@@ -25,11 +25,6 @@ def parse_gazette(self, response):
yield response.follow(next_page_url, callback=self.parse)

def parse_gazette(self, response):
"""
@url https://imprensaoficial.jundiai.sp.gov.br/edicao-4403
@returns items 1 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
gazette_date = parse(
response.css(".edicao-data::text").extract_first(""), languages=["pt"]
).date()
5 changes: 0 additions & 5 deletions processing/data_collection/gazette/spiders/sp_santos.py
@@ -13,11 +13,6 @@ class SpSantosSpider(BaseGazetteSpider):
download_url = "https://diariooficial.santos.sp.gov.br/edicoes/inicio/download/{}"

def parse(self, response):
"""
@url https://diariooficial.santos.sp.gov.br/
@returns items 1
@scrapes date file_urls is_extra_edition territory_id power scraped_at
"""
# all of the dates with gazettes are available inside the following hidden textarea:
dates = response.css("#datas.hidden::text").extract_first()

