From 3eaeba451ea814661ba19a44e4c6f4412a290bd7 Mon Sep 17 00:00:00 2001
From: Irio Musskopf
Date: Sat, 1 Jun 2019 09:58:32 +0200
Subject: [PATCH] Remove Scrapy Contracts from existing spiders

The initial idea of using Contracts was to have a quick (for experienced
developers) and beginner-friendly way of writing Scrapy tests, sparing
developers from having to write test cases, create fixtures, and keep them
up to date. The downside is that these contracts commonly raise false
positives: the scraper is working correctly, but the contract fails,
temporarily or not, for other reasons. Since the number of false positives
has grown over the past months without maintenance, I am removing them for
now. We should think about alternatives in the future.

Following the discussion in
https://github.com/okfn-brasil/diario-oficial/issues/44.
---
 .travis.yml                                      |  3 +--
 Makefile                                         | 17 +++++++----------
 .../gazette/spiders/al_maceio.py                 | 12 ------------
 .../gazette/spiders/am_manaus.py                 | 12 ------------
 .../gazette/spiders/ba_feira_de_santana.py       |  4 ----
 .../gazette/spiders/ce_fortaleza.py              |  6 ------
 .../gazette/spiders/es_associacao_municipios.py  |  6 ------
 .../gazette/spiders/go_goiania.py                |  9 ---------
 .../gazette/spiders/ms_campo_grande.py           |  5 -----
 .../gazette/spiders/pr_cascavel.py               |  5 -----
 .../gazette/spiders/pr_curitiba.py               |  6 ------
 .../gazette/spiders/pr_foz_do_iguacu.py          |  8 --------
 .../gazette/spiders/pr_ponta_grossa.py           |  4 ----
 .../gazette/spiders/ro_porto_velho.py            |  5 -----
 .../gazette/spiders/rs_caxias_do_sul.py          |  4 ----
 .../gazette/spiders/rs_porto_alegre.py           |  9 ---------
 .../gazette/spiders/sc_florianopolis.py          | 12 ------------
 .../gazette/spiders/sp_campinas.py               |  9 ---------
 .../gazette/spiders/sp_franca.py                 |  9 ---------
 .../gazette/spiders/sp_guaruja.py                |  8 --------
 .../gazette/spiders/sp_guarulhos.py              |  5 -----
 .../gazette/spiders/sp_jundiai.py                |  5 -----
 .../gazette/spiders/sp_santos.py                 |  5 -----
 .../gazette/spiders/to_palmas.py                 |  8 --------
 24 files changed, 8 insertions(+), 168 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 8a5bb06b8..52f639239 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,5 +14,4 @@ before_install:
 install:
   - make setup
 script:
-  - make unit_test
-  - docker-compose run --rm processing black . --check
+  - make test
diff --git a/Makefile b/Makefile
index dc1f30918..517cf9678 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,3 @@
-test: unit_test integration_test
-
-unit_test:
-	docker-compose run --rm processing pytest
-
-integration_test:
-	docker-compose run --rm processing bash -c "cd data_collection && scrapy check"
-	docker-compose run --rm processing black . --check
-
 setup:
 	cp .env.example .env
 	docker-compose pull
@@ -17,5 +8,11 @@ setup:
 
 seed:
 	docker-compose up -d postgres
-	docker-compose run --rm processing python3 -c "import database; database.initialize()"
+	docker-compose run --rm processing python3 -c 'import database; database.initialize()'
 	docker-compose run --rm processing bash -c 'echo "\copy territories FROM /mnt/data/territories.csv CSV HEADER;" | psql $$DATABASE_URL'
+
+test: unit_test
+	docker-compose run --rm processing black . --check
+
+unit_test:
+	docker-compose run --rm processing pytest
diff --git a/processing/data_collection/gazette/spiders/al_maceio.py b/processing/data_collection/gazette/spiders/al_maceio.py
index 3ad54b547..20bec0de0 100644
--- a/processing/data_collection/gazette/spiders/al_maceio.py
+++ b/processing/data_collection/gazette/spiders/al_maceio.py
@@ -15,12 +15,6 @@ class AlMaceioSpider(BaseGazetteSpider):
     page_number = 1
 
     def parse(self, response):
-        """
-        @url http://www.maceio.al.gov.br/noticias/diario-oficial/
-        @returns items 0 9
-        @returns requests 1 10
-        @scrapes date file_urls is_extra_edition municipality_id power scraped_at
-        """
         gazettes = list(response.xpath("//article"))
         for gazette in gazettes:
             url = gazette.xpath("a/@href").extract_first()
@@ -52,12 +46,6 @@ def parse(self, response):
             )
 
     def parse_additional_page(self, response):
-        """
-        @url http://www.maceio.al.gov.br/noticias/diario-oficial/
-        @returns items 1 9
-        @returns requests 1
-        @scrapes date file_urls is_extra_edition municipality_id power scraped_at
-        """
         url = response.xpath('//p[@class="attachment"]/a/@href').extract_first()
         gazette = self.create_gazette(
             response.meta["date"], url, response.meta["is_extra_edition"]
diff --git a/processing/data_collection/gazette/spiders/am_manaus.py b/processing/data_collection/gazette/spiders/am_manaus.py
index 81e0dd278..7c812d832 100644
--- a/processing/data_collection/gazette/spiders/am_manaus.py
+++ b/processing/data_collection/gazette/spiders/am_manaus.py
@@ -35,12 +35,6 @@ def start_requests(self):
         yield Request(self.LEGISLATIVE_URL, self.parse_legislative)
 
     def parse_executive(self, response):
-        """
-        @url http://dom.manaus.am.gov.br/diario-oficial-de-manaus
-        @returns requests 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
-
         for element in response.css(self.EXECUTIVE_GAZETTE_ROW_CSS):
             url = element.css(self.EXECUTIVE_PDF_HREF_CSS).extract_first()
             date = element.css(self.EXECUTIVE_DATE_CSS).extract_first()
@@ -56,12 +50,6 @@ def parse_executive(self, response):
             yield Request(url, self.parse_executive)
 
     def parse_legislative(self, response):
-        """
-        @url http://www.cmm.am.gov.br/diario-oficial/
-        @returns requests 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
-
         for element in response.css(self.LEGISLATIVE_GAZETTE_ROW_CSS):
             if not element.css("td"):
                 continue
diff --git a/processing/data_collection/gazette/spiders/ba_feira_de_santana.py b/processing/data_collection/gazette/spiders/ba_feira_de_santana.py
index 486ea652c..89fdda18a 100644
--- a/processing/data_collection/gazette/spiders/ba_feira_de_santana.py
+++ b/processing/data_collection/gazette/spiders/ba_feira_de_santana.py
@@ -17,10 +17,6 @@ class BaFeiraDeSantanaSpider(BaseGazetteSpider):
     last_page = 1
 
     def parse(self, response):
-        """
-        @url http://www.diariooficial.feiradesantana.ba.gov.br/?p=29
-        @returns requests 30
-        """
         gazette_table = response.css(".style166")
         gazettes_links = gazette_table.xpath("a//@href").extract()
         dates = gazette_table.css("a::text").extract()
diff --git a/processing/data_collection/gazette/spiders/ce_fortaleza.py b/processing/data_collection/gazette/spiders/ce_fortaleza.py
index a4bc4dd12..13d2e6d7f 100644
--- a/processing/data_collection/gazette/spiders/ce_fortaleza.py
+++ b/processing/data_collection/gazette/spiders/ce_fortaleza.py
@@ -27,12 +27,6 @@ def start_requests(self):
             yield Request(year_url)
 
     def parse(self, response):
-        """
-        @url http://apps.fortaleza.ce.gov.br/diariooficial/
-        @returns requests 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
-
         for element in response.css(self.GAZETTE_ELEMENT_CSS):
             url = response.urljoin(element.css("a::attr(href)").extract_first())
             date = dateparser.parse(
diff --git a/processing/data_collection/gazette/spiders/es_associacao_municipios.py b/processing/data_collection/gazette/spiders/es_associacao_municipios.py
index 885903949..c60a2863a 100644
--- a/processing/data_collection/gazette/spiders/es_associacao_municipios.py
+++ b/processing/data_collection/gazette/spiders/es_associacao_municipios.py
@@ -14,12 +14,6 @@ class EsAssociacaoMunicipiosSpider(BaseGazetteSpider):
     start_urls = ["https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1"]
 
     def parse(self, response):
-        """
-        @url https://diariomunicipales.org.br/?r=site/edicoes&Edicao_page=1
-        @returns items 15 15
-        @returns requests 1 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         for gazette_node in response.css(".items tbody tr"):
             url = gazette_node.css("[download]::attr(href)").extract_first()
             date = gazette_node.css("td::text")[1].extract()
diff --git a/processing/data_collection/gazette/spiders/go_goiania.py b/processing/data_collection/gazette/spiders/go_goiania.py
index a16f4d57c..bdb17ab9a 100644
--- a/processing/data_collection/gazette/spiders/go_goiania.py
+++ b/processing/data_collection/gazette/spiders/go_goiania.py
@@ -18,21 +18,12 @@ class GoGoianiaSpider(BaseGazetteSpider):
     )
 
     def parse(self, response):
-        """
-        @url http://www4.goiania.go.gov.br/portal/site.asp?s=775&m=2075
-        @returns requests 4
-        """
         current_year = dt.date.today().year
         for year in range(current_year, 2014, -1):
             url = self.gazettes_list_url.format(year)
             yield scrapy.Request(url, self.parse_year)
 
     def parse_year(self, response):
-        """
-        @url http://www.goiania.go.gov.br/shtml//portal/casacivil/lista_diarios.asp?ano=2018
-        @returns items 75
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         # The page with the list of gazettes is simply a table with links
         links = response.css("a")
         items = []
diff --git a/processing/data_collection/gazette/spiders/ms_campo_grande.py b/processing/data_collection/gazette/spiders/ms_campo_grande.py
index 0ce6887bb..95c59212e 100644
--- a/processing/data_collection/gazette/spiders/ms_campo_grande.py
+++ b/processing/data_collection/gazette/spiders/ms_campo_grande.py
@@ -27,11 +27,6 @@ def start_requests(self):
         )
 
     def parse(self, response):
-        """
-        @url http://portal.capital.ms.gov.br/diogrande/diarioOficial
-        @returns items 1
-        @scrapes date file_urls is_extra_edition municipality_id power scraped_at
-        """
         year = response.css("#leftToRight > h3").extract_first().split("/")[1]
         docs = response.css(".arquivos li")
         for doc in docs:
diff --git a/processing/data_collection/gazette/spiders/pr_cascavel.py b/processing/data_collection/gazette/spiders/pr_cascavel.py
index 65c18e050..1344a99a5 100644
--- a/processing/data_collection/gazette/spiders/pr_cascavel.py
+++ b/processing/data_collection/gazette/spiders/pr_cascavel.py
@@ -14,11 +14,6 @@ class PrCascavelSpider(BaseGazetteSpider):
     download_url = "http://www.cascavel.pr.gov.br/anexos/{}"
 
     def parse(self, response):
-        """
-        @url http://www.cascavel.pr.gov.br/servicos/orgao_oficial.php
-        @returns items 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         for row in response.xpath("//table//tr[position()>1]"):
             date = row.xpath(".//td[2]//font//text()").extract_first()
             date = parse(date, languages=["pt"]).date()
diff --git a/processing/data_collection/gazette/spiders/pr_curitiba.py b/processing/data_collection/gazette/spiders/pr_curitiba.py
index d5d0617df..b5d001376 100644
--- a/processing/data_collection/gazette/spiders/pr_curitiba.py
+++ b/processing/data_collection/gazette/spiders/pr_curitiba.py
@@ -14,12 +14,6 @@ class PrCuritibaSpider(BaseGazetteSpider):
     custom_settings = {"DEFAULT_REQUEST_HEADERS": {"user-agent": "Mozilla/5.0"}}
 
     def start_requests(self):
-        """
-        The Curitiba website is a statefull page, so we can't just build the
-        request from zero, we have to resend the viewstate with every request.
-        @url http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx
-        @returns requests 1
-        """
         for year in range(date.today().year, 2006, -1):
             yield scrapy.FormRequest(
                 "http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx",
diff --git a/processing/data_collection/gazette/spiders/pr_foz_do_iguacu.py b/processing/data_collection/gazette/spiders/pr_foz_do_iguacu.py
index 19e9c0538..3ace0ab40 100644
--- a/processing/data_collection/gazette/spiders/pr_foz_do_iguacu.py
+++ b/processing/data_collection/gazette/spiders/pr_foz_do_iguacu.py
@@ -14,10 +14,6 @@ class PrFozDoIguacuSpider(BaseGazetteSpider):
     start_urls = [f"{BASE_URL}/utilidades/diario/index.xhtml"]
 
     def parse(self, response):
-        """
-        @url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
-        @returns requests 1 1
-        """
         selector = '(//span[@class="ui-paginator-current"])[1]/text()'
         paginator_text = response.xpath(selector)
         quantity_of_documents = int(paginator_text.re_first("\([\d]+ de ([\d]+)\)")) + 1
@@ -33,10 +29,6 @@ def parse(self, response):
         return FormRequest(response.url, formdata=data, callback=self.parse_items)
 
     def parse_items(self, response):
-        """
-        @url http://www.pmfi.pr.gov.br/utilidades/diario/index.xhtml
-        @returns items 10 10
-        """
         lines = response.xpath('//tr[@role="row"]')
         for line in lines:
             date, url, is_extra_edition = self.get_gazette_data(line)
diff --git a/processing/data_collection/gazette/spiders/pr_ponta_grossa.py b/processing/data_collection/gazette/spiders/pr_ponta_grossa.py
index 756468c88..3ac010fcb 100644
--- a/processing/data_collection/gazette/spiders/pr_ponta_grossa.py
+++ b/processing/data_collection/gazette/spiders/pr_ponta_grossa.py
@@ -16,10 +16,6 @@ class PrPontaGrossaSpider(BaseGazetteSpider):
     starting_year = 2015
 
     def parse(self, response):
-        """
-        @url http://www.pontagrossa.pr.gov.br/diario-oficial/
-        @returns requests 1
-        """
         links = response.css(".view-content .field a")
         smallest_year = min(
             (p["date"].year for p in self.pdf_infos(links, self.starting_year)),
diff --git a/processing/data_collection/gazette/spiders/ro_porto_velho.py b/processing/data_collection/gazette/spiders/ro_porto_velho.py
index ba799ca78..dfdef4c00 100644
--- a/processing/data_collection/gazette/spiders/ro_porto_velho.py
+++ b/processing/data_collection/gazette/spiders/ro_porto_velho.py
@@ -26,11 +26,6 @@ def start_requests(self):
             yield Request(f"{self.BASE_URL}{date.year}/{date.month}")
 
     def parse(self, response):
-        """
-        @url https://www.portovelho.ro.gov.br/dom/datatablearquivosmes/2017/1
-        @returns items 20 20
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         paragraphs = json.loads(response.body_as_unicode())["aaData"]
         for paragraph, *_ in paragraphs:
             selector = Selector(text=paragraph)
diff --git a/processing/data_collection/gazette/spiders/rs_caxias_do_sul.py b/processing/data_collection/gazette/spiders/rs_caxias_do_sul.py
index b8dd6ddfc..a243fa97a 100644
--- a/processing/data_collection/gazette/spiders/rs_caxias_do_sul.py
+++ b/processing/data_collection/gazette/spiders/rs_caxias_do_sul.py
@@ -29,10 +29,6 @@ def start_requests(self):
             yield scrapy.Request(url)
 
     def parse(self, response):
-        """
-        @url https://doe.caxias.rs.gov.br/site/index?PublicacoesSearch[dt_publicacao]=&PublicacoesSearch[dt_range]=01-01-15+até+31-12-18&PublicacoesSearch[palavra_chave]=&PublicacoesSearch[num_publicacao]=&page=1
-        @returns requests 11 11
-        """
         for gazette_node in response.css(".table tbody tr"):
             item = self.gazette(response, gazette_node)
             pdf_page_url = gazette_node.css("a::attr(href)").extract_first()
diff --git a/processing/data_collection/gazette/spiders/rs_porto_alegre.py b/processing/data_collection/gazette/spiders/rs_porto_alegre.py
index 76d9d7771..ff1aacf06 100644
--- a/processing/data_collection/gazette/spiders/rs_porto_alegre.py
+++ b/processing/data_collection/gazette/spiders/rs_porto_alegre.py
@@ -14,10 +14,6 @@ class RsPortoAlegreSpider(BaseGazetteSpider):
     start_urls = ["http://www2.portoalegre.rs.gov.br/dopa/"]
 
     def parse(self, response):
-        """
-        @url http://www2.portoalegre.rs.gov.br/dopa/
-        @returns requests 48
-        """
         selector = (
             '//ul[contains(@id, "menucss")]'
             '/descendant::*[contains(text(), "Diário Oficial {}")]'
@@ -31,11 +27,6 @@ def parse(self, response):
             yield scrapy.Request(url, self.parse_month_page)
 
     def parse_month_page(self, response):
-        """
-        @url http://www2.portoalegre.rs.gov.br/dopa/default.php?p_secao=1431
-        @returns items 58 58
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         links = response.css("#conteudo a")
         items = []
         for link in links:
diff --git a/processing/data_collection/gazette/spiders/sc_florianopolis.py b/processing/data_collection/gazette/spiders/sc_florianopolis.py
index 33e8f0806..3866cf891 100644
--- a/processing/data_collection/gazette/spiders/sc_florianopolis.py
+++ b/processing/data_collection/gazette/spiders/sc_florianopolis.py
@@ -16,13 +16,6 @@ class ScFlorianopolisSpider(BaseGazetteSpider):
     AVAILABLE_FROM = date(2015, 1, 1)  # actually from June/2009
 
     def start_requests(self):
-        """The City Hall website publish the gazettes in a page with a form
-        that allow users to browse through different years and months. This
-        form sends requests via POST, so this method emulates a series of these
-        POSTs.
-        @url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
-        @returns requests 1
-        """
         target = date.today()
         while target >= self.AVAILABLE_FROM:
             year, month = str(target.year), str(target.month)
@@ -31,11 +24,6 @@ def start_requests(self):
             target = target + relativedelta(months=1)
 
     def parse(self, response):
-        """Parse each page. Each list all gazettes for a given month.
-        @url http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial
-        @returns items 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         for link in response.css("ul.listagem li a"):
             url = self.get_pdf_url(response, link)
             if not url:
diff --git a/processing/data_collection/gazette/spiders/sp_campinas.py b/processing/data_collection/gazette/spiders/sp_campinas.py
index 0576788ec..e4ca3a73e 100644
--- a/processing/data_collection/gazette/spiders/sp_campinas.py
+++ b/processing/data_collection/gazette/spiders/sp_campinas.py
@@ -19,10 +19,6 @@ class SpCampinasSpider(BaseGazetteSpider):
     )
 
     def parse(self, response):
-        """
-        @url http://www.campinas.sp.gov.br/diario-oficial/index.php
-        @returns requests 4
-        """
         today = dt.date.today()
         next_year = today.year + 1
         for year in range(2015, next_year):
@@ -34,11 +30,6 @@ def parse(self, response):
             yield scrapy.Request(url, self.parse_month_page)
 
     def parse_month_page(self, response):
-        """
-        @url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
-        @returns items 23 23
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         items = []
         month_year = response.css(
             ".tabelaDiario:first-child tr th:nth-child(2)::text"
diff --git a/processing/data_collection/gazette/spiders/sp_franca.py b/processing/data_collection/gazette/spiders/sp_franca.py
index 94e2f88bd..1fc7764ae 100644
--- a/processing/data_collection/gazette/spiders/sp_franca.py
+++ b/processing/data_collection/gazette/spiders/sp_franca.py
@@ -19,10 +19,6 @@ class SpFrancaSpider(BaseGazetteSpider):
     documents_url = "http://www.franca.sp.gov.br/arquivos/diario-oficial/documentos/{}"
 
     def parse(self, response):
-        """
-        @url http://www.franca.sp.gov.br/pmf-diario/rest/diario/init
-        @returns requests 10
-        """
         dates = set(json.loads(response.body_as_unicode()))
 
         start_date = dt.date(2015, 1, 1)
@@ -35,11 +31,6 @@ def parse(self, response):
             start_date += delta
 
     def parse_document(self, response):
-        """
-        @url http://www.franca.sp.gov.br/pmf-diario/rest/diario/buscaPorArquivo/03-01-2018
-        @returns items 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         items = []
 
         document = json.loads(response.body_as_unicode())[0]
diff --git a/processing/data_collection/gazette/spiders/sp_guaruja.py b/processing/data_collection/gazette/spiders/sp_guaruja.py
index 50a76f0b4..8526e59c7 100644
--- a/processing/data_collection/gazette/spiders/sp_guaruja.py
+++ b/processing/data_collection/gazette/spiders/sp_guaruja.py
@@ -14,19 +14,11 @@ class SpGuaruja(BaseGazetteSpider):
     start_urls = ["http://www.guaruja.sp.gov.br/index.php/diario-oficial/"]
 
     def parse(self, response):
-        """
-        @url http://www.guaruja.sp.gov.br/index.php/diario-oficial/
-        @returns requests 26
-        """
         months = response.css("div.span12 a::attr(href)").extract()
         for month_url in months:
             yield scrapy.Request(month_url, self.parse_items)
 
     def parse_items(self, response):
-        """
-        @url http://www.guaruja.sp.gov.br/index.php/maio-2/maio2018/
-        @returns items 22 22
-        """
         gazettes = response.css("div.span12 p")
         for gazette in gazettes:
             date = gazette.css("a ::text").extract_first()
diff --git a/processing/data_collection/gazette/spiders/sp_guarulhos.py b/processing/data_collection/gazette/spiders/sp_guarulhos.py
index 2d337fbf7..fde2468c0 100644
--- a/processing/data_collection/gazette/spiders/sp_guarulhos.py
+++ b/processing/data_collection/gazette/spiders/sp_guarulhos.py
@@ -22,11 +22,6 @@ def start_requests(self):
         )
 
     def parse(self, response):
-        """
-        @url http://www.guarulhos.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018
-        @returns items 17 17
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         diarios = response.xpath('//div[contains(@id, "diario")]')
         items = []
         for diario in diarios:
diff --git a/processing/data_collection/gazette/spiders/sp_jundiai.py b/processing/data_collection/gazette/spiders/sp_jundiai.py
index 6dead5532..a6d1e84b0 100644
--- a/processing/data_collection/gazette/spiders/sp_jundiai.py
+++ b/processing/data_collection/gazette/spiders/sp_jundiai.py
@@ -25,11 +25,6 @@ def parse(self, response):
             yield response.follow(next_page_url, callback=self.parse)
 
     def parse_gazette(self, response):
-        """
-        @url https://imprensaoficial.jundiai.sp.gov.br/edicao-4403
-        @returns items 1 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         gazette_date = parse(
             response.css(".edicao-data::text").extract_first(""), languages=["pt"]
         ).date()
diff --git a/processing/data_collection/gazette/spiders/sp_santos.py b/processing/data_collection/gazette/spiders/sp_santos.py
index fc861075e..4c298a731 100644
--- a/processing/data_collection/gazette/spiders/sp_santos.py
+++ b/processing/data_collection/gazette/spiders/sp_santos.py
@@ -13,11 +13,6 @@ class SpSantosSpider(BaseGazetteSpider):
     download_url = "https://diariooficial.santos.sp.gov.br/edicoes/inicio/download/{}"
 
     def parse(self, response):
-        """
-        @url https://diariooficial.santos.sp.gov.br/
-        @returns items 1
-        @scrapes date file_urls is_extra_edition territory_id power scraped_at
-        """
         # all of the dates with gazettes are available inside the following hidden textarea:
         dates = response.css("#datas.hidden::text").extract_first()
 
diff --git a/processing/data_collection/gazette/spiders/to_palmas.py b/processing/data_collection/gazette/spiders/to_palmas.py
index eea392c7a..f39b6cb21 100644
--- a/processing/data_collection/gazette/spiders/to_palmas.py
+++ b/processing/data_collection/gazette/spiders/to_palmas.py
@@ -21,10 +21,6 @@ class ToPalmasSpider(BaseGazetteSpider):
     start_urls = ["http://diariooficial.palmas.to.gov.br/todos-diarios/"]
 
    def parse(self, response):
-        """
-        @url http://diariooficial.palmas.to.gov.br/todos-diarios/
-        @returns requests 142
-        """
         last_page_number_str = response.xpath(last_page_number_xpath).extract_first()
         last_page_number = int(last_page_number_str)
         for page_number in range(1, last_page_number + 1):
@@ -32,10 +28,6 @@ def parse(self, response):
             yield scrapy.Request(url=url, callback=self.parse_page)
 
     def parse_page(self, response):
-        """
-        @url http://diariooficial.palmas.to.gov.br/todos-diarios/?page=1
-        @returns items 14
-        """
         li_list = response.css("div.diario-content-todos > ul > li")
         for li in li_list:
             edicao, data = li.xpath('.//*[@id="audio-titulo"]/text()').re(
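Editor's note on the "alternatives" mentioned in the commit message: one possible shape for a replacement is a plain pytest test that replays a saved snapshot of a page through a spider's `parse` method, so the check no longer depends on the live site. The sketch below is illustrative only and is not part of the patch; the fixture path, the import path `gazette.spiders.al_maceio`, and the assertion are assumptions, not confirmed project conventions.

```python
# Hedged sketch of a pytest-based alternative to the removed Scrapy Contracts.
# Assumes a locally saved copy of the listing page at tests/fixtures/al_maceio.html
# and that AlMaceioSpider can be instantiated without extra arguments.
from pathlib import Path

from scrapy.http import HtmlResponse

from gazette.spiders.al_maceio import AlMaceioSpider


def test_parse_yields_gazettes_or_requests():
    # Replay a saved snapshot instead of hitting the live website.
    body = Path("tests/fixtures/al_maceio.html").read_bytes()
    response = HtmlResponse(
        url="http://www.maceio.al.gov.br/noticias/diario-oficial/",
        body=body,
        encoding="utf-8",
    )
    results = list(AlMaceioSpider().parse(response))
    assert results, "expected at least one item or follow-up request"
```

Unlike a contract, a test like this only changes when the fixture is regenerated, so it fails when the parsing logic breaks rather than when the remote site is temporarily unavailable.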