From b27b86880d797031c06542f7ae9816b6ff8bbaaa Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Sat, 30 May 2020 13:04:52 -0400
Subject: [PATCH] Remove keyword for positional "url" argument in scrapy.Request
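
`url` is the first positional parameter of scrapy.Request, so passing it by
keyword adds noise without adding clarity. A representative before/after
(adapted from the base_spider.py docstring touched in this patch):

    # Before
    yield scrapy.Request(
        url='https://example.com/api/packages.json',
        meta={'kf_filename': 'page1.json'}
    )

    # After
    yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})

Scrapy binds the first positional argument to "url" either way, so behavior
is unchanged; multi-line calls are collapsed to one line where they now fit.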
---
 docs/writing-spiders.rst                          |  2 +-
 kingfisher_scrapy/base_spider.py                  |  5 +----
 kingfisher_scrapy/spiders/afghanistan_records.py  |  7 ++-----
 kingfisher_scrapy/spiders/afghanistan_releases.py |  9 +++------
 .../spiders/argentina_buenos_aires.py             |  2 +-
 kingfisher_scrapy/spiders/argentina_vialidad.py   |  2 +-
 kingfisher_scrapy/spiders/armenia.py              | 10 ++--------
 kingfisher_scrapy/spiders/australia.py            |  8 ++++----
 kingfisher_scrapy/spiders/canada_buyandsell.py    |  8 ++++----
 kingfisher_scrapy/spiders/canada_montreal.py      |  4 ++--
 kingfisher_scrapy/spiders/chile_base.py           | 10 +++++-----
 kingfisher_scrapy/spiders/colombia.py             |  5 +----
 kingfisher_scrapy/spiders/colombia_bulk.py        |  2 +-
 kingfisher_scrapy/spiders/france.py               |  2 +-
 kingfisher_scrapy/spiders/georgia_opendata.py     |  5 +----
 kingfisher_scrapy/spiders/georgia_records.py      |  5 +----
 kingfisher_scrapy/spiders/georgia_releases.py     |  5 +----
 .../mexico_administracion_publica_federal.py      |  4 ++--
 kingfisher_scrapy/spiders/mexico_cdmx.py          |  4 ++--
 .../spiders/mexico_grupo_aeroporto.py             |  2 +-
 kingfisher_scrapy/spiders/mexico_inai.py          |  6 +++---
 kingfisher_scrapy/spiders/mexico_jalisco.py       |  6 +++---
 kingfisher_scrapy/spiders/moldova.py              |  6 +++---
 kingfisher_scrapy/spiders/moldova_old.py          |  4 ++--
 kingfisher_scrapy/spiders/moldova_records.py      |  2 +-
 kingfisher_scrapy/spiders/moldova_releases.py     |  2 +-
 kingfisher_scrapy/spiders/openopps.py             |  9 +++------
 kingfisher_scrapy/spiders/paraguay_hacienda.py    |  4 ++--
 kingfisher_scrapy/spiders/portugal.py             |  2 +-
 kingfisher_scrapy/spiders/scotland.py             |  4 ++--
 kingfisher_scrapy/spiders/test_fail.py            | 14 ++++----------
 kingfisher_scrapy/spiders/uk_contracts_finder.py  | 10 ++--------
 32 files changed, 64 insertions(+), 106 deletions(-)

diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst
index a41487df..aeb9b91c 100644
--- a/docs/writing-spiders.rst
+++ b/docs/writing-spiders.rst
@@ -60,7 +60,7 @@ Here is a sample:
 
         def start_requests(self):
             # This API only has one URL to get. Make a request for that, and set a filename
             yield scrapy.Request(
-                url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
+                'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
                 meta={'kf_filename': '13-14.json'}
             )
diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index a1845d6f..2c775c66 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -274,10 +274,7 @@ class MySpider(LinksSpider):
         data_type = 'release_package'
 
         def start_requests(self):
-            yield scrapy.Request(
-                url='https://example.com/api/packages.json',
-                meta={'kf_filename': 'page1.json'}
-            )
+            yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})
     """
 
     @handle_error
diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py
index c25524be..90ad82cf 100644
--- a/kingfisher_scrapy/spiders/afghanistan_records.py
+++ b/kingfisher_scrapy/spiders/afghanistan_records.py
@@ -12,7 +12,7 @@ class AfghanistanRecords(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://ocds.ageops.net/api/ocds/records',
+            'https://ocds.ageops.net/api/ocds/records',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
@@ -24,10 +24,7 @@ def parse_list(self, response):
             files_urls = [files_urls[0]]
 
         for file_url in files_urls:
-            yield scrapy.Request(
-                url=file_url,
-                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
-            )
+            yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
 
     @handle_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py
index 67e999d9..60f8b34a 100644
--- a/kingfisher_scrapy/spiders/afghanistan_releases.py
+++ b/kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -12,7 +12,7 @@ class AfghanistanReleases(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://ocds.ageops.net/api/ocds/releases/dates',
+            'https://ocds.ageops.net/api/ocds/releases/dates',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
@@ -25,7 +25,7 @@ def parse_list(self, response):
 
         for file_url in files_urls:
             yield scrapy.Request(
-                url=file_url,
+                file_url,
                 meta={'kf_filename': file_url.split('/')[-1] + '.json'},
                 callback=self.parse_release_list
             )
@@ -37,10 +37,7 @@ def parse_release_list(self, response):
             files_urls = [files_urls[0]]
 
         for file_url in files_urls:
-            yield scrapy.Request(
-                url=file_url,
-                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
-            )
+            yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
 
     @handle_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index f25dc11f..10dadbf5 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -25,7 +25,7 @@ class ArgentinaBuenosAires(ZipSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras',
+            'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py
index e16a83c5..7f199d0a 100644
--- a/kingfisher_scrapy/spiders/argentina_vialidad.py
+++ b/kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -9,7 +9,7 @@ class ArgentinaVialidad(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://datosabiertos.vialidad.gob.ar/api/ocds/package/all',
+            'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all',
             meta={'kf_filename': 'all.json'}
         )
 
diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py
index 790a9b82..6fcad702 100644
--- a/kingfisher_scrapy/spiders/armenia.py
+++ b/kingfisher_scrapy/spiders/armenia.py
@@ -11,10 +11,7 @@ class Armenia(BaseSpider):
     name = 'armenia'
 
     def start_requests(self):
-        yield scrapy.Request(
-            url='https://armeps.am/ocds/release',
-            meta={'kf_filename': 'page1.json'}
-        )
+        yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'})
 
     @handle_error
     def parse(self, response):
@@ -24,7 +21,4 @@ def parse(self, response):
         if not (self.sample):
             if 'next_page' in json_data and 'uri' in json_data['next_page']:
                 url = json_data['next_page']['uri']
-                yield scrapy.Request(
-                    url=url,
-                    meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'}
-                )
+                yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'})
diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py
index d17f023e..a6f1da85 100644
--- a/kingfisher_scrapy/spiders/australia.py
+++ b/kingfisher_scrapy/spiders/australia.py
@@ -10,17 +10,17 @@ class Australia(LinksSpider):
     data_type = 'release_package'
 
     def start_requests(self):
+        url_prefix = 'https://api.tenders.gov.au/ocds/findByDates/contractPublished/'
+
         if self.sample:
             yield scrapy.Request(
-                url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/2018-01-01T00:00:00Z/2018-12-31T23'
-                    ':59:59Z',
+                url_prefix + '2018-01-01T00:00:00Z/2018-12-31T23:59:59Z',
                 meta={'kf_filename': 'year-2018.json'}
             )
         else:
             current_year = datetime.datetime.now().year + 1
             for year in range(2004, current_year):
                 yield scrapy.Request(
-                    url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/'
-                        '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year),
+                    url_prefix + '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year),
                     meta={'kf_filename': 'year-{}.json'.format(year)}
                 )
diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py
index 102630fa..cdb9ff15 100644
--- a/kingfisher_scrapy/spiders/canada_buyandsell.py
+++ b/kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -9,21 +9,21 @@ class CanadaBuyAndSell(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
             meta={'kf_filename': '13-14.json'}
         )
         if self.sample:
             return
         yield scrapy.Request(
-            url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json',
             meta={'kf_filename': '14-15.json'}
         )
         yield scrapy.Request(
-            url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json',
             meta={'kf_filename': '15-16.json'}
         )
         yield scrapy.Request(
-            url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
+            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json',
             meta={'kf_filename': '16-17.json'}
         )
diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py
index 6a431ee3..126a12b8 100644
--- a/kingfisher_scrapy/spiders/canada_montreal.py
+++ b/kingfisher_scrapy/spiders/canada_montreal.py
@@ -12,7 +12,7 @@ class CanadaMontreal(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit,
+            'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit,
             meta={'kf_filename': 'page0.json'}
         )
 
@@ -30,7 +30,7 @@ def parse(self, response):
                 url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % \
                     (self.page_limit, offset)
                 yield scrapy.Request(
-                    url=url,
+                    url,
                     meta={'kf_filename': 'page' + str(offset) + '.json'}
                 )
                 offset += self.page_limit
diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index abf62278..aca03da6 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -28,7 +28,7 @@ def get_year_month_until(self):
     def start_requests(self):
         if self.sample:
             yield scrapy.Request(
-                url=self.base_list_url.format(2017, 10, 0, 10),
+                self.base_list_url.format(2017, 10, 0, 10),
                 meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10},
             )
             return
@@ -40,7 +40,7 @@ def start_requests(self):
                 if (until_year - 1) == year and month > until_month:
                     break
                 yield scrapy.Request(
-                    url=self.base_list_url.format(year, month, 0, self.limit),
+                    self.base_list_url.format(year, month, 0, self.limit),
                     meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month},
                 )
 
@@ -51,7 +51,7 @@ def base_parse(self, response, package_type):
         for data_item in data['data']:
             if package_type == 'record':
                 yield_list.append(scrapy.Request(
-                    url=self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''),
+                    self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''),
                     meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], package_type)}
                 ))
             else:
@@ -67,7 +67,7 @@ def base_parse(self, response, package_type):
                 if 'url' in stage:
                     name = stage.replace('url', '')
                     yield_list.append(scrapy.Request(
-                        url=data_item[stage],
+                        data_item[stage],
                         meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], name)}
                     ))
         if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']:
@@ -75,7 +75,7 @@ def base_parse(self, response, package_type):
             month = response.request.meta['month']
             offset = data['pagination']['offset']
             yield_list.append(scrapy.Request(
-                url=self.base_list_url.format(year, month, self.limit + offset, self.limit),
+                self.base_list_url.format(year, month, self.limit + offset, self.limit),
                 meta={'year': year, 'month': month}
             ))
         return yield_list
diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index e4d4b075..855c95d9 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -21,10 +21,7 @@ def start_requests(self):
         start_page = 1
         if hasattr(self, 'page'):
             start_page = int(self.page)
-        yield scrapy.Request(
-            url=base_url % start_page,
-            meta={'kf_filename': 'page{}.json'.format(start_page)}
-        )
+        yield scrapy.Request(base_url % start_page, meta={'kf_filename': 'page{}.json'.format(start_page)})
 
     def parse(self, response):
         # In Colombia, every day at certain hour they run a process in their system that drops the database and make
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index f0e571c0..2523f8bc 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -27,7 +27,7 @@ class ColombiaBulk(ZipSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://www.colombiacompra.gov.co/transparencia/datos-json',
+            'https://www.colombiacompra.gov.co/transparencia/datos-json',
             meta={'kf_filename': 'list.html'},
             callback=self.parse_list,
         )
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index bb2f702c..e71c7805 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -12,7 +12,7 @@ class France(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
+            'https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list,
         )
diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py
index de800b5a..48306d10 100644
--- a/kingfisher_scrapy/spiders/georgia_opendata.py
+++ b/kingfisher_scrapy/spiders/georgia_opendata.py
@@ -12,7 +12,4 @@ class GeorgiaOpenData(ZipSpider):
     download_timeout = 1200  # 20min
 
     def start_requests(self):
-        yield scrapy.Request(
-            url='http://opendata.spa.ge/json/allTenders.zip',
-            meta={'kf_filename': 'all.json'}
-        )
+        yield scrapy.Request('http://opendata.spa.ge/json/allTenders.zip', meta={'kf_filename': 'all.json'})
diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py
index fc257370..60438bbc 100644
--- a/kingfisher_scrapy/spiders/georgia_records.py
+++ b/kingfisher_scrapy/spiders/georgia_records.py
@@ -8,7 +8,4 @@ class GeorgiaRecords(LinksSpider):
     data_type = 'record_package'
 
     def start_requests(self):
-        yield scrapy.Request(
-            url='https://odapi.spa.ge/api/records.json',
-            meta={'kf_filename': 'page1.json'}
-        )
+        yield scrapy.Request('https://odapi.spa.ge/api/records.json', meta={'kf_filename': 'page1.json'})
diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py
index fd0ed606..6cf0263d 100644
--- a/kingfisher_scrapy/spiders/georgia_releases.py
+++ b/kingfisher_scrapy/spiders/georgia_releases.py
@@ -8,7 +8,4 @@ class GeorgiaReleases(LinksSpider):
     data_type = 'release_package'
 
     def start_requests(self):
-        yield scrapy.Request(
-            url='https://odapi.spa.ge/api/releases.json',
-            meta={'kf_filename': 'page1.json'}
-        )
+        yield scrapy.Request('https://odapi.spa.ge/api/releases.json', meta={'kf_filename': 'page1.json'})
diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py
index ea75d13b..8d1b98f8 100644
--- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py
+++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py
@@ -14,7 +14,7 @@ class MexicoAdministracionPublicaFederal(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://api.datos.gob.mx/v1/contratacionesabiertas',
+            'https://api.datos.gob.mx/v1/contratacionesabiertas',
             meta={'kf_filename': 'page1.json'}
         )
 
@@ -32,7 +32,7 @@ def parse(self, response):
             limit = data['pagination']['pageSize']
             while ((page - 1) * limit) < total:
                 yield scrapy.Request(
-                    url='https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page,
+                    'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page,
                     meta={'kf_filename': 'page' + str(page) + '.json'}
                 )
                 page += 1
diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py
index bc4d35a0..326a7d2b 100644
--- a/kingfisher_scrapy/spiders/mexico_cdmx.py
+++ b/kingfisher_scrapy/spiders/mexico_cdmx.py
@@ -11,7 +11,7 @@ class MexicoCDMXSource(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos',
+            'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
@@ -24,7 +24,7 @@ def parse_list(self, response):
 
         for data_item in data:
             yield scrapy.Request(
-                url=data_item['uri'],
+                data_item['uri'],
                 meta={'kf_filename': 'id%s.json' % data_item['id']},
                 callback=self.parse_record
             )
diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py
index f08ff741..6c68226a 100644
--- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py
+++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py
@@ -9,7 +9,7 @@ class MexicoGrupoAeroporto(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json',
+            'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json',
             meta={'kf_filename': 'concentrado05032019RELEASE.json'}
         )
 
diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py
index efb52d12..6289634d 100644
--- a/kingfisher_scrapy/spiders/mexico_inai.py
+++ b/kingfisher_scrapy/spiders/mexico_inai.py
@@ -12,7 +12,7 @@ class MexicoINAI(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500',
+            'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
@@ -25,7 +25,7 @@ def parse_list(self, response):
                 if resource['format'] == 'JSON':
                     kf_filename = 'redirect-' + hashlib.md5(resource['url'].encode('utf-8')).hexdigest() + '.json'
                     yield scrapy.Request(
-                        url=resource['url'],
+                        resource['url'],
                         meta={
                             'kf_filename': kf_filename,
                             'dont_redirect': True
@@ -37,7 +37,7 @@ def parse_redirect(self, response):
         if response.status == 301:
             url = response.headers['Location'].decode("utf-8").replace("open?", "uc?export=download&")
             yield scrapy.Request(
-                url=url,
+                url,
                 meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'},
                 callback=self.parse
             )
diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py
index c34d56bc..4076a0c8 100644
--- a/kingfisher_scrapy/spiders/mexico_jalisco.py
+++ b/kingfisher_scrapy/spiders/mexico_jalisco.py
@@ -12,7 +12,7 @@ class MexicoJalisco(BaseSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts',
+            'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts',
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
@@ -24,7 +24,7 @@ def parse_list(self, response):
             datas = [datas[0]]
         for data in datas:
             yield scrapy.Request(
-                url=data['URIContract'],
+                data['URIContract'],
                 meta={'kf_filename': 'id%s.json' % data['ocid']},
                 callback=self.parse_record_package
             )
@@ -35,7 +35,7 @@ def parse_record_package(self, response):
         if 'packages' in json_data:
             for url in json_data['packages']:
                 yield scrapy.Request(
-                    url=url,
+                    url,
                     meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()},
                     callback=self.parse_release_package
                 )
diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py
index db962e72..fb146a61 100644
--- a/kingfisher_scrapy/spiders/moldova.py
+++ b/kingfisher_scrapy/spiders/moldova.py
@@ -20,7 +20,7 @@ class Moldova(BaseSpider):
 
     def start_requests(self):
         for endpoint, url in self.endpoints.items():
             yield scrapy.Request(
-                url=url,
+                url,
                 meta={'kf_filename': 'meta-{}-start.json'.format(endpoint), 'endpoint': endpoint, 'data': False}
             )
@@ -41,7 +41,7 @@ def parse(self, response):
 
         for data in json_data.get('data', []):
             yield scrapy.Request(
-                url=endpoint_url + data['ocid'],
+                endpoint_url + data['ocid'],
                 meta={
                     'kf_filename': 'data-{}-{}.json'.format(endpoint, data['ocid']),
                     'endpoint': endpoint,
@@ -53,7 +53,7 @@ def parse(self, response):
             return
 
         yield scrapy.Request(
-            url=endpoint_url + '?offset=' + offset,
+            endpoint_url + '?offset=' + offset,
             meta={
                 'kf_filename': 'meta-{}-{}.json'.format(endpoint, offset),
                 'endpoint': endpoint,
diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py
index 7282e2fe..6b5f31f9 100644
--- a/kingfisher_scrapy/spiders/moldova_old.py
+++ b/kingfisher_scrapy/spiders/moldova_old.py
@@ -10,13 +10,13 @@ class MoldovaOld(BaseSpider):
 
     def start_requests(self):
         if self.sample:
             yield scrapy.Request(
-                url='http://opencontracting.date.gov.md/ocds-api/year/2017',
+                'http://opencontracting.date.gov.md/ocds-api/year/2017',
                 meta={'kf_filename': 'sample.json'}
             )
         else:
             for year in range(2012, 2018):
                 yield scrapy.Request(
-                    url='http://opencontracting.date.gov.md/ocds-api/year/%d' % year,
+                    'http://opencontracting.date.gov.md/ocds-api/year/%d' % year,
                     meta={'kf_filename': 'year-%d.json' % year}
                 )
diff --git a/kingfisher_scrapy/spiders/moldova_records.py b/kingfisher_scrapy/spiders/moldova_records.py
index 669ee79f..3d6bc5a1 100644
--- a/kingfisher_scrapy/spiders/moldova_records.py
+++ b/kingfisher_scrapy/spiders/moldova_records.py
@@ -9,6 +9,6 @@ class MoldovaRecords(LinksSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='http://ocds.mepps.openprocurement.io/api/records.json',
+            'http://ocds.mepps.openprocurement.io/api/records.json',
             meta={'kf_filename': 'page1.json'}
         )
diff --git a/kingfisher_scrapy/spiders/moldova_releases.py b/kingfisher_scrapy/spiders/moldova_releases.py
index aff80466..81da3a56 100644
--- a/kingfisher_scrapy/spiders/moldova_releases.py
+++ b/kingfisher_scrapy/spiders/moldova_releases.py
@@ -9,6 +9,6 @@ class MoldovaReleases(LinksSpider):
 
     def start_requests(self):
         yield scrapy.Request(
-            url='http://ocds.mepps.openprocurement.io/api/releases.json',
+            'http://ocds.mepps.openprocurement.io/api/releases.json',
             meta={'kf_filename': 'page1.json'}
         )
diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index 3179f006..a4e88378 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -66,7 +66,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
     def start_requests(self):
         """ Start requesting access token """
         yield scrapy.Request(
-            url="https://api.openopps.com/api/api-token-auth/",
+            'https://api.openopps.com/api/api-token-auth/',
             method='POST',
             headers={"Accept": "*/*", "Content-Type": "application/json"},
             body=json.dumps({"username": self.username, "password": self.password}),
@@ -131,10 +131,7 @@ def start_requests_pages(self):
 
     def request_range(self, start_date, end_date, search_h):
         return scrapy.Request(
-            url=self.base_page_url.format(
-                start_date,
-                end_date
-            ),
+            self.base_page_url.format(start_date, end_date),
             headers={"Accept": "*/*", "Content-Type": "application/json"},
             meta={"release_date": start_date, "search_h": search_h},
         )
@@ -190,7 +187,7 @@ def parse(self, response):
                     self.logger.info('Time_diff: {}'.format(time_diff.total_seconds()))
                     self.reauthenticating = True
                     yield scrapy.Request(
-                        url="https://api.openopps.com/api/api-token-auth/",
+                        'https://api.openopps.com/api/api-token-auth/',
                         method='POST',
                         headers={"Accept": "*/*", "Content-Type": "application/json"},
                         body=json.dumps({"username": self.username, "password": self.password}),
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py
index af28946a..d10ebb98 100644
--- a/kingfisher_scrapy/spiders/paraguay_hacienda.py
+++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -63,7 +63,7 @@ def parse(self, response):
                 total_pages = data['meta']['totalPages']
                 for page in range(2, total_pages+1):
                     yield scrapy.Request(
-                        url=self.base_list_url.format(page),
+                        self.base_list_url.format(page),
                         meta={
                             'kf_filename': 'list-{}.json'.format(page),
                             'meta': True,
@@ -83,7 +83,7 @@ def parse(self, response):
                 if row['idLlamado'] and row['idLlamado'] not in self.release_ids:
                     self.release_ids.append(row['idLlamado'])
                     yield scrapy.Request(
-                        url=base_url.format(row['idLlamado']),
+                        base_url.format(row['idLlamado']),
                         meta={
                             'kf_filename': 'release-{}.json'.format(row['idLlamado']),
                             'meta': False,
diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py
index 1e1679c9..7a3b5f5c 100644
--- a/kingfisher_scrapy/spiders/portugal.py
+++ b/kingfisher_scrapy/spiders/portugal.py
@@ -21,7 +21,7 @@ def start_requests(self):
         id = '5ae97fa2c8d8c915d5faa3bf'
         page_size = 20
         yield scrapy.Request(
-            url=url.format(id, page_size),
+            url.format(id, page_size),
             meta={'kf_filename': 'list.json'},
             callback=self.parse_list
         )
diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py
index 4ec9c978..18151046 100644
--- a/kingfisher_scrapy/spiders/scotland.py
+++ b/kingfisher_scrapy/spiders/scotland.py
@@ -40,7 +40,7 @@ def start_requests(self):
         if self.sample:
             marker = now - datetime.timedelta(days=14)
             for notice_type in self.notice_types:
-                yield scrapy.Request(url=format_string.format(marker, notice_type),
+                yield scrapy.Request(format_string.format(marker, notice_type),
                                      meta={'kf_filename': 'sample_{}.json'.format(notice_type)})
         else:
             # It's meant to go back a year, but in testing it seemed to be year minus one day!
@@ -48,7 +48,7 @@ def start_requests(self):
             while marker <= now:
                 datestring = '{:04d}-{:02d}-{:02d}'.format(marker.year, marker.month, marker.day)
                 for notice_type in self.notice_types:
-                    yield scrapy.Request(url=format_string.format(datestring, notice_type),
+                    yield scrapy.Request(format_string.format(datestring, notice_type),
                                          meta={'kf_filename': '{}_type_{}.json'.format(datestring, notice_type)})
                 marker = marker + datetime.timedelta(days=14)
diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py
index 7de61060..be553c5d 100644
--- a/kingfisher_scrapy/spiders/test_fail.py
+++ b/kingfisher_scrapy/spiders/test_fail.py
@@ -13,24 +13,18 @@ class TestFail(BaseSpider):
     def start_requests(self):
         # Fine
         yield scrapy.Request(
-            url='https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json',  # noqa: E501
+            'https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json',  # noqa: E501
             meta={'kf_filename': 'fine.json'}
         )
         # A straight 404
         yield scrapy.Request(
-            url='https://www.open-contracting.org/i-want-a-kitten',
+            'https://www.open-contracting.org/i-want-a-kitten',
             meta={'kf_filename': 'http-404.json'}
         )
         # I broke the server ....
-        yield scrapy.Request(
-            url='http://httpstat.us/500',
-            meta={'kf_filename': 'http-500.json'}
-        )
+        yield scrapy.Request('http://httpstat.us/500', meta={'kf_filename': 'http-500.json'})
         # .... but actually, yes, I also broke the Proxy too
-        yield scrapy.Request(
-            url='http://httpstat.us/502',
-            meta={'kf_filename': 'http-502.json'}
-        )
+        yield scrapy.Request('http://httpstat.us/502', meta={'kf_filename': 'http-502.json'})
 
     @handle_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py
index 7bd194f2..333cc89c 100644
--- a/kingfisher_scrapy/spiders/uk_contracts_finder.py
+++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -11,10 +11,7 @@ class UKContractsFinder(BaseSpider):
     base_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d'
 
     def start_requests(self):
-        yield scrapy.Request(
-            url=self.base_url % 1,
-            meta={'kf_filename': 'page1.json'}
-        )
+        yield scrapy.Request(self.base_url % 1, meta={'kf_filename': 'page1.json'})
 
     @handle_error
     def parse(self, response):
@@ -28,7 +25,4 @@ def parse(self, response):
         json_data = json.loads(response.text)
         last_page = json_data['maxPage']
         for page in range(1, last_page + 1):
-            yield scrapy.Request(
-                url=self.base_url % page,
-                meta={'kf_filename': 'page%d.json' % page}
-            )
+            yield scrapy.Request(self.base_url % page, meta={'kf_filename': 'page%d.json' % page})