
Commit

Merge a6327e5 into 4b2ff8b
jpmckinney committed May 29, 2020
2 parents 4b2ff8b + a6327e5 commit 0923031
Showing 25 changed files with 174 additions and 81 deletions.
10 changes: 5 additions & 5 deletions kingfisher_scrapy/base_spider.py
@@ -129,14 +129,14 @@ def build_file_item(self, number, data, data_type, url, encoding, file_name):
})

def build_file_error_from_response(self, response, **kwargs):
file_error = {
item = FileError({
'url': response.request.url,
'errors': {'http_code': response.status},
}
})
if 'kf_filename' in response.request.meta:
file_error['file_name'] = response.request.meta['kf_filename']
file_error.update(kwargs)
return FileError(file_error)
item['file_name'] = response.request.meta['kf_filename']
item.update(kwargs)
return item

def _get_package_metadata(self, f, skip_key):
"""
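For context (this sketch is not part of the commit), a spider callback might use the refactored helper roughly as follows; the success branch and the exact arguments to build_file_from_response are illustrative:

    def parse(self, response):
        # Hypothetical callback: yield a File item on success, otherwise yield the
        # FileError item that build_file_error_from_response now returns directly.
        if self.is_http_success(response):
            yield self.build_file_from_response(response, response.request.meta['kf_filename'])
        else:
            yield self.build_file_error_from_response(response)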
6 changes: 5 additions & 1 deletion kingfisher_scrapy/exceptions.py
@@ -7,4 +7,8 @@ class AuthenticationError(KingfisherScrapyError):


class SpiderArgumentError(KingfisherScrapyError):
"""Raises when a spider argument's value is invalid"""
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""
2 changes: 1 addition & 1 deletion kingfisher_scrapy/extensions.py
@@ -40,7 +40,7 @@ def item_scraped(self, item, spider):
metadata = {
'url': item['url'],
'data_type': item['data_type'],
'encoding': item['encoding'],
'encoding': item.get('encoding', 'utf-8'),
}
self._write_file(path + '.fileinfo', metadata, spider)

48 changes: 41 additions & 7 deletions kingfisher_scrapy/items.py
@@ -1,9 +1,26 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

from kingfisher_scrapy.exceptions import MissingRequiredFieldError

class File(scrapy.Item):

class KingfisherItem(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

def validate(self):
"""
Raises an error if any required field is missing.
:raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
"""
if hasattr(self, 'required'):
for field in self.required:
if field not in self:
raise MissingRequiredFieldError(field)


class File(KingfisherItem):
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()
@@ -15,17 +32,34 @@ class File(scrapy.Item):
path = scrapy.Field()
files_store = scrapy.Field()

required = [
'file_name',
'url',
'data',
'data_type',
]


class FileItem(scrapy.Item):
class FileItem(KingfisherItem):
number = scrapy.Field()
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

required = [
'number',
'file_name',
'url',
'data',
'data_type',
]

class FileError(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()

class FileError(KingfisherItem):
errors = scrapy.Field()

required = [
'file_name',
'url',
'errors',
]
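To illustrate the new required-field validation (this example is not part of the commit; the field values are made up):

    from kingfisher_scrapy.exceptions import MissingRequiredFieldError
    from kingfisher_scrapy.items import FileError

    item = FileError({'url': 'http://example.com/1.json', 'errors': {'http_code': 500}})
    try:
        item.validate()  # 'file_name' is in FileError.required but missing here
    except MissingRequiredFieldError:  # also catchable as KeyError
        item['file_name'] = '1.json'
    item.validate()  # passes once all required fields are present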
5 changes: 0 additions & 5 deletions kingfisher_scrapy/middlewares.py
@@ -1,8 +1,3 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import logging
7 changes: 5 additions & 2 deletions kingfisher_scrapy/pipelines.py
@@ -2,8 +2,11 @@
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals


class KingfisherScrapyPipeline:
class Validate:
def process_item(self, item, spider):
item.validate()
if hasattr(item, 'validate'):
# We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
# as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
item.validate()

return item
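A minimal sketch of the pipeline's behaviour (not part of the commit; the item contents are illustrative): Kingfisher items are validated, while objects without a validate method pass through untouched.

    from kingfisher_scrapy.items import File
    from kingfisher_scrapy.pipelines import Validate

    pipeline = Validate()
    item = File({
        'file_name': 'example.json',
        'url': 'http://example.com/example.json',
        'data': b'{"releases": []}',
        'data_type': 'release_package',
    })
    assert pipeline.process_item(item, spider=None) is item  # valid item is returned unchanged
    pipeline.process_item({'not': 'an item'}, spider=None)   # plain dicts skip validation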
6 changes: 3 additions & 3 deletions kingfisher_scrapy/settings.py
@@ -75,9 +75,9 @@

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'kingfisher_scrapy.pipelines.KingfisherScrapyPipeline': 300,
#}
ITEM_PIPELINES = {
'kingfisher_scrapy.pipelines.Validate': 300,
}

# To send items to Kingfisher Process, set this to, for example, "http://kingfisher.example.com" (no trailing slash).
KINGFISHER_API_URI = os.getenv('KINGFISHER_API_URI')
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -34,4 +34,4 @@ def parse_list(self, response):
data = json.loads(response.text)
for resource in data['result']['resources']:
if resource['format'].upper() == 'JSON':
yield scrapy.Request(url=resource['url'])
yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]})
17 changes: 12 additions & 5 deletions kingfisher_scrapy/spiders/australia_nsw.py
@@ -17,41 +17,48 @@ def start_requests(self):
for release_type in release_types:
yield scrapy.Request(
url.format(release_type, page_limit),
meta={'release_type': release_type},
meta={
'kf_filename': '{}.json'.format(release_type),
'release_type': release_type,
},
callback=self.parse_list
)

def parse_list(self, response):
if self.is_http_success(response):

json_data = json.loads(response.text)
release_type = response.request.meta['release_type']

# More Pages?
if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \
and not self.sample:
yield scrapy.Request(
json_data['links']['next'],
meta={'release_type': response.request.meta['release_type']},
meta={
'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json',
'release_type': release_type,
},
callback=self.parse_list
)

# Data?
for release in json_data['releases']:
if response.request.meta['release_type'] == 'planning':
if release_type == 'planning':
uuid = release['tender']['plannedProcurementUUID']
yield scrapy.Request(
'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid,
meta={'kf_filename': 'plannning-%s.json' % uuid},
callback=self.parse
)
if response.request.meta['release_type'] == 'tender':
if release_type == 'tender':
uuid = release['tender']['RFTUUID']
yield scrapy.Request(
'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid,
meta={'kf_filename': 'tender-%s.json' % uuid},
callback=self.parse
)
if response.request.meta['release_type'] == 'contract':
if release_type == 'contract':
for award in release['awards']:
uuid = award['CNUUID']
yield scrapy.Request(
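The spider changes in this commit add a kf_filename to each request's meta so downstream code can name files; two conventions recur, illustrated here with made-up URLs:

    import hashlib

    # Paginated or list requests: derive a stable name from a hash of the URL.
    list_url = 'https://example.com/api/releases?page=2'
    hashed_name = hashlib.md5(list_url.encode('utf-8')).hexdigest() + '.json'

    # Direct file downloads: reuse the last path segment of the URL.
    file_url = 'https://example.com/files/2020-01.json'
    path_name = file_url.rsplit('/', 1)[-1]  # '2020-01.json'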
14 changes: 6 additions & 8 deletions kingfisher_scrapy/spiders/chile_base.py
@@ -25,16 +25,14 @@ def get_year_month_until(self):
until_month = 12 if self.start_year != datetime.datetime.now().year else until_month
return until_year, until_month

def get_sample_request(self):
return scrapy.Request(
url=self.base_list_url.format(2017, 10, 0, 10),
meta={'year': 2017, 'month': 10}
)

def start_requests(self):
if self.sample:
yield self.get_sample_request()
yield scrapy.Request(
url=self.base_list_url.format(2017, 10, 0, 10),
meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10},
)
return

until_year, until_month = self.get_year_month_until()
for year in range(self.start_year, until_year):
for month in range(1, 13):
@@ -43,7 +41,7 @@ def start_requests(self):
break
yield scrapy.Request(
url=self.base_list_url.format(year, month, 0, self.limit),
meta={'year': year, 'month': month}
meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month},
)

def base_parse(self, response, package_type):
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/colombia_bulk.py
@@ -26,7 +26,8 @@ class ColombiaBulk(ZipSpider):
def start_requests(self):
yield scrapy.Request(
url='https://www.colombiacompra.gov.co/transparencia/datos-json',
callback=self.parse_list
meta={'kf_filename': 'list.html'},
callback=self.parse_list,
)

@handle_error
9 changes: 8 additions & 1 deletion kingfisher_scrapy/spiders/digiwhist_base.py
@@ -1,14 +1,21 @@
import tarfile
from io import BytesIO

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider
from kingfisher_scrapy.util import handle_error


class DigiwhistBase(BaseSpider):
def start_requests(self):
# See scrapy.spiders.Spider.start_requests
for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, meta={'kf_filename': 'file.tar.gz'})

@handle_error
def parse(self, response):
yield self.build_file_from_response(response, 'file.tar.gz', post_to_api=False)
yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False)

# Load a line at a time, pass it to the API
with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar:
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -15,11 +15,14 @@ class DominicanRepublic(BaseSpider):
}

def start_requests(self):
yield scrapy.Request('https://www.dgcp.gob.do/estandar-mundial-ocds/',
callback=self.parse_main_page)
yield scrapy.Request(
'https://www.dgcp.gob.do/estandar-mundial-ocds/',
meta={'kf_filename': 'list.html'},
callback=self.parse_list,
)

@handle_error
def parse_main_page(self, response):
def parse_list(self, response):
urls = response.css('.fileLink::attr(href)').getall()
json_urls = list(filter(lambda x: '/JSON_DGCP_' in x, urls))

@@ -28,7 +31,7 @@ def parse_main_page(self, response):

for url in json_urls:
if '/JSON_DGCP_' in url:
yield scrapy.Request('https:' + url)
yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

def parse(self, response):
if self.is_http_success(response):
8 changes: 5 additions & 3 deletions kingfisher_scrapy/spiders/france.py
@@ -13,11 +13,12 @@ class France(BaseSpider):
def start_requests(self):
yield scrapy.Request(
url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
callback=self.parse_item
meta={'kf_filename': 'list.json'},
callback=self.parse_list,
)

@handle_error
def parse_item(self, response):
def parse_list(self, response):
json_data = json.loads(response.text)
data = json_data['data']
for item in data:
@@ -40,7 +41,8 @@ def parse_item(self, response):
if next_page:
yield scrapy.Request(
next_page,
callback=self.parse_item
meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'},
callback=self.parse_list
)

@handle_error
7 changes: 6 additions & 1 deletion kingfisher_scrapy/spiders/honduras_cost.py
@@ -8,7 +8,12 @@

class HondurasCoST(BaseSpider):
name = 'honduras_cost'
start_urls = ['http://app.sisocs.org/protected/ocdsShow/']

def start_requests(self):
yield scrapy.Request(
'http://app.sisocs.org/protected/ocdsShow/',
meta={'kf_filename': 'list.html'},
)

@handle_error
def parse(self, response):
8 changes: 6 additions & 2 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -8,11 +8,15 @@

class HondurasONCAE(ZipSpider):
name = 'honduras_oncae'
start_urls = ['http://oncae.gob.hn/datosabiertos']

# the files take too long to be downloaded, so we increase the download timeout
download_timeout = 900

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
meta={'kf_filename': 'list.html'},
)

@handle_error
def parse(self, response):
urls = response.css(".article-content ul")\
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -13,19 +13,22 @@ class HondurasPortalBulkFiles(BaseSpider):
def start_requests(self):
yield scrapy.Request(
'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
callback=self.parse_json_list
meta={'kf_filename': 'list.json'},
callback=self.parse_list,
)

@handle_error
def parse_json_list(self, response):
def parse_list(self, response):
filelist = json.loads(response.text)

if self.sample:
yield scrapy.Request(filelist[0]['urls']['json'])
url = filelist[0]['urls']['json']
yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

else:
for item in filelist:
yield scrapy.Request(item['urls']['json'])
url = item['urls']['json']
yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})

def parse(self, response):
filename = urlparse(response.request.url).path.split('/')[-2]
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -39,6 +39,7 @@ def parse_data(self, response):
if next_page_url:
yield scrapy.Request(
next_page_url,
meta={'kf_filename': next_page_url.rsplit('/', 1)[-1] + '.json'},
callback=self.parse_data
)
