
Commit

Merge 800b655 into 9dde2cf
yolile committed Nov 17, 2020
2 parents 9dde2cf + 800b655 commit 132f5d3
Showing 8 changed files with 220 additions and 225 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
@@ -19,7 +19,9 @@ jobs:
- run: curl -s -S --retry 3 $BASEDIR/tests/install.sh | bash -
- run: pip install -r requirements_dev.txt
- run: curl -s -S --retry 3 $BASEDIR/tests/script.sh | bash -
- run: python -m pytest --cov kingfisher_scrapy tests
# Specify tests/ directory to avoid "ModuleNotFoundError: No module named 'sphinx_rtd_theme'" from docs/conf.py.
- run: pytest --cov kingfisher_scrapy tests
if: always()
- env:
COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
run: coveralls
79 changes: 40 additions & 39 deletions kingfisher_scrapy/extensions.py
@@ -3,7 +3,6 @@
import json
import os

import requests
import sentry_sdk
from scrapy import signals
from scrapy.exceptions import NotConfigured
@@ -146,17 +145,16 @@ def spider_closed(self, spider, reason):
Sends an API request to end the collection's store step.
"""
# https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
if reason != 'finished' or spider.pluck or spider.keep_collection_open:
if reason not in ('finished', 'sample') or spider.pluck or spider.keep_collection_open:
return

response = self.client.end_collection_store({
data = {
'collection_source': spider.name,
'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
'collection_sample': bool(spider.sample),
})
'collection_sample': str(bool(spider.sample)),
}

if not response.ok:
spider.logger.warning('Failed to post End Collection Store. API status code: %s', response.status_code)
return self._request(spider, 'end_collection_store', data['collection_source'], data)

def item_scraped(self, item, spider):
"""
@@ -169,48 +167,51 @@ def item_scraped(self, item, spider):
data = {
'collection_source': spider.name,
'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
'collection_sample': bool(spider.sample),
'collection_sample': str(bool(spider.sample)),
'file_name': item['file_name'],
'url': item['url'],
}

if isinstance(item, FileError):
data['errors'] = json.dumps(item['errors'])

self._request(item, spider, 'create_file_error', 'File Error API', data)
else:
data['data_type'] = item['data_type']
data['encoding'] = item.get('encoding', 'utf-8')
if spider.note:
data['collection_note'] = spider.note
return self._request(spider, 'create_file_error', item['url'], data)

if isinstance(item, FileItem):
data['number'] = item['number']
data['data'] = item['data']
data['data_type'] = item['data_type']
data['encoding'] = item.get('encoding', 'utf-8')
if spider.note:
data['collection_note'] = spider.note

self._request(item, spider, 'create_file_item', 'File Item API', data)
if isinstance(item, FileItem):
data['number'] = item['number']
data['data'] = item['data']

# File
else:
if self.directory:
path = item['path']
data['local_file_name'] = os.path.join(self.directory, path)
files = {}
else:
path = os.path.join(item['files_store'], item['path'])
f = open(path, 'rb')
files = {'file': (item['file_name'], f, 'application/json')}

self._request(item, spider, 'create_file', 'File API', data, files)

def _request(self, item, spider, method, name, *args):
try:
response = getattr(self.client, method)(*args)
if not response.ok:
spider.logger.warning('Failed to post [%s]. %s status code: %s', item['url'], name,
response.status_code)
except (requests.exceptions.ConnectionError, requests.exceptions.ProxyError) as e:
spider.logger.warning('Failed to post [%s]. %s exception: %s', item['url'], name, e)
return self._request(spider, 'create_file_item', item['url'], data)

# File
if self.directory:
path = item['path']
data['local_file_name'] = os.path.join(self.directory, path)
files = {}
else:
path = os.path.join(item['files_store'], item['path'])
f = open(path, 'rb')
files = {'file': (item['file_name'], 'application/json', f)}

return self._request(spider, 'create_file', item['url'], data, files)

def _request(self, spider, method, infix, *args):
def log_for_status(response):
# Same condition as `Response.raise_for_status` in requests module.
# https://github.com/psf/requests/blob/28cc1d237b8922a2dcbd1ed95782a7f1751f475b/requests/models.py#L920
if 400 <= response.code < 600:
spider.logger.warning(f'{method} failed ({infix}) with status code: {response.code}')
# A return value is provided to ease testing.
return response

d = getattr(self.client, method)(*args)
d.addCallback(log_for_status)
return d


# https://stackoverflow.com/questions/25262765/handle-all-exception-in-scrapy-with-sentry
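The refactored `_request` now returns a Twisted Deferred from treq instead of blocking on a `requests` call, with `log_for_status` attached as a callback. A minimal sketch of how a caller could consume that Deferred, assuming a hypothetical extension instance `extension` and spider `spider` (names not part of this diff):

from twisted.internet import defer

@defer.inlineCallbacks
def post_file_item(extension, spider, item, data):
    # Yielding the Deferred waits for the treq response without blocking
    # the Twisted reactor that Scrapy runs on.
    response = yield extension._request(spider, 'create_file_item', item['url'], data)
    # log_for_status has already run as a callback and logged any 4xx/5xx code.
    return response.code

Both `item_scraped` and `spider_closed` are Scrapy signals that support Deferred-returning handlers, so returning the Deferred lets the crawl wait for the API call to finish.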
7 changes: 3 additions & 4 deletions kingfisher_scrapy/kingfisher_process.py
@@ -1,10 +1,10 @@
import requests
import treq


class Client:
def __init__(self, url, key):
self.url = url
self.key = key
self.headers = {'Authorization': f'ApiKey {key}'}

def create_file(self, data, files):
return self._post('/api/v1/submit/file/', data, files=files)
@@ -19,5 +19,4 @@ def end_collection_store(self, data):
return self._post('/api/v1/submit/end_collection_store/', data)

def _post(self, path, data, **kwargs):
return requests.post(self.url + path, headers={'Authorization': 'ApiKey ' + self.key}, data=data,
proxies={'http': None, 'https': None}, **kwargs)
return treq.post(f'{self.url}{path}', headers=self.headers, data=data, **kwargs)
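With treq, `_post` returns a Deferred that fires with a treq response object (its HTTP status is `response.code`) rather than a blocking `requests.Response`. A minimal usage sketch, assuming a hypothetical Kingfisher Process URL and API key:

from kingfisher_scrapy.kingfisher_process import Client

client = Client('http://localhost:9090', 'example-api-key')  # hypothetical URL and key

d = client.end_collection_store({
    'collection_source': 'example_spider',
    'collection_data_version': '2020-11-17 00:00:00',
    'collection_sample': 'False',
})
# The Deferred fires once the POST completes; errors surface via errbacks.
d.addCallback(lambda response: print('status', response.code))
d.addErrback(lambda failure: print('request failed:', failure.getErrorMessage()))

Inside a running crawl the Twisted reactor is already running, so the Deferred resolves as part of the normal event loop and no `reactor.run()` is needed.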
3 changes: 2 additions & 1 deletion requirements.in
@@ -5,9 +5,10 @@ jsonref
jsonschema
ocdsmerge
rarfile
requests
rfc3987
scrapyd
Scrapy>=2
scrapyd-client
sentry-sdk
treq
twisted
20 changes: 11 additions & 9 deletions requirements.txt
@@ -4,22 +4,23 @@
#
# pip-compile
#
attrs==19.3.0 # via automat, jsonschema, service-identity, twisted
attrs==19.3.0 # via automat, jsonschema, service-identity, treq, twisted
automat==0.8.0 # via twisted
certifi==2019.11.28 # via requests, sentry-sdk
cffi==1.13.2 # via cryptography
chardet==3.0.4 # via requests
constantly==15.1.0 # via twisted
contextlib2==0.6.0.post1 # via schema
cryptography==3.2.1 # via pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via parsel, scrapy
defusedxml==0.6.0 # via odfpy
et-xmlfile==1.0.1 # via openpyxl
flattentool==0.14.0 # via -r requirements.in
hyperlink==19.0.0 # via twisted
idna==2.8 # via hyperlink, requests
hyperlink==19.0.0 # via treq, twisted
idna==2.8 # via hyperlink, requests, twisted
ijson==3.1.1 # via -r requirements.in
importlib-metadata==1.6.1 # via jsonschema
incremental==17.5.0 # via twisted
incremental==17.5.0 # via treq, twisted
itemadapter==0.1.1 # via itemloaders, scrapy
itemloaders==1.0.3 # via scrapy
jdcal==1.4.1 # via openpyxl
@@ -38,21 +39,22 @@ pyasn1==0.4.8 # via pyasn1-modules, service-identity
pycparser==2.19 # via cffi
pydispatcher==2.0.5 # via scrapy
pyhamcrest==1.9.0 # via twisted
pyopenssl==19.1.0 # via scrapy
pyopenssl==19.1.0 # via scrapy, twisted
pyrsistent==0.16.0 # via jsonschema
pytz==2020.1 # via flattentool
queuelib==1.5.0 # via scrapy
rarfile==3.1 # via -r requirements.in
requests==2.22.0 # via -r requirements.in, ocdsmerge
requests==2.22.0 # via ocdsmerge, treq
rfc3987==1.3.8 # via -r requirements.in
schema==0.7.2 # via flattentool
scrapy==2.3.0 # via -r requirements.in, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.in
scrapyd==1.2.1 # via -r requirements.in
sentry-sdk==0.14.4 # via -r requirements.in
service-identity==18.1.0 # via scrapy
six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapyd, scrapyd-client, w3lib
twisted==20.3.0 # via scrapy, scrapyd
service-identity==18.1.0 # via scrapy, twisted
six==1.13.0 # via automat, cryptography, jsonschema, parsel, protego, pyhamcrest, pyopenssl, pyrsistent, scrapyd, scrapyd-client, treq, w3lib
treq==20.9.0 # via -r requirements.in
twisted[tls]==20.3.0 # via -r requirements.in, scrapy, scrapyd, treq
urllib3==1.25.7 # via requests, sentry-sdk
w3lib==1.21.0 # via itemloaders, parsel, scrapy
xmltodict==0.12.0 # via flattentool
1 change: 1 addition & 0 deletions requirements_dev.in
@@ -5,3 +5,4 @@ isort
pip-tools
pytest
pytest-cov
pytest-twisted
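pytest-twisted is added so tests can work directly with the Deferreds that the extension and client now return (the "return value is provided to ease testing" comment in extensions.py). A minimal sketch of the pattern, using a hypothetical stand-in for a treq-backed client method:

import pytest_twisted
from twisted.internet import defer


@pytest_twisted.inlineCallbacks
def test_deferred_api_call():
    # Stand-in for a client method that, like treq, returns a Deferred.
    def fake_create_file_item(data):
        return defer.succeed(type('Response', (), {'code': 200})())

    response = yield fake_create_file_item({'file_name': 'test.json'})
    assert response.code == 200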
24 changes: 14 additions & 10 deletions requirements_dev.txt
@@ -4,7 +4,7 @@
#
# pip-compile requirements_dev.in
#
attrs==19.3.0 # via -r requirements.txt, automat, jsonschema, pytest, service-identity, twisted
attrs==19.3.0 # via -r requirements.txt, automat, jsonschema, pytest, service-identity, treq, twisted
automat==0.8.0 # via -r requirements.txt, twisted
certifi==2019.11.28 # via -r requirements.txt, requests, sentry-sdk
cffi==1.13.2 # via -r requirements.txt, cryptography
@@ -16,17 +16,19 @@ coverage==5.0.3 # via coveralls, pytest-cov
coveralls==2.0.0 # via -r requirements_dev.in
cryptography==3.2.1 # via -r requirements.txt, pyopenssl, scrapy, service-identity
cssselect==1.1.0 # via -r requirements.txt, parsel, scrapy
decorator==4.4.2 # via pytest-twisted
defusedxml==0.6.0 # via -r requirements.txt, odfpy
docopt==0.6.2 # via coveralls
entrypoints==0.3 # via flake8
et-xmlfile==1.0.1 # via -r requirements.txt, openpyxl
flake8==3.7.9 # via -r requirements_dev.in
flattentool==0.14.0 # via -r requirements.txt
hyperlink==19.0.0 # via -r requirements.txt, twisted
idna==2.8 # via -r requirements.txt, hyperlink, requests
greenlet==0.4.17 # via pytest-twisted
hyperlink==19.0.0 # via -r requirements.txt, treq, twisted
idna==2.8 # via -r requirements.txt, hyperlink, requests, twisted
ijson==3.1.1 # via -r requirements.txt
importlib-metadata==1.6.1 # via -r requirements.txt, jsonschema, pluggy, pytest
incremental==17.5.0 # via -r requirements.txt, twisted
incremental==17.5.0 # via -r requirements.txt, treq, twisted
isort==4.3.21 # via -r requirements_dev.in
itemadapter==0.1.1 # via -r requirements.txt, itemloaders, scrapy
itemloaders==1.0.3 # via -r requirements.txt, scrapy
@@ -54,24 +56,26 @@ pycparser==2.19 # via -r requirements.txt, cffi
pydispatcher==2.0.5 # via -r requirements.txt, scrapy
pyflakes==2.1.1 # via flake8
pyhamcrest==1.9.0 # via -r requirements.txt, twisted
pyopenssl==19.1.0 # via -r requirements.txt, scrapy
pyopenssl==19.1.0 # via -r requirements.txt, scrapy, twisted
pyparsing==2.4.5 # via packaging
pyrsistent==0.16.0 # via -r requirements.txt, jsonschema
pytest-cov==2.8.1 # via -r requirements_dev.in
pytest==5.3.2 # via -r requirements_dev.in, pytest-cov
pytest-twisted==1.13.2 # via -r requirements_dev.in
pytest==5.3.2 # via -r requirements_dev.in, pytest-cov, pytest-twisted
pytz==2020.1 # via -r requirements.txt, flattentool
queuelib==1.5.0 # via -r requirements.txt, scrapy
rarfile==3.1 # via -r requirements.txt
requests==2.22.0 # via -r requirements.txt, coveralls, ocdsmerge
requests==2.22.0 # via -r requirements.txt, coveralls, ocdsmerge, treq
rfc3987==1.3.8 # via -r requirements.txt
schema==0.7.2 # via -r requirements.txt, flattentool
scrapy==2.3.0 # via -r requirements.txt, scrapyd, scrapyd-client
scrapyd-client==1.1.0 # via -r requirements.txt
scrapyd==1.2.1 # via -r requirements.txt
sentry-sdk==0.14.4 # via -r requirements.txt
service-identity==18.1.0 # via -r requirements.txt, scrapy
six==1.13.0 # via -r requirements.txt, automat, cryptography, jsonschema, packaging, parsel, pip-tools, protego, pyhamcrest, pyopenssl, pyrsistent, scrapyd, scrapyd-client, w3lib
twisted==20.3.0 # via -r requirements.txt, scrapy, scrapyd
service-identity==18.1.0 # via -r requirements.txt, scrapy, twisted
six==1.13.0 # via -r requirements.txt, automat, cryptography, jsonschema, packaging, parsel, pip-tools, protego, pyhamcrest, pyopenssl, pyrsistent, scrapyd, scrapyd-client, treq, w3lib
treq==20.9.0 # via -r requirements.txt
twisted[tls]==20.3.0 # via -r requirements.txt, scrapy, scrapyd, treq
urllib3==1.25.7 # via -r requirements.txt, requests, sentry-sdk
w3lib==1.21.0 # via -r requirements.txt, itemloaders, parsel, scrapy
wcwidth==0.1.7 # via pytest
