View
@@ -1,56 +1,161 @@
import email.utils
import logging
import re
import os
import json
from six import StringIO
import jwt
import requests
from six import StringIO
from slugify import slugify
from dpp_runner.lib import DppRunner
from datapackage import Package
from os_api_cache import get_os_cache
from conductor.blueprints.user.controllers import PUBLIC_KEY
from .models import package_registry
os_api_url = os.environ.get('OS_API_URL')
api_cache = get_os_cache()
runner = DppRunner(max_workers=4)
os.environ['DPP_DB_ENGINE'] = os.environ['OS_CONDUCTOR_ENGINE']
os.environ['ELASTICSEARCH_ADDRESS'] = os.environ['OS_ELASTICSEARCH_ADDRESS']
os.environ['AWS_ACCESS_KEY_ID'] = os.environ['OS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ['OS_SECRET_ACCESS_KEY']
os.environ['S3_BUCKET_NAME'] = os.environ['OS_STORAGE_BUCKET_NAME']
def copy_except(obj, fields):
    """Return a shallow copy of *obj* without the keys listed in *fields*."""
    excluded = set(fields)
    return {key: value for key, value in obj.items() if key not in excluded}
def prepare_field(field, slugs):
    """Convert a datapackage field descriptor into a fiscal-spec field dict.

    :param field: field descriptor; must contain 'name' and 'osType'
    :param slugs: set of header slugs already taken; updated in place so
                  later calls de-duplicate against earlier ones
    :returns: dict with 'header', 'aliases', 'osType', optional 'title',
              and 'options' (remaining descriptor keys)
    """
    field_name = field['name'].strip()
    slug_base = slugify(field_name, separator='_', to_lower=True)
    slug = slug_base
    if slug in slugs:
        # Append an incrementing numeric suffix until the slug is unique.
        suffix = 0
        while slug in slugs:
            suffix += 1
            slug = '{}_{}'.format(slug_base, suffix)
    # Record the chosen slug; without this the collision check above never
    # fires and two fields could end up with the same header.
    slugs.add(slug)
    # Keep the original header spelling as an alias when slugification
    # changed it.
    if slug != field_name:
        aliases = [field_name]
    else:
        aliases = []
    ret = {
        'header': slug,
        'aliases': aliases,
        'osType': field['osType'],
    }
    if 'title' in field:
        ret['title'] = field['title']
    # Everything else in the descriptor is passed through as options.
    ret['options'] = copy_except(field,
                                 ('name', 'title', 'osType', 'type',
                                  'slug', 'conceptType', 'format'))
    return ret
def upload(datapackage, callback, token, cache_set):
def make_upload_complete_callback(name, token):
    """Build the zero-argument callback run when a package upload completes.

    :param name: registry id of the uploaded package
    :param token: encoded JWT of the uploading user, forwarded to
                  toggle_publish to authorize the privacy change
    :returns: callable taking no required arguments (name/token are bound
              as defaults so the closure survives later rebinding)
    """
    def on_upload_complete_callback(name=name, token=token):
        # Make package private — freshly loaded packages start out unpublished.
        toggle_publish(name, token, toggle=False, publish=False)
        # Obfuscate email in author field.
        # NOTE: get_raw returns a fixed 8-tuple; `name` is deliberately
        # rebound to the registry's canonical value here.
        name, datapackage_url, datapackage, \
            model, dataset_name, author,\
            status, loaded = package_registry.get_raw(name)
        # Get the full name from the author field, and rewrite it without
        # domain in the email
        fullname, email_addr = email.utils.parseaddr(datapackage['author'])
        email_addr = '{0}@not.shown'.format(email_addr.split('@')[0])
        datapackage['author'] = '{0} <{1}>'.format(fullname, email_addr)
        # Persist the scrubbed descriptor with all other fields unchanged.
        package_registry.save_model(name, datapackage_url, datapackage,
                                    model, dataset_name, author,
                                    status, loaded)
    return on_upload_complete_callback
def upload(datapackage, token, cache_get, cache_set):
"""Initiate a package load to the database
:param datapackage: URL for datapackage to load
:param callback: URL for callback to send to loader
:param token: authentication token for user performing the upload
"""
encoded_token = token
try:
token = jwt.decode(token.encode('ascii'),
PUBLIC_KEY,
algorithm='RS256')
except jwt.InvalidTokenError:
token = None
key = None
if token is None:
ret = {
"status": "fail",
"error": 'unauthorized'
}
else:
params = {
'package': datapackage,
'callback': callback
}
load_url = os_api_url+'/api/3/loader/'
response = requests.get(load_url, params=params)
if response.status_code == 200:
try:
key = 'os-conductor:package:'+datapackage
ret = {
"progress": 0,
"status": "queued"
}
else:
cache_set(key, ret, 3600)
package = Package(datapackage)
desc = package.descriptor
slugs = set()
r = package.resources[0]
source = {
'url': r.source
}
if r.descriptor.get('encoding') is not None:
source['encoding'] = r.descriptor.get('encoding')
fiscal_spec = {
'dataset-name:': desc['name'],
'resource-name': r.name,
'title': desc.get('title', desc['name']),
'datapackage-url': datapackage,
'owner-id': token['userid'],
'sources': [
source
],
'fields': [
prepare_field(f, slugs)
for f in
r.descriptor['schema']['fields']
if 'osType' in f
]
}
package_id = '{0}:{1}'.format(token['userid'],
slugify(desc['name'],
separator='_',
to_lower=True))
on_upload_complete_callback = \
make_upload_complete_callback(package_id, encoded_token)
status_cb = StatusCallback(datapackage, cache_get, cache_set,
on_upload_complete_callback)
logging.info('About to run spec\n%s', json.dumps(fiscal_spec, indent=2))
runner.start('fiscal', json.dumps(fiscal_spec).encode('utf8'),
verbosity=2, status_cb=status_cb)
except Exception as e:
ret = {
"status": "fail",
"error": 'HTTP %s' % response.status_code
"error": str(e)
}
if key is not None:
cache_set(key, ret, 3600)
return ret
@@ -60,24 +165,52 @@ def upload_status(datapackage, cache_get):
return ret
def upload_status_update(datapackage, status, error,
progress, cache_get, cache_set):
logging.error('upload_status_update: %s sts:%s, err:%s, prog:%s',
datapackage, status, error, progress)
if datapackage is not None and status is not None:
key = 'os-conductor:package:'+datapackage
ret = cache_get(key)
class StatusCallback:
def __init__(self, datapackage_url, cache_get, cache_set,
complete_callback):
self.datapackage_url = datapackage_url
self.cache_get = cache_get
self.cache_set = cache_set
self.statuses = {}
self.error = None
self.on_complete_callback = complete_callback
def status(self):
statuses = self.statuses.values()
if 'FAILED' in statuses:
return 'fail'
if 'INPROGRESS' in statuses:
return 'loading-data'
if all(self.statuses.get(pi) == 'SUCCESS'
for pi in ('./finalize_datapackage_flow',
'./dumper_flow_update_status')):
return 'done'
return 'loading-data'
def __call__(self, pipeline_id, status, errors=None, stats=None):
logging.debug('upload_status_update: %s pipeline:%s, ' +
'status:%s, err:%s, stats:%s',
self.datapackage_url, pipeline_id, status, errors, stats)
key = 'os-conductor:package:'+self.datapackage_url
ret = self.cache_get(key)
if ret is None:
ret = {
'status': status,
'progress': 0
}
if progress is not None:
ret['progress'] = int(progress)
if status == 'fail' and error is not None:
ret['error'] = error
ret['status'] = status
cache_set(key, ret, 3600)
if status == 'FAILED' and errors is not None and self.error is None:
self.error = '\n'.join(errors)
ret['error'] = self.error
self.statuses[pipeline_id] = status
ret['status'] = self.status()
if ret['status'] == 'done':
if stats is not None:
progress = stats.get('count_of_rows')
if progress:
ret['progress'] = int(progress)
self.on_complete_callback()
self.cache_set(key, ret, 3600)
def toggle_publish(name, token, toggle=False, publish=False):
@@ -88,9 +221,15 @@ def toggle_publish(name, token, toggle=False, publish=False):
except jwt.InvalidTokenError:
return None
name, datapackage_url, datapackage, \
model, dataset_name, author,\
status, loaded = package_registry.get_raw(name)
try:
name, datapackage_url, datapackage, \
model, dataset_name, author,\
status, loaded = package_registry.get_raw(name)
except KeyError:
datapackage = None
if datapackage is None:
logging.error('toggle_publish: Failed to locate package %s', name)
return {'success': False, 'error': 'Failed to locate package %s' % name}
private = datapackage.get('private', False)
if toggle:
private = not private
@@ -103,6 +242,38 @@ def toggle_publish(name, token, toggle=False, publish=False):
return {'success': True, 'published': not private}
def update_params(name, token, params):
    """
    Replace package.defaultParams for `name`. Only the owner may update.

    :param name: package id of the form '<userid>:<dataset>'
    :param token: encoded JWT identifying the caller
    :param params: new defaultParams value to store on the descriptor
    :returns: {'success': True} on success; None when the token is invalid,
              the caller is not the owner, or the package does not exist.
    """
    try:
        decoded = jwt.decode(token.encode('ascii'),
                             PUBLIC_KEY,
                             algorithm='RS256')
        userid = decoded['userid']
        owner = name.split(':')[0]
        if owner != userid:
            logging.error('USERID=%r, name=%r', userid, name)
            return None
    except jwt.InvalidTokenError:
        return None
    try:
        _, _, datapackage, *_ = package_registry.get_raw(name)
    except KeyError:
        # No package registered under `name`.
        return None
    datapackage['defaultParams'] = params
    package_registry.update_model(name, datapackage=datapackage)
    # Drop any cached API entry for this package; api_cache may be None,
    # in which case the AttributeError is deliberately ignored.
    try:
        api_cache.clear(name)
    except AttributeError:
        pass
    return {'success': True}
def delete_package(name, token):
try:
token = jwt.decode(token.encode('ascii'),
@@ -120,20 +291,19 @@ def delete_package(name, token):
return {'success': success}
obeu_url = 'http://eis-openbudgets.iais.fraunhofer.de/' \
'linkedpipes/execute/fdp2rdf'
webhook_obeu_url = os.environ.get('WEBHOOK_OBEU_URL', obeu_url)
def run_hooks(name, token):
def run_hooks(name, token, pipeline):
try:
jwt.decode(token.encode('ascii'),
PUBLIC_KEY,
algorithm='RS256')
except jwt.InvalidTokenError:
return None
_, datapackage_url, _, _, _, _, _, _ = package_registry.get_raw(name)
# Fix datastore.openspending.org url
datapackage_url = \
re.sub(r'https?://datastore\.openspending\.org/',
'https://s3.amazonaws.com/datastore.openspending.org/',
datapackage_url)
json_ld_payload = {
"@context": {
"@vocab": "http://schema.org/",
@@ -147,7 +317,7 @@ def run_hooks(name, token):
(filename, StringIO(json.dumps(json_ld_payload)), 'application/json')
)
]
response = requests.post(webhook_obeu_url, files=files)
response = requests.post(pipeline, files=files)
return {'success': True,
'response': response.text,
'payload': json_ld_payload}
View
@@ -12,6 +12,9 @@ services:
OS_ELASTICSEARCH_ADDRESS: es:9200
OS_API_URL: http://localhost:8000
OS_CHECK_ES_HEALTHY: 'True'
OS_ACCESS_KEY_ID: foo
OS_SECRET_ACCESS_KEY: bar
OS_STORAGE_BUCKET_NAME: buck
db:
image: 'postgres:10-alpine'
View
@@ -1,8 +1,6 @@
#!/bin/sh
set -e
cd /app
echo working from `pwd`
echo DB: $OS_CONDUCTOR_ENGINE
echo Setting base url to $OS_BASE_URL
View
@@ -1,18 +1,23 @@
flask
flask-cors
flask-session
flask-jsonpify
flask-oauthlib
flask>=0.12.0,<1.0.0
flask-cors>=3.0.0,<4.0.0
flask-session>=0.3.0,<1.0.0
flask-jsonpify>=1.5.0,<2.0.0
flask-oauthlib>=0.9.4,<1.0.0
pyyaml
boto
raven==6.0.0
requests
pyjwt
sqlalchemy
psycopg2
cryptography
elasticsearch>=1.0.0,<2.0.0
os-package-registry>=0.0.12
os-api-cache>=0.0.6
os-package-registry>=0.0.13
datapackage-pipelines-fiscal>=1.0.11
datapackage-pipelines-aws
datapackage-pipelines[speedup]>=1.6.16
dpp_runner
raven==6.0.0
blinker>=1.1
python-dotenv
gunicorn
View
@@ -0,0 +1,9 @@
# Quiet down third-party logging in test output.
import logging
es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.WARNING)
urllib3_logger = logging.getLogger('urllib3')
urllib3_logger.setLevel(logging.WARNING)
View
@@ -7,22 +7,45 @@
from elasticsearch import Elasticsearch, NotFoundError
from os_package_registry import PackageRegistry
from ..config import LOCAL_ELASTICSEARCH
from werkzeug.exceptions import BadRequest, NotFound, Forbidden
try:
from unittest.mock import Mock, patch
except ImportError:
from mock import Mock, patch
from importlib import import_module
from conductor.blueprints.user.controllers import PRIVATE_KEY
module = import_module('conductor.blueprints.package.controllers')
Response = namedtuple('Response', ['status_code'])
dpp_module = import_module('datapackage.helpers')
class Response:
def __init__(self, status_code, _json):
self.status_code = status_code
self._json = _json
def json(self):
return self._json
def raise_for_status(self):
if self.status_code != 200:
raise AssertionError('HTTP {}'.format(self.status_code))
datapackage = {
'name': 'my-dataset',
'resources': [
{
'name': 'my-resource',
'path': 'data.csv',
'schema': {
'fields': [
{'name': 'year', 'type': 'integer', 'osType': 'date:fiscal-year'}
]
}
}
]
}
_cache = {}
callback = 'http://conductor/callback'
token = jwt.encode({'userid': 'owner'}, PRIVATE_KEY, algorithm='RS256').decode('ascii')
token = None
def cache_get(key):
global _cache
@@ -39,20 +62,27 @@ class ApiloadTest(unittest.TestCase):
# Actions
def setUp(self):
from conductor.blueprints.user.controllers import PRIVATE_KEY
global token
self.private_key = PRIVATE_KEY
token = jwt.encode({'userid': 'owner'}, PRIVATE_KEY, algorithm='RS256').decode('ascii')
# Cleanup
self.addCleanup(patch.stopall)
# Various patches
self.requests = patch.object(module, 'requests').start()
self.requests = patch.object(dpp_module, 'requests').start()
self.requests.exceptions.RequestException = IOError
self.runner = patch.object(module, 'DppRunner').start()
self.runner.start = Mock(return_value=None)
module.os_api = 'api'
module.os_conductor = 'conductor'
global _cache
_cache = {}
# Tests
def assertResponse(self, ret, status=None, progress=None, error=None):
if status is not None:
self.assertEquals(ret['status'], status)
@@ -63,8 +93,8 @@ def assertResponse(self, ret, status=None, progress=None, error=None):
def test___load___good_request(self):
api_load = module.upload
self.requests.get = Mock(return_value=Response(200))
self.assertResponse(api_load('bla', callback, token, cache_set), 'queued', 0)
self.requests.get = Mock(return_value=Response(200, datapackage))
self.assertResponse(api_load('http://bla', token, cache_get, cache_set), 'queued', 0)
# def test___load___bad_request(self):
# api_load = module.upload
@@ -78,13 +108,13 @@ def test___load___good_request(self):
def test___callback___server_down(self):
api_load = module.upload
self.requests.get = Mock(return_value=Response(499))
self.assertResponse(api_load('bla', callback, token, cache_set), 'fail', error='HTTP 499')
self.requests.get = Mock(return_value=Response(499, datapackage))
self.assertResponse(api_load('http://bla', token, cache_get, cache_set), 'fail', error='HTTP 499')
def test___poll___good_request(self):
api_load = module.upload
self.requests.get = Mock(return_value=Response(200))
api_load('bla2', callback, token, cache_set)
self.requests.get = Mock(return_value=Response(200, datapackage))
api_load('bla2', token, cache_get, cache_set)
api_poll = module.upload_status
self.assertResponse(api_poll('bla2', cache_get), 'queued', 0)
@@ -97,50 +127,10 @@ def test___poll___nonexistent_request(self):
# api_poll = module.upload_status
# self.assertRaises(BadRequest, api_poll, None, cache_get)
def test___callback___no_update(self):
# No parameters
api_callback = module.upload_status_update
self.assertEquals(api_callback('bla4', None, None, 0, cache_get, cache_set), None)
api_poll = module.upload_status
self.assertEquals(api_poll('bla4', cache_get), None)
def test___callback___just_status(self):
# No parameters
api_callback = module.upload_status_update
self.assertEquals(api_callback('bla5', 'status1', None, 0, cache_get, cache_set), None)
api_poll = module.upload_status
self.assertResponse(api_poll('bla5', cache_get), 'status1', 0)
def test___callback___progress(self):
# No parameters
api_callback = module.upload_status_update
self.assertEquals(api_callback('bla6', 'status2', None, 123, cache_get, cache_set), None)
api_poll = module.upload_status
self.assertResponse(api_poll('bla6', cache_get), 'status2', 123)
def test___callback___errormsg_wrong_status(self):
# No parameters
api_callback = module.upload_status_update
self.assertEquals(api_callback('bla7', 'status3', 'wtf', 123, cache_get, cache_set), None)
api_poll = module.upload_status
self.assertResponse(api_poll('bla7', cache_get), 'status3', 123)
def test___callback___error_good_status(self):
# No parameters
api_callback = module.upload_status_update
self.assertEquals(api_callback('bla8', 'fail', 'wtf8', 0, cache_get, cache_set), None)
api_poll = module.upload_status
self.assertResponse(api_poll('bla8', cache_get), 'fail', 0, 'wtf8')
class PublishDeleteAPITests(unittest.TestCase):
DATASET_NAME='owner:datasetid'
DATASET_NAME = 'owner:datasetid'
def setUp(self):
# Clean index
@@ -154,7 +144,8 @@ def setUp(self):
time.sleep(1)
self.pr = PackageRegistry(es_connection_string=LOCAL_ELASTICSEARCH)
self.pr.save_model(self.DATASET_NAME, 'datapackage_url', {}, {}, 'dataset', 'author', '', True)
self.pr.save_model(self.DATASET_NAME, 'datapackage_url', {}, {},
'dataset', 'author', '', True)
def test__initial_value__none(self):
pkg = self.pr.get_package(self.DATASET_NAME)
@@ -206,7 +197,54 @@ def test__force_unpublish__correct(self):
assert(pkg.get('private') is True)
class UpdateDefaultParamsAPITests(unittest.TestCase):
DATASET_NAME = 'owner:datasetid'
DEFAULT_PARAMS = {'param1': True, 'param2': 'hello'}
def setUp(self):
# Clean index
self.es = Elasticsearch(hosts=[LOCAL_ELASTICSEARCH])
try:
self.es.indices.delete(index='users')
self.es.indices.delete(index='packages')
except NotFoundError:
pass
self.es.indices.create('users')
time.sleep(1)
self.pr = PackageRegistry(es_connection_string=LOCAL_ELASTICSEARCH)
self.pr.save_model(self.DATASET_NAME, 'datapackage_url', {}, {},
'dataset', 'author', '', True)
def test__initial_value__none(self):
pkg = self.pr.get_package(self.DATASET_NAME)
assert(pkg.get('defaultParams') is None)
def test__update_params__empty_params(self):
module.update_params('owner:datasetid', token, {})
pkg = self.pr.get_package(self.DATASET_NAME)
assert(pkg.get('defaultParams') == {})
def test__update_params__with_value(self):
module.update_params('owner:datasetid', token, self.DEFAULT_PARAMS)
pkg = self.pr.get_package(self.DATASET_NAME)
assert(pkg.get('defaultParams') == self.DEFAULT_PARAMS)
def test__update_params__bad_owner(self):
module.update_params('badowner:datasetid', token, self.DEFAULT_PARAMS)
pkg = self.pr.get_package(self.DATASET_NAME)
assert(pkg.get('defaultParams') is None)
def test__update_params__bad_package_id(self):
module.update_params('owner:baddatasetid', token, self.DEFAULT_PARAMS)
pkg = self.pr.get_package(self.DATASET_NAME)
assert(pkg.get('defaultParams') is None)
class StatsTests(unittest.TestCase):
def test__stats__delegates_to_package_registry(self):
with patch('conductor.blueprints.package.models.package_registry.get_stats') as get_stats_mock:
stats_path = \
'conductor.blueprints.package.models.package_registry.get_stats'
with patch(stats_path) as get_stats_mock:
assert module.stats() == get_stats_mock()
View
@@ -6,12 +6,15 @@ envlist =
setenv =
OS_CONDUCTOR_ENGINE=postgresql://postgres@/postgres
OS_ELASTICSEARCH_ADDRESS=localhost:9200
OS_ACCESS_KEY_ID=access_key
OS_SECRET_ACCESS_KEY=secret_key
OS_STORAGE_BUCKET_NAME=
deps =
-rrequirements.txt
nose
coverage
commands =
nosetests tests \
nosetests -w tests \
{posargs} \
--with-coverage \
--cover-package conductor