Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Admin: add schema field to resource #2512

Merged
merged 12 commits into from Jul 21, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -8,6 +8,7 @@
- Adding a celery job `purge-orphan-community-resources` to remove community resources not linked to a dataset. This should be scheduled regularly.
- Adding a migration file to populate resources fs_filename new field. Scripts to delete the orphaned files are available [here](https://gist.github.com/quaxsze/dc089e4ecd2e00f82acea573d8d2cfb9).
- Show traceback for migration errors [#2513](https://github.com/opendatateam/udata/pull/2513)
- Add `schema` field to resources. This field can be filled based on an external schema catalog [#2512](https://github.com/opendatateam/udata/pull/2512)

## 2.1.3 (2020-06-29)

Expand Down
9 changes: 9 additions & 0 deletions docs/adapting-settings.md
Expand Up @@ -126,6 +126,15 @@ The id of an existing user which will post a comment when a dataset is archived.
The title of the comment optionally posted when a dataset is archived.
NB: the content of the comment is located in `udata/templates/comments/dataset_archived.txt`.


### SCHEMA_CATALOG_URL

**default** : `None`

The URL to a schema catalog listing the schemas that resources can conform to. The URL should be a JSON endpoint returning a schema catalog. Example: https://schema.data.gouv.fr/schemas/schemas.json

NB: this is used by the `datasets/schemas` API to fill the `schema` field of a `Resource`.

## URLs validation

### URLS_ALLOW_PRIVATE
Expand Down
19 changes: 18 additions & 1 deletion js/components/dataset/resource/form.vue
Expand Up @@ -153,6 +153,7 @@ import CommunityResource from 'models/communityresource';
import FormHorizontal from 'components/form/horizontal-form.vue';
import UploaderMixin from 'mixins/uploader';
import resource_types from 'models/resource_types';
import schemas from 'models/schemas';

export default {
components: {FormHorizontal},
Expand Down Expand Up @@ -243,7 +244,23 @@ export default {
}]
},
fields() {
return this.generic_fields.concat(this.file_fields);
return this.generic_fields.concat(this.schema_field).concat(this.file_fields);
AntoineAugusti marked this conversation as resolved.
Show resolved Hide resolved
},
schema_field() {
if (schemas.has_data) {
const values = [{id: '', label: ''}].concat(schemas.data);
return [{
id: 'schema',
label: this._('Schema'),
widget: 'select-input',
values,
map: function(item) {
return {value: item.id, text: item.label};
}
}];
}

return [];
},
is_community() {
return this.resource instanceof CommunityResource;
Expand Down
14 changes: 14 additions & 0 deletions js/models/schemas.js
@@ -0,0 +1,14 @@
import {List} from 'models/base';


/**
 * List model for the schemas resources can adhere to.
 *
 * Sets the `ns` option to 'datasets' and the `fetch` option to 'schemas',
 * and sorts the list on the `label` attribute.
 */
export class Schemas extends List {
    constructor(options) {
        super(options);
        Object.assign(this.$options, {ns: 'datasets', fetch: 'schemas'});
        this.sorted = 'label';
    }
}

export var schemas = new Schemas().fetch();
export default schemas;
23 changes: 21 additions & 2 deletions udata/core/dataset/api.py
Expand Up @@ -21,7 +21,7 @@
import logging
from datetime import datetime

from flask import request, current_app
from flask import request, current_app, abort
from flask_security import current_user

from udata import search
Expand All @@ -44,17 +44,21 @@
resource_fields,
resource_type_fields,
upload_fields,
schema_fields,
)
from udata.linkchecker.checker import check_resource
from .models import (
Dataset, Resource, Checksum, License, UPDATE_FREQUENCIES,
CommunityResource, RESOURCE_TYPES
CommunityResource, RESOURCE_TYPES, ResourceSchema
)
from .permissions import DatasetEditPermission, ResourceEditPermission
from .forms import (
ResourceForm, DatasetForm, CommunityResourceForm, ResourcesListForm
)
from .search import DatasetSearch
from .exceptions import (
SchemasCatalogNotFoundException, SchemasCacheUnavailableException
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -557,3 +561,18 @@ def get(self):
'''List all resource types'''
return [{'id': id, 'label': label}
for id, label in RESOURCE_TYPES.items()]


@ns.route('/schemas/', endpoint='schemas')
class SchemasAPI(API):
    @api.doc('schemas')
    @api.marshal_list_with(schema_fields)
    def get(self):
        '''List all available schemas'''
        # The catalog lookup issues HTTP requests, hence it is cached
        # on the `ResourceSchema` side.
        try:
            catalog = ResourceSchema.objects()
        except SchemasCatalogNotFoundException:
            # The configured catalog URL answered with a 404
            abort(404, description='Schema catalog endpoint was not found')
        except SchemasCacheUnavailableException:
            # Neither the endpoint nor the cache could provide any content
            abort(503, description='No schemas in cache and endpoint unavailable')
        return catalog
8 changes: 8 additions & 0 deletions udata/core/dataset/api_fields.py
Expand Up @@ -76,6 +76,9 @@
'loaded as a standalone page (ie. iframe or '
'new page)',
readonly=True),
'schema': fields.String(
description='The schema slug the resource adheres to',
allow_null=True),
})

upload_fields = api.inherit('UploadedResource', resource_fields, {
Expand Down Expand Up @@ -210,3 +213,8 @@
'id': fields.String(description='The resource type identifier'),
'label': fields.String(description='The resource type display name')
})

# Serialization model for a single schema entry (identifier + display name),
# used to marshal the list returned by the `schemas` API endpoint.
schema_fields = api.model('Schema', {
    'id': fields.String(description='The schema identifier'),
    'label': fields.String(description='The schema display name')
})
18 changes: 18 additions & 0 deletions udata/core/dataset/exceptions.py
@@ -0,0 +1,18 @@
class DatasetException(Exception):
    '''Root of the exception hierarchy for dataset-related errors.'''


class ResourceSchemaException(DatasetException):
    '''Common parent of the errors related to the schemas of resources.'''


class SchemasCatalogNotFoundException(ResourceSchemaException):
    '''Signals that the schema catalog could not be found.'''


class SchemasCacheUnavailableException(ResourceSchemaException):
    '''Signals that the schema catalog cache is unavailable or empty.'''
17 changes: 17 additions & 0 deletions udata/core/dataset/forms.py
Expand Up @@ -12,6 +12,7 @@
Dataset, Resource, License, Checksum, CommunityResource,
UPDATE_FREQUENCIES, DEFAULT_FREQUENCY, RESOURCE_FILETYPES, CHECKSUM_TYPES,
LEGACY_FREQUENCIES, RESOURCE_TYPES, RESOURCE_FILETYPE_FILE,
ResourceSchema,
)

__all__ = ('DatasetForm', 'ResourceForm', 'CommunityResourceForm')
Expand Down Expand Up @@ -48,6 +49,17 @@ def enforce_filetype_file(form, field):
))


def enforce_allowed_schemas(form, field):
    '''
    Form validator rejecting a schema that is not listed in the catalog.

    The submitted value has to match the `id` of one of the entries returned
    by `ResourceSchema.objects()`; otherwise a `ValidationError` listing the
    allowed identifiers is raised.
    '''
    submitted = field.data
    known_ids = [entry['id'] for entry in ResourceSchema.objects()]
    if submitted in known_ids:
        return

    template = _('Schema "{schema}" is not an allowed value. Allowed values: {values}')
    raise validators.ValidationError(
        template.format(schema=submitted, values=', '.join(known_ids)))


class BaseResourceForm(ModelForm):
title = fields.StringField(_('Title'), [validators.DataRequired()])
description = fields.MarkdownField(_('Description'))
Expand Down Expand Up @@ -79,6 +91,11 @@ class BaseResourceForm(ModelForm):
_('Publication date'),
description=_('The publication date of the resource'))
extras = fields.ExtrasField()
schema = fields.StringField(
_('Schema'),
default=None,
validators=[validators.optional(), enforce_allowed_schemas],
description=_('The schema slug the resource adheres to'))


class ResourceForm(BaseResourceForm):
Expand Down
51 changes: 51 additions & 0 deletions udata/core/dataset/models.py
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta
from collections import OrderedDict
import logging

from blinker import signal
from dateutil.parser import parse as parse_dt
Expand All @@ -9,20 +10,28 @@
from stringdist import rdlevenshtein
from werkzeug import cached_property
from elasticsearch_dsl import Integer, Object
import requests

from udata.app import cache
from udata.frontend.markdown import mdstrip
from udata.models import db, WithMetrics, BadgeMixin, SpatialCoverage
from udata.i18n import lazy_gettext as _
from udata.utils import get_by, hash_url

from .preview import get_preview_url
from .exceptions import (
SchemasCatalogNotFoundException, SchemasCacheUnavailableException
)

__all__ = (
'License', 'Resource', 'Dataset', 'Checksum', 'CommunityResource',
'UPDATE_FREQUENCIES', 'LEGACY_FREQUENCIES', 'RESOURCE_FILETYPES',
'PIVOTAL_DATA', 'DEFAULT_LICENSE', 'RESOURCE_TYPES',
'ResourceSchema'
)

log = logging.getLogger(__name__)

#: Udata frequencies with their labels
#:
#: See: http://dublincore.org/groups/collections/frequency/
Expand Down Expand Up @@ -99,6 +108,8 @@
# (ie. number of allowed character changes)
MAX_DISTANCE = 2

SCHEMA_CACHE_DURATION = 60 * 5 # In seconds


def get_json_ld_extra(key, value):
'''Serialize an extras key, value pair into JSON-LD'''
Expand Down Expand Up @@ -224,6 +235,7 @@ class ResourceMixin(object):
filesize = db.IntField() # `size` is a reserved keyword for mongoengine.
fs_filename = db.StringField()
extras = db.ExtrasField()
schema = db.StringField()

created_at = db.DateTimeField(default=datetime.now, required=True)
modified = db.DateTimeField(default=datetime.now, required=True)
Expand Down Expand Up @@ -748,6 +760,45 @@ def from_community(self):
return True


class ResourceSchema(object):
    @staticmethod
    @cache.memoize(timeout=SCHEMA_CACHE_DURATION)
    def objects():
        '''
        Get a list of schemas from a schema catalog endpoint.

        This has a double layer of cache:
        - the `@cache.memoize` decorator, a short-lived cache
          (`SCHEMA_CACHE_DURATION` seconds) for normal operations
        - a long-term cache, refreshed on every successful fetch, to still
          be able to render some content when the endpoint is failing

        Returns a list of `{'id': ..., 'label': ...}` dicts, or an empty
        list when `SCHEMA_CATALOG_URL` is not configured.

        Raises `SchemasCatalogNotFoundException` when the endpoint answers
        with a 404, and `SchemasCacheUnavailableException` when no content
        could be obtained, neither from the endpoint nor from the cache.
        '''
        endpoint = current_app.config.get('SCHEMA_CATALOG_URL')
        if endpoint is None:
            return []

        cache_key = 'schema-catalog-objects'
        try:
            response = requests.get(endpoint, timeout=5)
            # do not cache 404 and forward status code
            if response.status_code == 404:
                raise SchemasCatalogNotFoundException(
                    f'Schemas catalog does not exist at {endpoint}')
            response.raise_for_status()
            schemas = response.json().get('schemas', [])
            content = [
                {'id': s['name'], 'label': s['title']} for s in schemas
            ]
        except (requests.exceptions.RequestException,
                ValueError, KeyError, TypeError, AttributeError):
            # Network failure, HTTP error or malformed payload (invalid JSON,
            # unexpected structure): fall back on the last catalog that was
            # successfully fetched instead of failing outright.
            log.exception('Error while getting schema catalog from %s', endpoint)
            content = cache.get(cache_key)
        else:
            # Without an explicit timeout the backend default timeout would
            # apply; `timeout=0` asks the cache backend to never expire the
            # key, which is what the long-term layer is meant to be.
            cache.set(cache_key, content, timeout=0)
        # no cached version or no content
        if not content:
            log.error('No content found inc. from cache for schema catalog')
            raise SchemasCacheUnavailableException('No content in cache for schema catalog')

        return content


def get_resource(id):
'''Fetch a resource given its UUID'''
dataset = Dataset.objects(resources__id=id).first()
Expand Down
4 changes: 4 additions & 0 deletions udata/settings.py
Expand Up @@ -437,6 +437,10 @@ class Defaults(object):
ARCHIVE_COMMENT_USER_ID = None
ARCHIVE_COMMENT_TITLE = _('This dataset has been archived')

# Schemas parameters
####################
SCHEMA_CATALOG_URL = None
AntoineAugusti marked this conversation as resolved.
Show resolved Hide resolved

API_DOC_EXTERNAL_LINK = 'https://doc.data.gouv.fr/api/reference/'


Expand Down
65 changes: 64 additions & 1 deletion udata/tests/api/test_datasets_api.py
Expand Up @@ -5,9 +5,11 @@
from uuid import uuid4

from flask import url_for
import pytest

from . import APITestCase

from udata.app import cache
from udata.core import storages
from udata.core.dataset.factories import (
DatasetFactory, VisibleDatasetFactory, CommunityResourceFactory,
Expand All @@ -22,7 +24,7 @@
)
from udata.tags import MIN_TAG_LENGTH, MAX_TAG_LENGTH
from udata.utils import unique_string, faker

from udata.tests.helpers import assert200, assert404

SAMPLE_GEOM = {
"type": "MultiPolygon",
Expand Down Expand Up @@ -1362,3 +1364,64 @@ def test_resource_types_list(self):
response = self.get(url_for('api.resource_types'))
self.assert200(response)
self.assertEqual(len(response.json), len(RESOURCE_TYPES))


@pytest.mark.usefixtures('clean_db')
class DatasetSchemasAPITest:
    '''Exercise the `api.schemas` endpoint against a mocked schema catalog.'''
    modules = ['core.dataset']

    def test_dataset_schemas_api_list(self, api, rmock, app):
        # Can't use @pytest.mark.options otherwise a request will be
        # made before setting up rmock at module load, resulting in a 404
        app.config['SCHEMA_CATALOG_URL'] = 'https://example.com/schemas'

        rmock.get('https://example.com/schemas', json={
            'schemas': [{"name": "etalab/schema-irve", "title": "Schéma IRVE"}]
        })

        response = api.get(url_for('api.schemas'))

        assert200(response)
        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]

    @pytest.mark.options(SCHEMA_CATALOG_URL=None)
    def test_dataset_schemas_api_list_no_catalog_url(self, api):
        # Without a configured catalog, the API answers with an empty list
        response = api.get(url_for('api.schemas'))

        assert200(response)
        assert response.json == []

    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/notfound')
    def test_dataset_schemas_api_list_not_found(self, api, rmock):
        # Mock the 404 explicitly so that the test is deterministic and
        # never depends on a real HTTP request to example.com
        rmock.get('https://example.com/notfound', status_code=404)

        response = api.get(url_for('api.schemas'))
        assert404(response)

    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
    def test_dataset_schemas_api_list_error_no_cache(self, api, rmock):
        # Endpoint failing and nothing in cache: service unavailable
        rmock.get('https://example.com/schemas', status_code=500)

        response = api.get(url_for('api.schemas'))
        assert response.status_code == 503

    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
    def test_dataset_schemas_api_list_error_w_cache(self, api, rmock, mocker):
        cache_mock_set = mocker.patch.object(cache, 'set')
        mocker.patch.object(cache, 'get', return_value=[{"id": "etalab/schema-irve", "label": "Schéma IRVE"}])

        # Fill cache
        rmock.get('https://example.com/schemas', json={
            'schemas': [{"name": "etalab/schema-irve", "title": "Schéma IRVE"}]
        })
        response = api.get(url_for('api.schemas'))
        assert200(response)
        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]
        assert cache_mock_set.called

        # Endpoint becomes unavailable
        rmock.get('https://example.com/schemas', status_code=500)

        # Long term cache is used
        response = api.get(url_for('api.schemas'))
        assert200(response)
        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]