opendatateam · abulte · Jul 21, 2020 · Jul 14, 2020 · Jul 14, 2020 · Jul 15, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
   - Adding a celery job `purge-orphan-community-resources` to remove community resources not linked to a dataset. This should be scheduled regularly.
   - Adding a migration file to populate resources fs_filename new field. Scripts to delete the orphaned files are available [here](https://gist.github.com/quaxsze/dc089e4ecd2e00f82acea573d8d2cfb9).
 - Show traceback for migration errors [#2513](https://github.com/opendatateam/udata/pull/2513)
+- Add `schema` field to resources. This field can be filled based on an external schema catalog [#2512](https://github.com/opendatateam/udata/pull/2512)
 
 ## 2.1.3 (2020-06-29)
 

diff --git a/docs/adapting-settings.md b/docs/adapting-settings.md
@@ -126,6 +126,15 @@ The id of an existing user which will post a comment when a dataset is archived.
 The title of the comment optionaly posted when a dataset is archived.
 NB: the content of the comment is located in `udata/templates/comments/dataset_archived.txt`.
 
+
+### SCHEMA_CATALOG_URL
+
+**default** : `None`
+
+The URL of a schema catalog listing the schemas that resources can conform to. It must be a JSON endpoint returning a schema catalog. Example: https://schema.data.gouv.fr/schemas/schemas.json
+
+NB: this is used by the `datasets/schemas` API to fill the `schema` field of a `Resource`.
+
 ## URLs validation
 
 ### URLS_ALLOW_PRIVATE

diff --git a/js/components/dataset/resource/form.vue b/js/components/dataset/resource/form.vue
@@ -153,6 +153,7 @@ import CommunityResource from 'models/communityresource';
 import FormHorizontal from 'components/form/horizontal-form.vue';
 import UploaderMixin from 'mixins/uploader';
 import resource_types from 'models/resource_types';
+import schemas from 'models/schemas';
 
 export default {
     components: {FormHorizontal},
@@ -243,7 +244,23 @@ export default {
             }]
         },
         fields() {
-            return this.generic_fields.concat(this.file_fields);
+            return this.generic_fields.concat(this.schema_field).concat(this.file_fields);
+        },
+        /**
+         * Optional `schema` field definition, as an array so that it can be
+         * concatenated with the other field lists.
+         * Yields a single `select-input` field when the shared `schemas`
+         * model has data, and an empty array otherwise.
+         */
+        schema_field() {
+            if (schemas.has_data) {
+                // Prepend an empty entry ahead of the fetched schemas
+                // (presumably so that the selection can be left blank).
+                const values = [{id: '', label: ''}].concat(schemas.data);
+                return [{
+                    id: 'schema',
+                    label: this._('Schema'),
+                    widget: 'select-input',
+                    values,
+                    // Turn an `{id, label}` entry into a `{value, text}` pair
+                    map: function(item) {
+                        return {value: item.id, text: item.label};
+                    }
+                }];
+            }
+
+            return [];
         },
         is_community() {
             return this.resource instanceof CommunityResource;

diff --git a/js/models/schemas.js b/js/models/schemas.js
@@ -0,0 +1,14 @@
+import {List} from 'models/base';
+
+
+/**
+ * List model holding the available schemas.
+ *
+ * Configures the inherited `List` options with the `datasets` namespace and
+ * the `schemas` fetch operation (presumably the `datasets/schemas` API
+ * endpoint, to be confirmed against `models/base`) and sets `sorted`
+ * to `label`.
+ */
+export class Schemas extends List {
+    constructor(options) {
+        super(options);
+        this.$options.ns = 'datasets';
+        this.$options.fetch = 'schemas';
+        this.sorted = 'label';
+    }
+}
+
+export var schemas = new Schemas().fetch();
+export default schemas;
diff --git a/udata/core/dataset/api.py b/udata/core/dataset/api.py
@@ -21,7 +21,7 @@
 import logging
 from datetime import datetime
 
-from flask import request, current_app
+from flask import request, current_app, abort
 from flask_security import current_user
 
 from udata import search
@@ -44,17 +44,21 @@
     resource_fields,
     resource_type_fields,
     upload_fields,
+    schema_fields,
 )
 from udata.linkchecker.checker import check_resource
 from .models import (
     Dataset, Resource, Checksum, License, UPDATE_FREQUENCIES,
-    CommunityResource, RESOURCE_TYPES
+    CommunityResource, RESOURCE_TYPES, ResourceSchema
 )
 from .permissions import DatasetEditPermission, ResourceEditPermission
 from .forms import (
     ResourceForm, DatasetForm, CommunityResourceForm, ResourcesListForm
 )
 from .search import DatasetSearch
+from .exceptions import (
+    SchemasCatalogNotFoundException, SchemasCacheUnavailableException
+)
 
 log = logging.getLogger(__name__)
 
@@ -557,3 +561,18 @@ def get(self):
         '''List all resource types'''
         return [{'id': id, 'label': label}
                 for id, label in RESOURCE_TYPES.items()]
+
+
+# Read-only endpoint exposing the list returned by `ResourceSchema.objects()`.
+# The two schema exceptions this call can raise are turned into HTTP errors.
+@ns.route('/schemas/', endpoint='schemas')
+class SchemasAPI(API):
+    @api.doc('schemas')
+    @api.marshal_list_with(schema_fields)
+    def get(self):
+        '''List all available schemas'''
+        try:
+            # This method call is cached as it makes HTTP requests
+            return ResourceSchema.objects()
+        except SchemasCacheUnavailableException:
+            # No content could be obtained, neither from the catalog
+            # endpoint nor from the cache
+            abort(503, description='No schemas in cache and endpoint unavailable')
+        except SchemasCatalogNotFoundException:
+            # The catalog endpoint itself answered with a 404
+            abort(404, description='Schema catalog endpoint was not found')
diff --git a/udata/core/dataset/api_fields.py b/udata/core/dataset/api_fields.py
@@ -76,6 +76,9 @@
                                  'loaded as a standalone page (ie. iframe or '
                                  'new page)',
                                  readonly=True),
+    'schema': fields.String(
+        description='The schema slug the resource adheres to',
+        allow_null=True),
 })
 
 upload_fields = api.inherit('UploadedResource', resource_fields, {
@@ -210,3 +213,8 @@
     'id': fields.String(description='The resource type identifier'),
     'label': fields.String(description='The resource type display name')
 })
+
+schema_fields = api.model('Schema', {
+    'id': fields.String(description='The schema identifier'),
+    'label': fields.String(description='The schema display name')
+})
diff --git a/udata/core/dataset/exceptions.py b/udata/core/dataset/exceptions.py
@@ -0,0 +1,18 @@
+# Exception hierarchy:
+#   DatasetException
+#   └── ResourceSchemaException
+#       ├── SchemasCatalogNotFoundException
+#       └── SchemasCacheUnavailableException
+class DatasetException(Exception):
+    '''Base class for all dataset exceptions'''
+    pass
+
+
+class ResourceSchemaException(DatasetException):
+    '''Base class for exceptions related to the schemas of resources'''
+    pass
+
+
+class SchemasCatalogNotFoundException(ResourceSchemaException):
+    '''Raised when the schema catalog cannot be found (endpoint answered 404)'''
+    pass
+
+
+class SchemasCacheUnavailableException(ResourceSchemaException):
+    '''Raised when the schema catalog cache is not available or is empty'''
+    pass
diff --git a/udata/core/dataset/forms.py b/udata/core/dataset/forms.py
@@ -12,6 +12,7 @@
     Dataset, Resource, License, Checksum, CommunityResource,
     UPDATE_FREQUENCIES, DEFAULT_FREQUENCY, RESOURCE_FILETYPES, CHECKSUM_TYPES,
     LEGACY_FREQUENCIES, RESOURCE_TYPES, RESOURCE_FILETYPE_FILE,
+    ResourceSchema,
 )
 
 __all__ = ('DatasetForm', 'ResourceForm', 'CommunityResourceForm')
@@ -48,6 +49,17 @@ def enforce_filetype_file(form, field):
         ))
 
 
+def enforce_allowed_schemas(form, field):
+    '''
+    Form validator rejecting a schema which is not listed in the catalog.
+
+    `field.data` has to match the `id` of one of the entries returned by
+    `ResourceSchema.objects()`, otherwise a `validators.ValidationError`
+    listing the allowed values is raised.
+
+    Exceptions raised by `ResourceSchema.objects()` itself are not caught
+    here and propagate to the caller.
+    '''
+    schema = field.data
+    # Each catalog entry is a dict; only its `id` is compared to the field value
+    allowed_schemas = [s['id'] for s in ResourceSchema.objects()]
+    if schema not in allowed_schemas:
+        message = _('Schema "{schema}" is not an allowed value. Allowed values: {values}')
+        raise validators.ValidationError(message.format(
+            schema=schema,
+            values=', '.join(allowed_schemas)
+        ))
+
+
 class BaseResourceForm(ModelForm):
     title = fields.StringField(_('Title'), [validators.DataRequired()])
     description = fields.MarkdownField(_('Description'))
@@ -79,6 +91,11 @@ class BaseResourceForm(ModelForm):
         _('Publication date'),
         description=_('The publication date of the resource'))
     extras = fields.ExtrasField()
+    schema = fields.StringField(
+        _('Schema'),
+        default=None,
+        validators=[validators.optional(), enforce_allowed_schemas],
+        description=_('The schema slug the resource adheres to'))
 
 
 class ResourceForm(BaseResourceForm):

diff --git a/udata/core/dataset/models.py b/udata/core/dataset/models.py
@@ -1,5 +1,6 @@
 from datetime import datetime, timedelta
 from collections import OrderedDict
+import logging
 
 from blinker import signal
 from dateutil.parser import parse as parse_dt
@@ -9,20 +10,28 @@
 from stringdist import rdlevenshtein
 from werkzeug import cached_property
 from elasticsearch_dsl import Integer, Object
+import requests
 
+from udata.app import cache
 from udata.frontend.markdown import mdstrip
 from udata.models import db, WithMetrics, BadgeMixin, SpatialCoverage
 from udata.i18n import lazy_gettext as _
 from udata.utils import get_by, hash_url
 
 from .preview import get_preview_url
+from .exceptions import (
+    SchemasCatalogNotFoundException, SchemasCacheUnavailableException
+)
 
 __all__ = (
     'License', 'Resource', 'Dataset', 'Checksum', 'CommunityResource',
     'UPDATE_FREQUENCIES', 'LEGACY_FREQUENCIES', 'RESOURCE_FILETYPES',
     'PIVOTAL_DATA', 'DEFAULT_LICENSE', 'RESOURCE_TYPES',
+    'ResourceSchema'
 )
 
+log = logging.getLogger(__name__)
+
 #: Udata frequencies with their labels
 #:
 #: See: http://dublincore.org/groups/collections/frequency/
@@ -99,6 +108,8 @@
 # (ie. number of allowed character changes)
 MAX_DISTANCE = 2
 
+SCHEMA_CACHE_DURATION = 60 * 5  # In seconds
+
 
 def get_json_ld_extra(key, value):
     '''Serialize an extras key, value pair into JSON-LD'''
@@ -224,6 +235,7 @@ class ResourceMixin(object):
     filesize = db.IntField()  # `size` is a reserved keyword for mongoengine.
     fs_filename = db.StringField()
     extras = db.ExtrasField()
+    schema = db.StringField()
 
     created_at = db.DateTimeField(default=datetime.now, required=True)
     modified = db.DateTimeField(default=datetime.now, required=True)
@@ -748,6 +760,45 @@ def from_community(self):
         return True
 
 
+class ResourceSchema(object):
+    @staticmethod
+    @cache.memoize(timeout=SCHEMA_CACHE_DURATION)
+    def objects():
+        '''
+        Get a list of schemas from a schema catalog endpoint.
+
+        This has a double layer of cache:
+        - @cache.cached decorator w/ short lived cache for normal operations
+        - a long terme cache w/o timeout to be able to always render some content
+        '''
+        endpoint = current_app.config.get('SCHEMA_CATALOG_URL')
+        if endpoint is None:
+            return []
+
+        cache_key = 'schema-catalog-objects'
+        try:
+            response = requests.get(endpoint, timeout=5)
+            # do not cache 404 and forward status code
+            if response.status_code == 404:
+                raise SchemasCatalogNotFoundException(f'Schemas catalog does not exist at {endpoint}')
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            log.exception(f'Error while getting schema catalog from {endpoint}')
+            content = cache.get(cache_key)
+        else:
+            schemas = response.json().get('schemas', [])
+            content = [
+                {'id': s['name'], 'label': s['title']} for s in schemas
+            ]
+            cache.set(cache_key, content)
+        # no cached version or no content
+        if not content:
+            log.error(f'No content found inc. from cache for schema catalog')
+            raise SchemasCacheUnavailableException('No content in cache for schema catalog')
+
+        return content
+
+
 def get_resource(id):
     '''Fetch a resource given its UUID'''
     dataset = Dataset.objects(resources__id=id).first()

diff --git a/udata/settings.py b/udata/settings.py
@@ -437,6 +437,10 @@ class Defaults(object):
     ARCHIVE_COMMENT_USER_ID = None
     ARCHIVE_COMMENT_TITLE = _('This dataset has been archived')
 
+    # Schemas parameters
+    ####################
+    SCHEMA_CATALOG_URL = None
+
     API_DOC_EXTERNAL_LINK = 'https://doc.data.gouv.fr/api/reference/'
 
 

diff --git a/udata/tests/api/test_datasets_api.py b/udata/tests/api/test_datasets_api.py
@@ -5,9 +5,11 @@
 from uuid import uuid4
 
 from flask import url_for
+import pytest
 
 from . import APITestCase
 
+from udata.app import cache
 from udata.core import storages
 from udata.core.dataset.factories import (
     DatasetFactory, VisibleDatasetFactory, CommunityResourceFactory,
@@ -22,7 +24,7 @@
 )
 from udata.tags import MIN_TAG_LENGTH, MAX_TAG_LENGTH
 from udata.utils import unique_string, faker
-
+from udata.tests.helpers import assert200, assert404
 
 SAMPLE_GEOM = {
     "type": "MultiPolygon",
@@ -1362,3 +1364,64 @@ def test_resource_types_list(self):
         response = self.get(url_for('api.resource_types'))
         self.assert200(response)
         self.assertEqual(len(response.json), len(RESOURCE_TYPES))
+
+
+@pytest.mark.usefixtures('clean_db')
+class DatasetSchemasAPITest:
+    modules = ['core.dataset']
+
+    def test_dataset_schemas_api_list(self, api, rmock, app):
+        # Can't use @pytest.mark.options otherwise a request will be
+        # made before setting up rmock at module load, resulting in a 404
+        app.config['SCHEMA_CATALOG_URL'] = 'https://example.com/schemas'
+
+        rmock.get('https://example.com/schemas', json={
+            'schemas': [{"name": "etalab/schema-irve", "title": "Schéma IRVE"}]
+        })
+
+        response = api.get(url_for('api.schemas'))
+
+        assert200(response)
+        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]
+
+    @pytest.mark.options(SCHEMA_CATALOG_URL=None)
+    def test_dataset_schemas_api_list_no_catalog_url(self, api):
+        response = api.get(url_for('api.schemas'))
+
+        assert200(response)
+        assert response.json == []
+
+    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/notfound')
+    def test_dataset_schemas_api_list_not_found(self, api):
+        response = api.get(url_for('api.schemas'))
+        assert404(response)
+
+    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
+    def test_dataset_schemas_api_list_error_no_cache(self, api, rmock):
+        rmock.get('https://example.com/schemas', status_code=500)
+
+        response = api.get(url_for('api.schemas'))
+        assert response.status_code == 503
+
+    @pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
+    def test_dataset_schemas_api_list_error_w_cache(self, api, rmock, mocker):
+        cache_mock_set = mocker.patch.object(cache, 'set')
+        mocker.patch.object(cache, 'get', return_value=[{"id": "etalab/schema-irve", "label": "Schéma IRVE"}])
+
+        # Fill cache
+        rmock.get('https://example.com/schemas', json={
+            'schemas': [{"name": "etalab/schema-irve", "title": "Schéma IRVE"}]
+        })
+        response = api.get(url_for('api.schemas'))
+        assert200(response)
+        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]
+        assert cache_mock_set.called
+
+        # Endpoint becomes unavailable
+        rmock.get('https://example.com/schemas', status_code=500)
+
+        # Long term cache is used
+        response = api.get(url_for('api.schemas'))
+        assert200(response)
+        assert response.json == [{"id": "etalab/schema-irve", "label": "Schéma IRVE"}]
+