Skip to content

Commit

Permalink
Abstract link checking through entrypoint (#1110)
Browse files Browse the repository at this point in the history
Croquemort specific stuff is removed from udata and will live in udata-croquemort.
Linkchecking can now be specified on a resource level.
Related change: default availability for a user's datasets is now 100% (vs 0%).
  • Loading branch information
abulte committed Oct 18, 2017
1 parent 364bb06 commit ce56a0b
Show file tree
Hide file tree
Showing 25 changed files with 406 additions and 404 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -25,6 +25,7 @@
- Explicit dataset search reuse facet context (only known reuses) [#1219](https://github.com/opendatateam/udata/pull/1219)
- Optimize indexation a little bit [#1215](https://github.com/opendatateam/udata/pull/1215)
- Fix some reversed temporal coverage [migration] [#1214](https://github.com/opendatateam/udata/pull/1214)
- Move croquemort features to a generic link checker architecture [#1110](https://github.com/opendatateam/udata/pull/1110)

## 1.1.8 (2017-09-28)

Expand Down
25 changes: 25 additions & 0 deletions docs/adapting-settings.md
Expand Up @@ -136,6 +136,31 @@ The number of items to fetch while previewing an harvest source

A cron expression used as default harvester schedule when validating harvesters.

## Link checker configuration

### LINKCHECKING_ENABLED

**default**: `True`

A flag to enable checking of resource URLs by an external link checker.

### LINKCHECKING_DEFAULT_LINKCHECKER

**default**: `no_check`

An entrypoint key of `udata.linkcheckers` that will be used as a default link checker, i.e. when no specific link checker is set for a resource (via `resource.extras.check:checker`).

### LINKCHECKING_IGNORE_DOMAINS

**default**: `[]`

A list of domains to ignore when triggering link checking of resource URLs.

### LINKCHECKING_CACHE_DURATION

**default**: `300`

The minimum time in seconds between two consecutive checks of a resource's URL.

## ElasticSearch configuration

Expand Down
29 changes: 29 additions & 0 deletions js/components/dataset/resource/availability.vue
@@ -0,0 +1,29 @@
<template>
<span v-if="availability === 'AVAILABLE'" class="badge bg-green">✓</span>
<span v-if="availability === 'NOT_AVAILABLE'" class="badge bg-red">×</span>
<span v-if="availability === 'UNKNOWN'" class="badge bg-gray">?</span>
</template>

<script>
/**
 * Badge component showing the link-check availability of a resource.
 *
 * The template renders one of three badges depending on the value of
 * the `availability` computed property.
 */
export default {
    name: 'resource-availability',
    props: {
        // The resource whose `extras` carry the link-check result.
        resource: {
            type: Object,
            required: true,
        }
    },
    computed: {
        /**
         * Map `resource.extras['check:available']` to a state label.
         *
         * Only the strict booleans `true` and `false` are recognised;
         * anything else (including a resource without `extras`) yields
         * 'UNKNOWN'.
         *
         * @return {String} 'AVAILABLE', 'NOT_AVAILABLE' or 'UNKNOWN'
         */
        availability() {
            const extras = this.resource.extras;
            const flag = extras && extras['check:available'];
            if (flag === true) {
                return 'AVAILABLE';
            }
            if (flag === false) {
                return 'NOT_AVAILABLE';
            }
            return 'UNKNOWN';
        }
    }
}
</script>
6 changes: 3 additions & 3 deletions js/components/dataset/resource/list.vue
Expand Up @@ -85,8 +85,7 @@
</span>
</td>
<td class="text-center">
<span v-if="resource.is_available" class="badge bg-green">✓</span>
<span v-if="!resource.is_available" class="badge bg-red">×</span>
<resource-availability :resource="resource"></resource-availability>
</td>
</tr>
<tr v-if="!(dataset && dataset.resources)" class="text-center lead">
Expand Down Expand Up @@ -137,11 +136,12 @@ import Uploader from 'mixins/uploader';
import Resource from 'models/resource';
import Box from 'components/containers/box.vue';
import PaginationWidget from 'components/pagination.vue';
import ResourceAvailability from './availability.vue';
export default {
name: 'resources-list',
mixins: [Uploader, Sorter],
components: {Box, PaginationWidget},
components: {Box, PaginationWidget, ResourceAvailability},
props: {
dataset: {
type: Object,
Expand Down
28 changes: 15 additions & 13 deletions js/front/dataset/index.js
Expand Up @@ -160,20 +160,22 @@ new Vue({
el.classList.add('format-label-warning');
el.setTooltip(this._('The server may be hard to reach (FTP).'), true);
} else {
this.$api.get(checkurl, {url: url.href, group: this.dataset.alternateName})
.then(() => el.classList.add('format-label-success'))
.catch(error => {
switch (error.status) {
case 404:
el.classList.add('format-label-warning');
el.setTooltip(this._('The resource cannot be found.'), true);
break;
case 503:
break;
default:
el.classList.add('format-label-danger');
el.setTooltip(this._('The server cannot be found.'), true);
this.$api.get(checkurl)
.then((res) => {
const status = res['check:status'];
if (status >= 200 && status < 400) {
el.classList.add('format-label-success')
} else if (status >= 400 && status < 500) {
el.classList.add('format-label-danger');
el.setTooltip(this._('The resource cannot be found.'), true);
} else if (status >= 500) {
el.classList.add('format-label-warning');
el.setTooltip(this._('An error occured on the remote server. This may be temporary.'), true);
}
})
.catch(error => {
el.classList.add('format-label-unchecked');
console.log('Something went wrong with the linkchecker', error);
});
}
}
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Expand Up @@ -138,6 +138,9 @@ def pip(filename):
'adorable = udata.features.identicon.backends:adorable',
'robohash = udata.features.identicon.backends:robohash',
],
'udata.linkcheckers': [
'no_check = udata.linkchecker.backends:NoCheckLinkchecker',
],
},
license='GNU AGPLv3+',
# use_2to3=True,
Expand Down
1 change: 1 addition & 0 deletions udata/commands/__init__.py
Expand Up @@ -64,6 +64,7 @@ def register_commands(manager):
import udata.api.commands # noqa
import udata.harvest.commands # noqa
import udata.features.territories.commands # noqa
import udata.linkchecker.commands # noqa

# Dynamic module commands loading
for plugin in manager.app.config['PLUGINS']:
Expand Down
36 changes: 13 additions & 23 deletions udata/core/dataset/api.py
Expand Up @@ -19,6 +19,7 @@
'''
from __future__ import unicode_literals
import os
import logging
from datetime import datetime

from flask import request, current_app
Expand All @@ -34,7 +35,6 @@
from udata.core.followers.api import FollowAPI
from udata.utils import get_by, multi_to_dict

from .croquemort import check_url
from .api_fields import (
community_resource_fields,
community_resource_page_fields,
Expand All @@ -46,6 +46,7 @@
resource_fields,
upload_fields,
)
from udata.linkchecker.checker import check_resource
from .models import (
Dataset, Resource, Checksum, License, UPDATE_FREQUENCIES,
CommunityResource
Expand All @@ -56,6 +57,8 @@
)
from .search import DatasetSearch

log = logging.getLogger(__name__)

ns = api.namespace('datasets', 'Dataset related operations')
search_parser = DatasetSearch.as_request_parser()
community_parser = api.parser()
Expand Down Expand Up @@ -516,26 +519,13 @@ def get(self):
return current_app.config['ALLOWED_RESOURCES_EXTENSIONS']


checkurl_parser = api.parser()
checkurl_parser.add_argument('url', type=str, help='The URL to check',
location='args', required=True)
checkurl_parser.add_argument('group', type=str,
help='The dataset related to the URL',
location='args', required=True)


@ns.route('/checkurl/', endpoint='checkurl')
class CheckUrlAPI(API):
@ns.route('/<dataset:dataset>/resources/<uuid:rid>/check/',
endpoint='check_dataset_resource', doc=common_doc)
@api.doc(params={'rid': 'The resource unique identifier'})
class CheckDatasetResource(API, ResourceMixin):

@api.doc('checkurl', parser=checkurl_parser)
def get(self):
'''Checks that a URL exists and returns metadata.'''
args = checkurl_parser.parse_args()
error, response = check_url(args['url'], args['group'])
status = (isinstance(response, int) and response or
int(response.get('status', 500)))
if error or status >= 500:
# We keep 503 which means the URL checker is unreachable.
return error, status == 503 and status or 500
else:
return response, status
@api.doc('check_dataset_resource')
def get(self, dataset, rid):
    '''Checks that a resource's URL exists and returns metadata.

    The resource identified by ``rid`` is looked up on ``dataset``
    through ``get_resource_or_404`` (presumably aborting with a 404
    when it is missing -- confirm against ``ResourceMixin``), then
    handed to ``check_resource``, whose result is returned unchanged.
    '''
    return check_resource(self.get_resource_or_404(dataset, rid))
2 changes: 0 additions & 2 deletions udata/core/dataset/api_fields.py
Expand Up @@ -63,8 +63,6 @@
description='The resource last modification date'),
'metrics': fields.Raw(description='The resource metrics', readonly=True),
'extras': fields.Raw(description='Extra attributes as key-value pairs'),
'is_available': fields.Raw(
description='The resource availability', readonly=True),
})

upload_fields = api.inherit('UploadedResource', resource_fields, {
Expand Down
137 changes: 0 additions & 137 deletions udata/core/dataset/croquemort.py

This file was deleted.

0 comments on commit ce56a0b

Please sign in to comment.