Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve licenses detection in harvesting #1203

Merged
merged 2 commits into from
Oct 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- Properly serialize empty geometries for zones missing it and prevent leaflet crash on invalid bounds [#1188](https://github.com/opendatateam/udata/pull/1188)
- Start validating some configuration parameters [#1197](https://github.com/opendatateam/udata/pull/1197)
- Migration to remove resources without title or url [#1200](https://github.com/opendatateam/udata/pull/1200)
- Improve harvesting licenses detection [#1203](https://github.com/opendatateam/udata/pull/1203)

## 1.1.8 (2017-09-28)

Expand Down
23 changes: 22 additions & 1 deletion udata/core/dataset/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,30 @@ def __unicode__(self):
return self.title

@classmethod
def guess(cls, text):
def guess(cls, *strings, **kwargs):
'''
Try to guess a license from a list of strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe document the default kwarg?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or declare it explicitly in the method signature.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not possible to declare explicitely a keyword argument with variable arguments in Python 2 (already tried :/).
I'll add it to the documentation.


Accept a `default` keyword argument which will be
the default fallback license.
'''
license = None
for string in strings:
license = cls.guess_one(string)
if license:
break
return license or kwargs.get('default')

@classmethod
def guess_one(cls, text):
'''
Try to guess license from a string.

Try to exact match on identifier then slugified title
and fallback on edit distance ranking (after slugification)
'''
if not text:
return
qs = cls.objects
text = text.strip().lower() # Stored identifiers are lower case
slug = cls.slug.slugify(text) # Use slug as it normalize string
Expand All @@ -132,6 +149,10 @@ def guess(cls, text):
license = candidates[0]
return license

@classmethod
def default(cls):
return cls.objects(id=DEFAULT_LICENSE['id']).first()


class DatasetQuerySet(db.OwnedQuerySet):
def visible(self):
Expand Down
7 changes: 2 additions & 5 deletions udata/core/dataset/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,10 +372,7 @@ def dataset_from_rdf(graph, dataset=None):
elif isinstance(value, RdfResource):
licenses.add(value.identifier.toPython())

for text in licenses:
license = License.guess(text)
if license:
dataset.license = license
break
default_license = dataset.license or License.default()
dataset.license = License.guess(*licenses, default=default_license)

return dataset
10 changes: 8 additions & 2 deletions udata/harvest/backends/ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
'title': basestring,
'notes': Any(All(basestring, normalize_string), None),
'license_id': All(DefaultTo('not-specified'), basestring),
'license_title': Any(basestring, None),
'tags': [tag],

'metadata_created': All(basestring, to_date),
Expand Down Expand Up @@ -159,8 +160,13 @@ def process(self, item):
dataset.slug = data['name']
dataset.title = data['title']
dataset.description = data['notes']
dataset.license = License.objects(id=data['license_id']).first()
# dataset.license = license or License.objects.get(id='notspecified')

# Detect license
default_license = dataset.license or License.default()
dataset.license = License.guess(data['license_id'],
data['license_title'],
default=default_license)

dataset.tags = [t['name'] for t in data['tags'] if t['name']]

dataset.created_at = data['metadata_created']
Expand Down
10 changes: 6 additions & 4 deletions udata/harvest/backends/ods.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ def process(self, item):

dataset.tags = list(tags)

ods_license_id = ods_metadata.get('license')
if ods_license_id and ods_license_id in self.LICENSES:
license_id = self.LICENSES[ods_license_id]
dataset.license = License.objects.get(id=license_id)
# Detect license
default_license = dataset.license or License.default()
license_id = ods_metadata.get('license')
dataset.license = License.guess(license_id,
self.LICENSES.get(license_id),
default=default_license)

dataset.resources = []

Expand Down
19 changes: 19 additions & 0 deletions udata/tests/dataset/test_dataset_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,19 @@ def test_not_found(self):
found = License.guess('should not be found')
self.assertIsNone(found)

def test_not_found_with_default(self):
license = LicenseFactory()
found = License.guess('should not be found', default=license)
self.assertEqual(found.id, license.id)

def test_none(self):
found = License.guess(None)
self.assertIsNone(found)

def test_empty_string(self):
found = License.guess('')
self.assertIsNone(found)

def test_exact_match_by_id(self):
license = LicenseFactory()
found = License.guess(license.id)
Expand Down Expand Up @@ -294,3 +307,9 @@ def test_match_by_title_with_mismatching_case(self):
found = License.guess('License ODBL')
self.assertIsInstance(found, License)
self.assertEqual(license.id, found.id)

def test_multiple_strings(self):
license = LicenseFactory()
found = License.guess('should not match', license.id)
self.assertIsInstance(found, License)
self.assertEqual(license.id, found.id)