Skip to content

Commit

Permalink
Merge pull request #1203 from noirbizarre/harvest-improve-licenses-de…
Browse files Browse the repository at this point in the history
…tection

Improve licenses detection in harvesting
  • Loading branch information
noirbizarre committed Oct 9, 2017
2 parents 9034872 + 2e6555c commit 6d7529f
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- Properly serialize empty geometries for zones missing it and prevent leaflet crash on invalid bounds [#1188](https://github.com/opendatateam/udata/pull/1188)
- Start validating some configuration parameters [#1197](https://github.com/opendatateam/udata/pull/1197)
- Migration to remove resources without title or url [#1200](https://github.com/opendatateam/udata/pull/1200)
- Improve harvesting licenses detection [#1203](https://github.com/opendatateam/udata/pull/1203)

## 1.1.8 (2017-09-28)

Expand Down
23 changes: 22 additions & 1 deletion udata/core/dataset/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,30 @@ def __unicode__(self):
return self.title

@classmethod
def guess(cls, text):
def guess(cls, *strings, **kwargs):
'''
Try to guess a license from a list of strings.
Accept a `default` keyword argument which will be
the default fallback license.
'''
license = None
for string in strings:
license = cls.guess_one(string)
if license:
break
return license or kwargs.get('default')

@classmethod
def guess_one(cls, text):
'''
Try to guess license from a string.
Try to exact match on identifier then slugified title
and fallback on edit distance ranking (after slugification)
'''
if not text:
return
qs = cls.objects
text = text.strip().lower() # Stored identifiers are lower case
slug = cls.slug.slugify(text) # Use slug as it normalize string
Expand All @@ -132,6 +149,10 @@ def guess(cls, text):
license = candidates[0]
return license

@classmethod
def default(cls):
return cls.objects(id=DEFAULT_LICENSE['id']).first()


class DatasetQuerySet(db.OwnedQuerySet):
def visible(self):
Expand Down
7 changes: 2 additions & 5 deletions udata/core/dataset/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,10 +372,7 @@ def dataset_from_rdf(graph, dataset=None):
elif isinstance(value, RdfResource):
licenses.add(value.identifier.toPython())

for text in licenses:
license = License.guess(text)
if license:
dataset.license = license
break
default_license = dataset.license or License.default()
dataset.license = License.guess(*licenses, default=default_license)

return dataset
10 changes: 8 additions & 2 deletions udata/harvest/backends/ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
'title': basestring,
'notes': Any(All(basestring, normalize_string), None),
'license_id': All(DefaultTo('not-specified'), basestring),
'license_title': Any(basestring, None),
'tags': [tag],

'metadata_created': All(basestring, to_date),
Expand Down Expand Up @@ -159,8 +160,13 @@ def process(self, item):
dataset.slug = data['name']
dataset.title = data['title']
dataset.description = data['notes']
dataset.license = License.objects(id=data['license_id']).first()
# dataset.license = license or License.objects.get(id='notspecified')

# Detect license
default_license = dataset.license or License.default()
dataset.license = License.guess(data['license_id'],
data['license_title'],
default=default_license)

dataset.tags = [t['name'] for t in data['tags'] if t['name']]

dataset.created_at = data['metadata_created']
Expand Down
10 changes: 6 additions & 4 deletions udata/harvest/backends/ods.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ def process(self, item):

dataset.tags = list(tags)

ods_license_id = ods_metadata.get('license')
if ods_license_id and ods_license_id in self.LICENSES:
license_id = self.LICENSES[ods_license_id]
dataset.license = License.objects.get(id=license_id)
# Detect license
default_license = dataset.license or License.default()
license_id = ods_metadata.get('license')
dataset.license = License.guess(license_id,
self.LICENSES.get(license_id),
default=default_license)

dataset.resources = []

Expand Down
19 changes: 19 additions & 0 deletions udata/tests/dataset/test_dataset_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,19 @@ def test_not_found(self):
found = License.guess('should not be found')
self.assertIsNone(found)

def test_not_found_with_default(self):
license = LicenseFactory()
found = License.guess('should not be found', default=license)
self.assertEqual(found.id, license.id)

def test_none(self):
found = License.guess(None)
self.assertIsNone(found)

def test_empty_string(self):
found = License.guess('')
self.assertIsNone(found)

def test_exact_match_by_id(self):
license = LicenseFactory()
found = License.guess(license.id)
Expand Down Expand Up @@ -294,3 +307,9 @@ def test_match_by_title_with_mismatching_case(self):
found = License.guess('License ODBL')
self.assertIsInstance(found, License)
self.assertEqual(license.id, found.id)

def test_multiple_strings(self):
license = LicenseFactory()
found = License.guess('should not match', license.id)
self.assertIsInstance(found, License)
self.assertEqual(license.id, found.id)

0 comments on commit 6d7529f

Please sign in to comment.