Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

deduplication - add isEquivalentIdentity() method #1580

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -124,20 +124,72 @@ function isNameDifferent(item1, item2, requestLanguage){
// iterate over all the languages in item2, comparing them to the
// 'default' name of item1 and also against the language requested by the user.
for( let lang in names2 ){
if (!names2.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these hasOwnProperty checks snuck in to the PR, they are not strictly related but considered good practice

if( !isPropertyDifferent({[lang]: names1.default}, names2, lang) ){ return false; }
if( requestLanguage && !isPropertyDifferent({[lang]: names1[requestLanguage]}, names2, lang) ){ return false; }
}

// iterate over all the languages in item1, comparing them to the
// 'default' name of item2 and also against the language requested by the user.
// as above, but inverse
for( let lang in names1 ){
if (!names1.hasOwnProperty(lang)) { continue; } // do not iterate over inherited properties
if( !isPropertyDifferent({[lang]: names2.default}, names1, lang) ){ return false; }
if( requestLanguage && !isPropertyDifferent({[lang]: names2[requestLanguage]}, names1, lang) ){ return false; }
}

return true;
}

// convenience function to return a GID (or null in case of an error)
// note: supports scalar values else takes the first element of an Array.
const GID = (item) => {
const parts = _.compact([
_.first(_.castArray(_.get(item, 'source'))),
_.first(_.castArray(_.get(item, 'layer'))),
_.first(_.castArray(_.get(item, 'source_id')))
]);

return (parts.length === 3) ? parts.join(':') : null;
};

// convenience function to return a *parent GID (or null in case of an error)
// note: supports scalar values else takes the first element of an Array.
// note: defaults the 'source' to 'whosonfirst' for backwards compatibility
const parentGID = (item, layer) => {
const source = _.first(_.compact(_.castArray(_.get(item, ['parent', `${layer}_source`]))));
const source_id = _.get(item, ['parent', `${layer}_id`]);

return GID({ layer, source: (source || 'whosonfirst'), source_id });
};

/**
* Compare the item properties and return true for any of the following predicates, else false
* - IsEqual -- The two items are save the same source, source_id and layer
* - IsEquivalant -- The two items are otherwise equalivant (as per corresponding code comments)
*/
function isEquivalentIdentity(item1, item2) {

// Both items must be on the same layer
if (item1.layer !== item2.layer) { return false; }

// Generate a GID value for each item
const gid1 = GID(item1);
const gid2 = GID(item2);

// Invalid GID(s)
if (gid1 === null || gid2 === null) { return false; }

// Equal GIDs
if (gid1 === gid2) { return true; }

// Equivalant GID ($item1 == $item2.parent[$item1.layer])
if (gid1 === parentGID(item2, item1.layer)) { return true; }

// Equivalant GID ($item2 = $item1.parent[$item2.layer])
if (gid2 === parentGID(item1, item2.layer)) { return true; }

return false;
}

/**
* Compare the address_parts properties if they exist.
* Returns false if the objects are the same, else true.
Expand Down Expand Up @@ -174,6 +226,7 @@ function isAddressDifferent(item1, item2){
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
*/
function isDifferent(item1, item2, requestLanguage){
if( isEquivalentIdentity( item1, item2 ) ){ return false; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be worth pointing out here that isEquivalentIdentity is different (no pun intended) than all the other checks in the isDifferent function.

All the others are each sufficient to prove that items are different. isEquivalentIdentity, on the other hand, is sufficient to prove that the items are the same.

I'm honestly surprised it took us this long to have such a check but we might want to divide the functions within isDifferent into two appropriate sections with a comment or something to help us keep track of them.

if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }
Expand Down Expand Up @@ -245,3 +298,4 @@ module.exports.isDifferent = isDifferent;
module.exports.layerPreferences = layerPreferences;
module.exports.isNameDifferent = isNameDifferent;
module.exports.normalizeString = normalizeString;
module.exports.isEquivalentIdentity = isEquivalentIdentity;
89 changes: 89 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent;
const normalizeString = require('../../../helper/diffPlaces').normalizeString;
const isEquivalentIdentity = require('../../../helper/diffPlaces').isEquivalentIdentity;

module.exports.tests = {};

Expand Down Expand Up @@ -539,6 +540,94 @@ module.exports.tests.isNameDifferent = function (test, common) {
});
};

module.exports.tests.isEquivalentIdentity = function (test, common) {
test('basic equivalence', function (t) {
t.true(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '1', layer: 'example' }
), 'same source, source_id and layer');

t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'foo', source_id: '1', layer: 'example' }
), 'different source');

t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example' }
), 'different source_id');

// two records sharing the same source, source_id but with
// differing layer are considered not equal.
// in practice this should be rare/never happen but if it
// becomes an issue would could consider making this stricter.
t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '1', layer: 'foo' }
), 'same source, source_id and layer');

// if either record fails to generate a valid GID then they are
// considered not equivalent.
t.false(isEquivalentIdentity(
{ source: 'test', source_id: '1' },
{ source: 'test', source_id: '1', layer: 'example' }
), 'invalid GID');

t.end();
});

test('equivalence via parent hierarchy', function (t) {
t.true(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
example_id: '1',
example_source: 'foo'
}
}), 'match parent GID');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
foo_id: '1',
foo_source: 'foo'
}
}), 'parent name must be from same layer');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
foo_id: '1',
example_source: 'foo'
}
}), 'parent name must have the same source_id');

t.false(isEquivalentIdentity(
{ source: 'foo', source_id: '1', layer: 'example' },
{ source: 'bar', source_id: '2', layer: 'example', parent: {
example_id: '1',
example_source: 'not_foo'
}
}), 'parent name must have the same source');

t.true(isEquivalentIdentity(
{ source: 'whosonfirst', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example', parent: {
example_id: '1'
}
}), 'parent source defaults to "whosonfirst"');

t.true(isEquivalentIdentity(
{ source: 'whosonfirst', source_id: '1', layer: 'example' },
{ source: 'test', source_id: '2', layer: 'example', parent: {
example_id: ['1'],
example_source: [null]
}
}), 'parent source defaults to "whosonfirst" (arrays)');

t.end();
});
};

module.exports.tests.normalizeString = function (test, common) {
test('lowercase', function (t) {
t.equal(normalizeString('Foo Bar'), 'foo bar');
Expand Down