Skip to content

Commit

Permalink
feat(unicode): improved diacritics removal
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink authored and orangejulius committed Feb 8, 2022
1 parent c4c0e93 commit 8d21f63
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 16 deletions.
7 changes: 2 additions & 5 deletions lib/analysis.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

const lowercase = require('lower-case').lowerCase;
const removeAccents = require('remove-accents');
const unicode = require('./unicode');

const PARTIAL_TOKEN_SUFFIX = '\x26';
Expand Down Expand Up @@ -102,11 +101,9 @@ function normalize( input ){
return synonyms.map( function( synonym ){
return synonym.replace(/\s{2,}/g, ' ').trim();
})
// basic normalization
// note: lowercase MUST be run before removeAccents, please don't change the order
// see: https://github.com/pelias/placeholder/pull/12 for more detail.
// normalization
.map( function( synonym ){
return removeAccents( lowercase( synonym ) );
return lowercase( unicode.fold( synonym ) );
})
// remove empty synonyms
.filter( function( synonym ){
Expand Down
18 changes: 18 additions & 0 deletions lib/unicode.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const _ = require('lodash');
const regenerate = require('regenerate');
const accentsDiacritics = require('remove-accents-diacritics');

// non-printable control characters
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
Expand Down Expand Up @@ -93,4 +94,21 @@ function normalize(str) {
.replace(COMBINING_MARKS, '');
}

/**
 * Converts alphabetic, numeric, and symbolic characters that are not
 * in the Basic Latin Unicode block (the first 128 code points, i.e. ASCII)
 * to their ASCII equivalent, if one exists. For example, the filter
 * changes à to a.
 *
 * Pipeline (order matters):
 *   1. remove-accents-diacritics strips accents via its lookup table
 *   2. normalize('NFD') decomposes remaining characters into base + combining marks
 *   3. the COMBINING_MARKS regex (defined above) strips those marks
 *   4. normalize('NFKC') recomposes to the compatibility-composed form
 *
 * @param {*} str - input text; non-string values are returned unchanged
 * @returns {*} the folded string, or the original value if it was not a string
 */
function fold(str) {

  // sanity checking: pass non-string input through untouched
  if (!_.isString(str)) { return str; }

  return accentsDiacritics.remove(str)
    .normalize('NFD')
    .replace(COMBINING_MARKS, '')
    .normalize('NFKC');
}

module.exports.normalize = normalize;
module.exports.fold = fold;
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"pelias-logger": "^1.2.1",
"pelias-whosonfirst": "^5.0.0",
"regenerate": "^1.4.2",
"remove-accents": "^0.4.0",
"remove-accents-diacritics": "^1.0.2",
"require-dir": "^1.0.0",
"sorted-intersect": "^0.1.4",
"split2": "^3.0.0",
Expand Down
1 change: 0 additions & 1 deletion prototype/query.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
var async = require('async');
var util = require('util');
var Result = require('../lib/Result');
var sorted = require('../lib/sorted');
var debug = false;

function reduce( index, res ){
Expand Down
8 changes: 4 additions & 4 deletions prototype/tokenize.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

// plugin for tokenize
const _ = require('lodash'),
async = require('async'),
analysis = require('../lib/analysis'),
permutations = require('../lib/permutations');
const _ = require('lodash');
const async = require('async');
const analysis = require('../lib/analysis');
const permutations = require('../lib/permutations');

function tokenize(input, cb){

Expand Down
12 changes: 8 additions & 4 deletions test/lib/analysis.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@ module.exports.normalize = function(test, common) {
// Punctuation substitutions
assert( 'Straße', [ 'strasse' ] );
assert( 'Jǿ œ̆', [ 'jo oe' ] );
assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] );
assert( 'z︠h︡ovkva', [ 'zhovkva' ] );
assert( 'Žovkva', [ 'zovkva' ] );
assert( 'Żółkiew', [ 'zolkiew' ] );
assert( 'Trinidad & Tobago', [ 'trinidad and tobago' ] );

// Tests to confirm the order of function execution
// see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570
test( 'order of execution', function(t) {
t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'i̇nceyol' ] );
t.equal( analysis.normalize( 'İnceyol' )[0].length, 8 );
t.equal( analysis.normalize( 'İ' )[0].length, 2 );
test('order of execution', function(t) {
t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] );
t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 );
t.equal( analysis.normalize( 'İ' )[0].length, 1 );
t.end();
});

Expand Down
2 changes: 1 addition & 1 deletion test/prototype/tokenize_integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ module.exports.tokenize = function(test, util) {

assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]);
assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]);
assert('ケープタウン 南アフリカ', [['ケープタウン', '南アフリカ']]);
assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]);

// duplicates
assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]);
Expand Down

0 comments on commit 8d21f63

Please sign in to comment.