Skip to content

Commit

Permalink
feat(unicode): improved diacritics removal
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink authored and orangejulius committed Feb 8, 2022
1 parent c4c0e93 commit 8d21f63
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 16 deletions.
7 changes: 2 additions & 5 deletions lib/analysis.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

const lowercase = require('lower-case').lowerCase;
const removeAccents = require('remove-accents');
const unicode = require('./unicode');

const PARTIAL_TOKEN_SUFFIX = '\x26';
Expand Down Expand Up @@ -102,11 +101,9 @@ function normalize( input ){
return synonyms.map( function( synonym ){
return synonym.replace(/\s{2,}/g, ' ').trim();
})
// basic normalization
// note: lowercase MUST be run before removeAccents, please don't change the order
// see: https://github.com/pelias/placeholder/pull/12 for more detail.
// normalization
.map( function( synonym ){
return removeAccents( lowercase( synonym ) );
return lowercase( unicode.fold( synonym ) );
})
// remove empty synonyms
.filter( function( synonym ){
Expand Down
18 changes: 18 additions & 0 deletions lib/unicode.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const _ = require('lodash');
const regenerate = require('regenerate');
const accentsDiacritics = require('remove-accents-diacritics');

// non-printable control characters
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
Expand Down Expand Up @@ -93,4 +94,21 @@ function normalize(str) {
.replace(COMBINING_MARKS, '');
}

/**
 * Converts alphabetic, numeric, and symbolic characters that are not
 * in the Basic Latin Unicode block (the first 128 code points, i.e. ASCII)
 * to their ASCII equivalent, if one exists. For example, the filter
 * changes à to a.
 *
 * Pipeline (order matters):
 *   1. remove-accents-diacritics strips accents via its lookup table
 *   2. normalize('NFD') decomposes remaining characters into base + combining marks
 *   3. the COMBINING_MARKS regex (defined above) strips those marks
 *   4. normalize('NFKC') recomposes to the compatibility-composed form
 *
 * @param {*} str - input text; non-string values are returned unchanged
 * @returns {*} the folded string, or the original value if it was not a string
 */
function fold(str) {

  // sanity checking: pass non-string input through untouched
  if (!_.isString(str)) { return str; }

  return accentsDiacritics.remove(str)
    .normalize('NFD')
    .replace(COMBINING_MARKS, '')
    .normalize('NFKC');
}

module.exports.normalize = normalize;
module.exports.fold = fold;
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"pelias-logger": "^1.2.1",
"pelias-whosonfirst": "^5.0.0",
"regenerate": "^1.4.2",
"remove-accents": "^0.4.0",
"remove-accents-diacritics": "^1.0.2",
"require-dir": "^1.0.0",
"sorted-intersect": "^0.1.4",
"split2": "^3.0.0",
Expand Down
1 change: 0 additions & 1 deletion prototype/query.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
var async = require('async');
var util = require('util');
var Result = require('../lib/Result');
var sorted = require('../lib/sorted');
var debug = false;

function reduce( index, res ){
Expand Down
8 changes: 4 additions & 4 deletions prototype/tokenize.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

// plugin for tokenize
const _ = require('lodash'),
async = require('async'),
analysis = require('../lib/analysis'),
permutations = require('../lib/permutations');
const _ = require('lodash');
const async = require('async');
const analysis = require('../lib/analysis');
const permutations = require('../lib/permutations');

function tokenize(input, cb){

Expand Down
12 changes: 8 additions & 4 deletions test/lib/analysis.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@ module.exports.normalize = function(test, common) {
// Punctuation substitutions
assert( 'Straße', [ 'strasse' ] );
assert( 'Jǿ œ̆', [ 'jo oe' ] );
assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] );
assert( 'z︠h︡ovkva', [ 'zhovkva' ] );
assert( 'Žovkva', [ 'zovkva' ] );
assert( 'Żółkiew', [ 'zolkiew' ] );
assert( 'Trinidad & Tobago', [ 'trinidad and tobago' ] );

// Tests to confirm the order of function execution
// see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570
test( 'order of execution', function(t) {
t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'i̇nceyol' ] );
t.equal( analysis.normalize( 'İnceyol' )[0].length, 8 );
t.equal( analysis.normalize( 'İ' )[0].length, 2 );
test('order of execution', function(t) {
t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] );
t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 );
t.equal( analysis.normalize( 'İ' )[0].length, 1 );
t.end();
});

Expand Down
2 changes: 1 addition & 1 deletion test/prototype/tokenize_integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ module.exports.tokenize = function(test, util) {

assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]);
assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]);
assert('ケープタウン 南アフリカ', [['ケープタウン', '南アフリカ']]);
assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]);

// duplicates
assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]);
Expand Down

0 comments on commit 8d21f63

Please sign in to comment.