Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

re-enable support for custom multi-word synonyms #457

Merged
merged 2 commits into from Aug 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 22 additions & 2 deletions settings.js
Expand Up @@ -34,6 +34,7 @@ function generate(){
"filter": [
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
"admin_synonyms_multiplexer",
"icu_folding",
"word_delimiter",
Expand All @@ -49,6 +50,7 @@ function generate(){
"filter": [
"lowercase",
"trim",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -81,6 +83,7 @@ function generate(){
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -126,6 +129,7 @@ function generate(){
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_street/multiword",
"street_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -227,10 +231,26 @@ function generate(){
// dynamically create filters for all synonym files in the ./synonyms directory.
// each filter is given the same name as the file, paths separators are replaced with
// underscores and the file extension is removed.
_.each(synonyms, (synonym, name) => {
// note: if no synonym entries are present in the list we use an array
// containing an empty space to avoid elasticsearch schema parsing errors.
_.each(synonyms, (entries, name) => {

// same tokenizer regex as above except without comma
// (which is a delimeter within the synonym files)
const tokenizerRegex = new RegExp('[\\s/\\\\-]+');
const singleWordEntries = entries.filter(e => !tokenizerRegex.test(e))
const multiWordEntries = entries.filter(e => tokenizerRegex.test(e))

// generate a filter containing single-word synonyms
settings.analysis.filter[`synonyms/${name}`] = {
"type": "synonym",
"synonyms": !_.isEmpty(synonym) ? synonym : ['']
"synonyms": !_.isEmpty(singleWordEntries) ? singleWordEntries : ['']
};

// generate a filter containing multi-word synonyms
settings.analysis.filter[`synonyms/${name}/multiword`] = {
"type": "synonym",
"synonyms": !_.isEmpty(multiWordEntries) ? multiWordEntries : ['']
};
});

Expand Down
16 changes: 14 additions & 2 deletions synonyms/linter.js
Expand Up @@ -2,6 +2,11 @@ const _ = require('lodash');
const logger = require('pelias-logger').get('schema-synonyms');
const punctuation = require('../punctuation');

// same tokenizer regex as the schema
const TOKENIZER_REGEX = new RegExp('[\\s,/\\\\-]+');
const DEMIMETER_REGEX = /,|=>/g
const REPLACEMENT_REGEX = /=>/

/**
* The synonyms linter attempts to warn the user when making
* common mistakes with synonyms.
Expand All @@ -22,7 +27,7 @@ function linter(synonyms) {
logger.debug(`[line] ${line}`);

// split the lines by delimeter
let tokens = line.split(/,|=>/g).map(t => t.trim());
let tokens = line.split(DEMIMETER_REGEX).map(t => t.trim());

// strip blacklisted punctuation from synonyms
// the 'punctuation.blacklist' contains a list of characters which are
Expand All @@ -41,6 +46,7 @@ function linter(synonyms) {
letterCasing(line, logprefix, tokens);
tokensSanityCheck(line, logprefix, tokens);
multiWordCheck(line, logprefix, tokens);
tokenReplacementCheck(line, logprefix);
// tokenLengthCheck(line, logprefix, tokens);
})
})
Expand Down Expand Up @@ -68,12 +74,18 @@ function tokensSanityCheck(line, logprefix, tokens) {

function multiWordCheck(line, logprefix, tokens) {
_.each(tokens, token => {
if (/\s/.test(token)){
if (TOKENIZER_REGEX.test(token)){
logger.warn(`${logprefix} multi word synonyms may cause issues with phrase queries:`, token);
}
});
}

function tokenReplacementCheck(line, logprefix) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've also added this additional linter warning.

Technically the => syntax is actually still supported (the linter doesn't rewrite the configs) but I would like to heavily discourage its use.

The reason for this is an entry such as road => rd would only enter [ "rd" ] in the index, so when a user queries for [ "road" ] (or any ngram of), ☠️ it will not match ☠️

Advanced users may opt to write it as road => road, rd to overcome this issue but this is equivalent to road, rd.

I think we will probably completely drop support for => at some point in the future because it's very easy to make mistakes with.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking it might actually be useful to use => sometimes. For example, if we have a single letter abbreviation that we want to ensure is not being applied too often.

c, carrer would replace every instance of c with carrer. However, carrer => career, c would not.

As you mentioned we would probably recommend that all tokens on the left hand side of the replacement also appear on the right, to avoid issues with autocomplete jitter.

if (REPLACEMENT_REGEX.test(line)) {
logger.warn(`${logprefix} synonym rule '=>' is not supported, use ',' instead`);
}
}

function tokenLengthCheck(line, logprefix, tokens) {
_.each(tokens, token => {
if (token.length <= 1) {
Expand Down
2 changes: 1 addition & 1 deletion synonyms/place_names/fr.txt
@@ -1,5 +1,5 @@
abbaye, abe
auto-école, autoécole, autoecole
autoécole, autoecole
aéroport, aeroport
bastide, bstd
baston, bast
Expand Down
8 changes: 4 additions & 4 deletions synonyms/streets/en.txt
Expand Up @@ -91,12 +91,12 @@ crest, crst, cst
crief, crf
croft, cft
cross, cs, crss
crossing, crsg, xing, csg, x-ing
crossroad, crd, xroad, x-road, xrd, x-rd
crossing, crsg, xing, csg
crossroad, crd, xroad, xrd
crossroads, xrds
crossway, cowy, crwy, xway, xwy, x-way
crossway, cowy, crwy, xway, xwy
cruiseway, cuwy, crwy
cul-de-sac, culdesac, cds, cusac, csac
culdesac, cds, cusac, csac
curve, cve, crv, crve, curv
cutting, cttg, ctg, cutt
dale, dle
Expand Down
62 changes: 57 additions & 5 deletions test/fixtures/expected.json
Expand Up @@ -30,6 +30,7 @@
"filter": [
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
"admin_synonyms_multiplexer",
"icu_folding",
"word_delimiter",
Expand All @@ -48,6 +49,7 @@
"filter": [
"lowercase",
"trim",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -86,6 +88,7 @@
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -142,6 +145,7 @@
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_street/multiword",
"street_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -218,18 +222,36 @@
""
]
},
"synonyms/custom_admin/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/custom_name": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/custom_name/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/custom_street": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/custom_street/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/directionals": {
"type": "synonym",
"synonyms": [
Expand Down Expand Up @@ -304,6 +326,12 @@
"sud,s"
]
},
"synonyms/directionals/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/personal_titles": {
"type": "synonym",
"synonyms": [
Expand Down Expand Up @@ -500,6 +528,12 @@
"veuve,vve"
]
},
"synonyms/personal_titles/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/place_names": {
"type": "synonym",
"synonyms": [
Expand Down Expand Up @@ -742,7 +776,7 @@
"volcan,vlcn",
"voluntarios,voluntos",
"abbaye,abe",
"auto-école,autoécole,autoecole",
"autoécole,autoecole",
"aéroport,aeroport",
"bastide,bstd",
"baston,bast",
Expand Down Expand Up @@ -819,13 +853,25 @@
"étang,etang"
]
},
"synonyms/place_names/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/punctuation": {
"type": "synonym",
"synonyms": [
"&,and",
"&,und"
]
},
"synonyms/punctuation/multiword": {
"type": "synonym",
"synonyms": [
""
]
},
"synonyms/streets": {
"type": "synonym",
"synonyms": [
Expand Down Expand Up @@ -942,12 +988,12 @@
"crief,crf",
"croft,cft",
"cross,cs,crss",
"crossing,crsg,xing,csg,x-ing",
"crossroad,crd,xroad,x-road,xrd,x-rd",
"crossing,crsg,xing,csg",
"crossroad,crd,xroad,xrd",
"crossroads,xrds",
"crossway,cowy,crwy,xway,xwy,x-way",
"crossway,cowy,crwy,xway,xwy",
"cruiseway,cuwy,crwy",
"cul-de-sac,culdesac,cds,cusac,csac",
"culdesac,cds,cusac,csac",
"curve,cve,crv,crve,curv",
"cutting,cttg,ctg,cutt",
"dale,dle",
Expand Down Expand Up @@ -1639,6 +1685,12 @@
"wl,well",
"wls,wells"
]
},
"synonyms/streets/multiword": {
"type": "synonym",
"synonyms": [
""
]
}
},
"char_filter": {
Expand Down
4 changes: 4 additions & 0 deletions test/settings.js
Expand Up @@ -83,6 +83,7 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) {
t.deepEqual(analyzer.filter, [
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
"admin_synonyms_multiplexer",
"icu_folding",
"word_delimiter",
Expand Down Expand Up @@ -130,6 +131,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
t.deepEqual( analyzer.filter, [
"lowercase",
"trim",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -186,6 +188,7 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) {
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_name/multiword",
"name_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down Expand Up @@ -293,6 +296,7 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) {
"lowercase",
"trim",
"remove_duplicate_spaces",
"synonyms/custom_street/multiword",
"street_synonyms_multiplexer",
"icu_folding",
"remove_ordinals",
Expand Down