From 317beae045f3eb249ae864ad2dfa4b0213cd7405 Mon Sep 17 00:00:00 2001 From: Benjamin Coe Date: Fri, 25 Aug 2017 16:04:32 -0700 Subject: [PATCH 1/5] feat: search for similar packages --- README.md | 21 ++ index.js | 1 + lib/searchSimilar.js | 62 +++++ test/fixtures/search-similar.json | 429 ++++++++++++++++++++++++++++++ test/spec/searchSimilar.js | 24 ++ test/test.js | 1 + 6 files changed, 538 insertions(+) create mode 100644 lib/searchSimilar.js create mode 100644 test/fixtures/search-similar.json create mode 100644 test/spec/searchSimilar.js diff --git a/README.md b/README.md index 4560870..e498bbc 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,27 @@ Available options: - `size`: The total number of results to return, defaults to `25` +#### .search.similar(q, esClient, [options]) -> Promise + +Perform a fuzzy search for similarly named packages. + +Results are ranked based on a combination of analyzer weightings (`quality`, `popularity`, `maintenance`) and the `_score` returned by the [fuzzy match](https://www.elastic.co/guide/en/elasticsearch/guide/current/fuzzy-match-query.html). + +```js +const queries = require('@npms/queries'); + +// ... +queries.search.similar('chaik', esClient) +.then(results => { + // perhaps we were instead looking for chalk? +}); +``` + +Available options: + +- `size`: The total number of results to return, defaults to `25` +- `analyzerWeight`: How much should we weight the analyzer values by? +- `scoreWeight`: How much should we weight the fuzzy score by? 
## Tests diff --git a/index.js b/index.js index ea10151..cd84bcc 100644 --- a/index.js +++ b/index.js @@ -2,3 +2,4 @@ module.exports.search = require('./lib/search'); module.exports.search.suggestions = require('./lib/searchSuggestions'); +module.exports.search.similar = require('./lib/searchSimilar'); diff --git a/lib/searchSimilar.js b/lib/searchSimilar.js new file mode 100644 index 0000000..4786b81 --- /dev/null +++ b/lib/searchSimilar.js @@ -0,0 +1,62 @@ +'use strict'; + +const pick = require('lodash/pick'); +const parseQuery = require('./util/parseSearchQuery'); +const toEsClient = require('./util/toEsClient'); + +function searchSimilar(q, esClient, options) { + esClient = toEsClient(esClient); + options = Object.assign({ + size: 5, + minScore: 6.0, + analyzerWeight: 2.2, + scoreWeight: 1.5 + }, options); + + const text = parseQuery.discardQualifiers(q); + const script = `(doc["score.final"].value * ${options.analyzerWeight}) * (_score * ${options.scoreWeight})`; + + if (!text) { + return Promise.resolve([]); + } + + return Promise.resolve(esClient.search({ + /* eslint camelcase: 0 */ + index: 'npms-current', + type: 'score', + body: { + size: options.size, + query: { + function_score: { + min_score: options.minScore, + boost_mode: 'replace', + query: { + fuzzy: { + 'package.name.raw': { + value: text + } + } + }, + script_score: { + lang: 'groovy', + script: script, + params: {}, + }, + }, + }, + }, + })) + .then((res) => res.hits.hits.map((hit) => { + // We can't use _fields in the query because the JSON properties order get messed up, + // see https://github.com/elastic/elasticsearch/issues/17639 + // So we filter the source fields manually with pick().. 
this is not ideal since there's payload + // navigating through the network that we do not use, but it's definitively better than having order messed up + const result = pick(hit._source, ['package', 'flags', 'score']); + + result.searchScore = hit._score; + + return result; + })); +} + +module.exports = searchSimilar; diff --git a/test/fixtures/search-similar.json b/test/fixtures/search-similar.json new file mode 100644 index 0000000..f32979f --- /dev/null +++ b/test/fixtures/search-similar.json @@ -0,0 +1,429 @@ +[ + { + "scope": "http://127.0.0.1:9200", + "method": "POST", + "path": "/npms-current/score/_search", + "body": { + "size": 5, + "query": { + "function_score": { + "min_score": 6, + "boost_mode": "replace", + "query": { + "fuzzy": { + "package.name.raw": { + "value": "chaik" + } + } + }, + "script_score": { + "lang": "groovy", + "script": "(doc[\"score.final\"].value * 2.2) * (_score * 1.5)", + "params": {} + } + } + } + }, + "status": 200, + "response": { + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "hits": { + "total": 5, + "max_score": 15.782807, + "hits": [ + { + "_index": "npms-1503693256008", + "_type": "score", + "_id": "chalk", + "_score": 15.782807, + "_source": { + "package": { + "name": "chalk", + "scope": "unscoped", + "version": "2.1.0", + "description": "Terminal string styling done right", + "keywords": [ + "color", + "colour", + "colors", + "terminal", + "console", + "cli", + "string", + "str", + "ansi", + "style", + "styles", + "tty", + "formatting", + "rgb", + "256", + "shell", + "xterm", + "log", + "logging", + "command-line", + "text" + ], + "date": "2017-08-07T03:56:43.217Z", + "links": { + "npm": "https://www.npmjs.com/package/chalk", + "homepage": "https://github.com/chalk/chalk#readme", + "repository": "https://github.com/chalk/chalk", + "bugs": "https://github.com/chalk/chalk/issues" + }, + "publisher": { + "username": "qix", + "email": "i.am.qix@gmail.com" + }, + 
"maintainers": [ + { + "username": "qix", + "email": "i.am.qix@gmail.com" + }, + { + "username": "unicorn", + "email": "sindresorhus+unicorn@gmail.com" + }, + { + "username": "sindresorhus", + "email": "sindresorhus@gmail.com" + } + ] + }, + "evaluation": { + "quality": { + "carefulness": 0.9199999999999999, + "tests": 1, + "health": 1, + "branding": 0.3 + }, + "popularity": { + "communityInterest": 7636, + "downloadsCount": 27118043, + "downloadsAcceleration": 111170.7437785388, + "dependentsCount": 17781 + }, + "maintenance": { + "releasesFrequency": 0.9024828767123287, + "commitsFrequency": 0.9481369863013699, + "openIssues": 1, + "issuesDistribution": 0.9240756549484997 + } + }, + "score": { + "final": 0.9656039095437992, + "detail": { + "quality": 0.9846826146254087, + "popularity": 0.9149637459682334, + "maintenance": 0.9998908973351281 + } + } + } + }, + { + "_index": "npms-1503693256008", + "_type": "score", + "_id": "chai", + "_score": 13.830539, + "_source": { + "package": { + "name": "chai", + "scope": "unscoped", + "version": "4.1.1", + "description": "BDD/TDD assertion library for node.js and the browser. 
Test framework agnostic.", + "keywords": [ + "test", + "assertion", + "assert", + "testing", + "chai" + ], + "date": "2017-08-05T07:33:06.266Z", + "links": { + "npm": "https://www.npmjs.com/package/chai", + "homepage": "http://chaijs.com", + "repository": "https://github.com/chaijs/chai", + "bugs": "https://github.com/chaijs/chai/issues" + }, + "author": { + "name": "Jake Luer", + "email": "jake@alogicalparadox.com" + }, + "publisher": { + "username": "chaijs", + "email": "chaijs@keithcirkel.co.uk" + }, + "maintainers": [ + { + "username": "chaijs", + "email": "chaijs@keithcirkel.co.uk" + } + ] + }, + "evaluation": { + "quality": { + "carefulness": 0.8699999999999999, + "tests": 0.9535, + "health": 0.75, + "branding": 1 + }, + "popularity": { + "communityInterest": 5333, + "downloadsCount": 4535616, + "downloadsAcceleration": 15365.191191019789, + "dependentsCount": 2614 + }, + "maintenance": { + "releasesFrequency": 1, + "commitsFrequency": 0.9582260273972603, + "openIssues": 1, + "issuesDistribution": 0.5310499773121844 + } + }, + "score": { + "final": 0.9025737045873155, + "detail": { + "quality": 0.9694625894211915, + "popularity": 0.7691369367028366, + "maintenance": 0.9786771426141864 + } + } + } + }, + { + "_index": "npms-1503693256008", + "_type": "score", + "_id": "chaid", + "_score": 10.885196, + "_source": { + "package": { + "name": "chaid", + "scope": "unscoped", + "version": "1.0.2", + "description": "Id equality assertions for chai", + "keywords": [ + "chai", + "chai-plugin", + "browser", + "plugin", + "id", + "equality", + "assertion", + "mongo", + "mongodb", + "objectid" + ], + "date": "2015-11-19T03:56:05.397Z", + "links": { + "npm": "https://www.npmjs.com/package/chaid", + "homepage": "https://github.com/hurrymaplelad/chaid", + "repository": "https://github.com/hurrymaplelad/chaid", + "bugs": "https://github.com/hurrymaplelad/chaid/issues" + }, + "author": { + "name": "Adam Hull", + "email": "adam@hmlad.com", + "username": "hurrymaplelad" + }, + 
"publisher": { + "username": "hurrymaplelad", + "email": "adam@hmlad.com" + }, + "maintainers": [ + { + "username": "hurrymaplelad", + "email": "adam@hmlad.com" + } + ] + }, + "evaluation": { + "quality": { + "carefulness": 0.9199999999999999, + "tests": 0.85, + "health": 1, + "branding": 0.3 + }, + "popularity": { + "communityInterest": 4, + "downloadsCount": 889.6666666666666, + "downloadsAcceleration": 5.921404109589042, + "dependentsCount": 1 + }, + "maintenance": { + "releasesFrequency": 0.9, + "commitsFrequency": 0.9, + "openIssues": 0.9, + "issuesDistribution": 0.9 + } + }, + "score": { + "final": 0.6659643667895766, + "detail": { + "quality": 0.9631048526880389, + "popularity": 0.07747243749939547, + "maintenance": 0.999764451023933 + } + } + } + }, + { + "_index": "npms-1503693256008", + "_type": "score", + "_id": "chaik", + "_score": 10.582002, + "_source": { + "package": { + "name": "chaik", + "scope": "unscoped", + "version": "0.0.4", + "description": "can be used in unittest to compare json data format or to check existance in db", + "keywords": [ + "unittest", + "check data existance in database", + "compare json structure" + ], + "date": "2017-02-08T03:30:47.260Z", + "links": { + "npm": "https://www.npmjs.com/package/chaik", + "homepage": "https://github.com/dragon753/chaik#readme", + "repository": "https://github.com/dragon753/chaik", + "bugs": "https://github.com/dragon753/chaik/issues" + }, + "author": { + "name": "dragon753" + }, + "publisher": { + "username": "dragon753", + "email": "dragon753@gmail.com" + }, + "maintainers": [ + { + "username": "dragon753", + "email": "dragon753@gmail.com" + } + ] + }, + "flags": { + "unstable": true + }, + "evaluation": { + "quality": { + "carefulness": 0.355, + "tests": 0.6, + "health": 0.75, + "branding": 0 + }, + "popularity": { + "communityInterest": 2, + "downloadsCount": 24.666666666666668, + "downloadsAcceleration": 0.19429223744292237, + "dependentsCount": 0 + }, + "maintenance": { + 
"releasesFrequency": 0.3698630136986302, + "commitsFrequency": 0.17907534246575343, + "openIssues": 0.7, + "issuesDistribution": 0.7 + } + }, + "score": { + "final": 0.5179318816289368, + "detail": { + "quality": 0.6951480509761432, + "popularity": 0.02262137581147453, + "maintenance": 0.861342813720222 + } + } + } + }, + { + "_index": "npms-1503693256008", + "_type": "score", + "_id": "chain", + "_score": 6.3325176, + "_source": { + "package": { + "name": "chain", + "scope": "unscoped", + "version": "0.1.3", + "description": "A microframework for handling async JS", + "keywords": [ + "async", + "asynchronous", + "events", + "parallel" + ], + "date": "2011-02-19T01:00:17.936Z", + "links": { + "npm": "https://www.npmjs.com/package/chain", + "homepage": "http://github.com/chriso/chain.js", + "repository": "https://github.com/chriso/chain.js", + "bugs": "http://github.com/chriso/chain.js/issues" + }, + "author": { + "name": "Chris O'Hara", + "email": "cohara87@gmail.com", + "username": "cohara87" + }, + "publisher": { + "username": "cohara87", + "email": "cohara87@gmail.com" + }, + "maintainers": [ + { + "username": "cohara87", + "email": "cohara87@gmail.com" + } + ] + }, + "flags": { + "unstable": true + }, + "evaluation": { + "quality": { + "carefulness": 0.165, + "tests": 0.3, + "health": 1, + "branding": 0 + }, + "popularity": { + "communityInterest": 159, + "downloadsCount": 178.33333333333334, + "downloadsAcceleration": -1.7720509893455099, + "dependentsCount": 1 + }, + "maintenance": { + "releasesFrequency": 0.06164383561643837, + "commitsFrequency": 0, + "openIssues": 1, + "issuesDistribution": 1 + } + }, + "score": { + "final": 0.38742815535645925, + "detail": { + "quality": 0.5225684799071498, + "popularity": 0.11070544436870669, + "maintenance": 0.54831630244362 + } + } + } + } + ] + } + }, + "rawHeaders": [ + "Content-Type", + "application/json; charset=UTF-8", + "Content-Length", + "6520" + ] + } +] diff --git a/test/spec/searchSimilar.js 
b/test/spec/searchSimilar.js new file mode 100644 index 0000000..f1baeaf --- /dev/null +++ b/test/spec/searchSimilar.js @@ -0,0 +1,24 @@ +'use strict'; + +const expect = require('chai').expect; +const elasticsearch = require('elasticsearch'); +const nockBack = require('nock').back; +const queries = require('../../'); + +const localEsClient = new elasticsearch.Client({ host: '127.0.0.1:9200', log: null, apiVersion: '2.4' }); + +describe('search.similar()', () => { + it('should return the desired results', () => { + let nockDone; + + nockBack('search-similar.json', (_nockDone) => { nockDone = _nockDone; }); + + return queries.search.similar('chaik', localEsClient) + .then((similar) => { + expect(similar).to.have.length(5); + expect(similar[0]).to.contain.all.keys('package', 'score', 'searchScore'); + expect(similar[0].package.name).to.equal('chalk'); + nockDone(); + }) + }); +}); diff --git a/test/test.js b/test/test.js index c01f761..08dcc43 100644 --- a/test/test.js +++ b/test/test.js @@ -9,3 +9,4 @@ nockBack.fixtures = `${__dirname}/fixtures`; require('./spec/util/parseSearchQuery'); require('./spec/search'); require('./spec/searchSuggestions'); +require('./spec/searchSimilar'); From 2105d6fcaaaa7038d4e698771ddc2f22825e7790 Mon Sep 17 00:00:00 2001 From: Benjamin Coe Date: Fri, 25 Aug 2017 16:07:21 -0700 Subject: [PATCH 2/5] docs: we actually return 5 results --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e498bbc..ee650db 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ queries.search.similar('chaik', esClient) Available options: -- `size`: The total number of results to return, defaults to `25` +- `size`: The total number of results to return, defaults to `5` - `analyzerWeight`: How much should we weight the analyzer values by? - `scoreWeight`: How much should we weight the fuzzy score by? 
From 9b84fedb8f7d3849bcaded42f3e850eeccafb14b Mon Sep 17 00:00:00 2001 From: Benjamin Coe Date: Fri, 25 Aug 2017 16:16:15 -0700 Subject: [PATCH 3/5] docs: document the default values. --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ee650db..3f1567a 100644 --- a/README.md +++ b/README.md @@ -91,9 +91,13 @@ queries.search.similar('chaik', esClient) Available options: -- `size`: The total number of results to return, defaults to `5` -- `analyzerWeight`: How much should we weight the analyzer values by? -- `scoreWeight`: How much should we weight the fuzzy score by? +- `size`: The total number of results to return, defaults to `5`. +- `analyzerWeight`: How much should we weight the analyzer values by? Defaults to `2.2`. +- `scoreWeight`: How much should we weight the fuzzy score by? Defaults to `1.5`. +- `minScore`: The minimum combined score a package must reach to be included in the results, defaults to `6.0`. + +_the above default values were based on trial and error examining the + top npm modules, they will likely change over time._ ## Tests From 7c0d7263c8a7d2d4d88b2b251262a3ad42b5e7b9 Mon Sep 17 00:00:00 2001 From: Benjamin Coe Date: Fri, 25 Aug 2017 16:49:33 -0700 Subject: [PATCH 4/5] chore: playing with defaults a bit more based on JSONStream example --- README.md | 4 ++-- lib/searchSimilar.js | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3f1567a..f80ba88 100644 --- a/README.md +++ b/README.md @@ -91,10 +91,10 @@ queries.search.similar('chaik', esClient) Available options: -- `size`: The total number of results to return, defaults to `5`. +- `size`: The total number of results to return, defaults to `10`. - `analyzerWeight`: How much should we weight the analyzer values by? Defaults to `2.2`. - `scoreWeight`: How much should we weight the fuzzy score by? Defaults to `1.5`. -- `minScore`: The minimum combined score a package must reach to be included in the results, defaults to `6.0`. +- `minScore`: The minimum combined score a package must reach to be included in the results, defaults to `4.5`. 
_the above default values were based on trial and error examining the top npm modules, they will likely change over time._ diff --git a/lib/searchSimilar.js b/lib/searchSimilar.js index 4786b81..71172dc 100644 --- a/lib/searchSimilar.js +++ b/lib/searchSimilar.js @@ -7,8 +7,8 @@ const toEsClient = require('./util/toEsClient'); function searchSimilar(q, esClient, options) { esClient = toEsClient(esClient); options = Object.assign({ - size: 5, - minScore: 6.0, + size: 10, + minScore: 4.5, analyzerWeight: 2.2, scoreWeight: 1.5 }, options); From 745d47a23906c0fcaf5012f73755eb9934a5bbee Mon Sep 17 00:00:00 2001 From: Benjamin Coe Date: Fri, 25 Aug 2017 20:38:54 -0700 Subject: [PATCH 5/5] fix: limits in fixture --- test/fixtures/search-similar.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/fixtures/search-similar.json b/test/fixtures/search-similar.json index f32979f..819b4d3 100644 --- a/test/fixtures/search-similar.json +++ b/test/fixtures/search-similar.json @@ -4,10 +4,10 @@ "method": "POST", "path": "/npms-current/score/_search", "body": { - "size": 5, + "size": 10, "query": { "function_score": { - "min_score": 6, + "min_score": 4.5, "boost_mode": "replace", "query": { "fuzzy": {