Skip to content

Commit

Permalink
Decouple serialization format from internal index data structure
Browse files Browse the repository at this point in the history
Decoupling the inner implementation of the index data structure from the
serialization format makes it possible to optimize and improve the
internals without introducing backward incompatible changes to the
serialization.

Also, the new format is easier to generate in other languages, as it
does not depend on the radix tree implementation, but only on the
indexing pipeline.

Loading a serialized index now throws an error if it was created with an incompatible serialization format.
  • Loading branch information
lucaong committed Feb 17, 2022
1 parent da5666a commit 44b6ee2
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 57 deletions.
20 changes: 19 additions & 1 deletion src/MiniSearch.test.js
Expand Up @@ -875,10 +875,19 @@ e forse del mio dir poco ti cale`
const options = { fields: ['title', 'text'], storeFields: ['category'] }
const ms = new MiniSearch(options)
ms.addAll(documents)

const json = JSON.stringify(ms)
const deserialized = MiniSearch.loadJSON(json, options)
expect(ms.search('vita')).toEqual(deserialized.search('vita'))
expect(ms.toJSON()).toEqual(deserialized.toJSON())

const original = ms.toJSON()
const final = deserialized.toJSON()

// Normalize order of data in the serialized index
original.index.sort()
final.index.sort()

expect(original).toEqual(final)
})

it('raises an error if called without options', () => {
Expand All @@ -890,6 +899,15 @@ e forse del mio dir poco ti cale`
MiniSearch.loadJSON(json)
}).toThrowError('MiniSearch: loadJSON should be given the same options used when serializing the index')
})

// Checks that loadJSON refuses serialized data that does not carry the
// expected serialization version.
it('raises an error if given an incompatible serialized version', () => {
const options = { fields: ['title', 'text'] }
// An empty object has no `serializationVersion` property at all
const json = "{}"

expect(() => {
MiniSearch.loadJSON(json, options)
}).toThrowError('MiniSearch: cannot deserialize an index created with an incompatible version')
})
})

describe('getDefault', () => {
Expand Down
93 changes: 37 additions & 56 deletions src/MiniSearch.ts
@@ -1,6 +1,4 @@
import SearchableMap from './SearchableMap/SearchableMap'
import { LEAF } from './SearchableMap/TreeIterator'
import { RadixTree } from './SearchableMap/types'

const OR = 'or'
const AND = 'and'
Expand Down Expand Up @@ -265,14 +263,15 @@ export type SearchResult = {
* @ignore
*/
export type AsPlainObject = {
index: { _tree: {}, _prefix: string },
documentCount: number,
nextId: number,
documentIds: { [shortId: string]: any }
fieldIds: { [fieldName: string]: number }
fieldLength: { [shortId: string]: number[] },
fieldLength: { [shortId: string]: number[] }
averageFieldLength: number[],
storedFields: { [shortId: string]: any }
// List of [term, data] tuples, where data maps a field id (as a string key)
// to the serialized entry for that term in that field
index: [string, { [fieldId: string]: SerializedIndexEntry }][]
// Version number of the serialization format
serializationVersion: number
}

export type QueryCombination = SearchOptions & { queries: Query[] }
Expand Down Expand Up @@ -921,18 +920,35 @@ export default class MiniSearch<T = any> {
fieldIds,
fieldLength,
averageFieldLength,
storedFields
storedFields,
serializationVersion
} = js
if (serializationVersion !== 1) {
throw new Error('MiniSearch: cannot deserialize an index created with an incompatible version')
}

const miniSearch = new MiniSearch(options)

miniSearch._index = new SearchableMap(objectToTree(index._tree), index._prefix)
miniSearch._documentCount = documentCount
miniSearch._nextId = nextId
miniSearch._documentIds = objectToNumericMap(documentIds)
miniSearch._fieldIds = fieldIds
miniSearch._fieldLength = objectToNumericMap(fieldLength)
miniSearch._averageFieldLength = averageFieldLength
miniSearch._storedFields = objectToNumericMap(storedFields)
miniSearch._index = new SearchableMap()

for (const [term, data] of index) {
const dataMap = new Map() as IndexData

for (const fieldId of Object.keys(data)) {
const { df, ds } = data[fieldId]

dataMap.set(parseInt(fieldId, 10), { df, ds: objectToNumericMap(ds) as IndexEntry['ds'] })
}

miniSearch._index.set(term, dataMap)
}

return miniSearch
}
Expand Down Expand Up @@ -1047,15 +1063,28 @@ export default class MiniSearch<T = any> {
* @return A plain-object serializeable representation of the search index.
*/
toJSON (): AsPlainObject {
// Serialized index: one [term, data] tuple is pushed per entry of this._index
const index: [string, { [key: string]: SerializedIndexEntry }][] = []

for (const [term, fieldIndex] of this._index) {
// Plain-object counterpart of fieldIndex, keyed by field id
const data: { [key: string]: SerializedIndexEntry } = {}

for (const [fieldId, { df, ds }] of fieldIndex) {
// ds is turned into a plain object through Object.fromEntries
data[fieldId] = { df, ds: Object.fromEntries(ds) }
}

index.push([term, data])
}

return {
index: { _tree: treeToObject(this._index._tree), _prefix: this._index._prefix },
documentCount: this._documentCount,
nextId: this._nextId,
documentIds: Object.fromEntries(this._documentIds),
fieldIds: this._fieldIds,
fieldLength: Object.fromEntries(this._fieldLength),
averageFieldLength: this._averageFieldLength,
storedFields: Object.fromEntries(this._storedFields)
storedFields: Object.fromEntries(this._storedFields),
index,
// Format version written with every serialized index; deserialization
// throws when this value is not 1
serializationVersion: 1
}
}

Expand Down Expand Up @@ -1321,33 +1350,8 @@ const defaultAutoSuggestOptions = {

const createMap = () => new Map()

// Recursive plain-object shape: each string key maps either to a nested
// TreeLikeObject or to a value of type T
type TreeLikeObject<T = any> = { [key: string]: TreeLikeObject | T }
// Plain-object form of an index entry: a numeric `df` plus a `ds` object
// mapping string keys to numbers
type SerializedIndexEntry = { df: number, ds: { [key: string]: number } }

/**
 * Converts a plain tree-like object into a tree of nested Maps.
 *
 * Every key other than LEAF is treated as a subtree and converted
 * recursively. The value under the LEAF key is converted into a Map keyed by
 * the integer parsed from each of its keys, holding `{ df, ds }` entries whose
 * `ds` is passed through objectToNumericMap.
 *
 * @param object  The plain object to convert
 * @return The equivalent tree made of Maps
 */
const objectToTree = (object: TreeLikeObject): RadixTree<IndexData> => {
  const tree = new Map()

  for (const edge of Object.keys(object)) {
    const child = object[edge]

    // Anything that is not a leaf is a nested subtree
    if (edge !== LEAF) {
      tree.set(edge, objectToTree(child))
      continue
    }

    const fieldIndex = new Map() as IndexData

    for (const fieldId of Object.keys(child)) {
      const entry = child[fieldId]
      const df = entry.df
      const ds = objectToNumericMap(entry.ds) as IndexEntry['ds']

      // Object keys are always strings: restore the numeric field id
      fieldIndex.set(parseInt(fieldId, 10), { df, ds })
    }

    tree.set(edge, fieldIndex)
  }

  return tree
}

const objectToNumericMap = <T>(object: { [key: string]: T }): Map<number, T> => {
const map = new Map()

Expand All @@ -1358,29 +1362,6 @@ const objectToNumericMap = <T>(object: { [key: string]: T }): Map<number, T> =>
return map
}

/**
 * Converts a tree of nested Maps into a plain tree-like object.
 *
 * Every key other than LEAF is treated as a subtree and converted
 * recursively. The value under the LEAF key is iterated as a Map and turned
 * into a plain object of `{ df, ds }` entries, with `ds` converted through
 * Object.fromEntries.
 *
 * @param tree  The tree to convert
 * @return The equivalent plain object
 */
const treeToObject = (tree: RadixTree<IndexData>): TreeLikeObject => {
  const plain: TreeLikeObject = {}

  for (const [edge, child] of tree) {
    // Anything that is not a leaf is a nested subtree
    if (edge !== LEAF) {
      plain[edge] = treeToObject(child as RadixTree<IndexData>)
      continue
    }

    const fields = {} as { [key: string]: SerializedIndexEntry }

    for (const [fieldId, entry] of (child as IndexData).entries()) {
      const df = entry.df
      const ds = Object.fromEntries(entry.ds)

      fields[fieldId] = { df, ds }
    }

    plain[edge] = fields
  }

  return plain
}

// This regular expression matches any Unicode space or punctuation character
// Adapted from https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7BZ%7D%5Cp%7BP%7D&abb=on&c=on&esc=on
const SPACE_OR_PUNCTUATION = /[\n\r -#%-*,-/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]+/u

0 comments on commit 44b6ee2

Please sign in to comment.