Skip to content

Commit 2b6edfc

Browse files
authored
feat(fst): add fst data structure (#17)
1 parent 1a3397a commit 2b6edfc

File tree

4 files changed

+424
-1
lines changed

4 files changed

+424
-1
lines changed

tokenization/FST.js

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/**
2+
* FST - Finite-state transducer data structure
3+
*
4+
* A graph structure which is very efficient for querying prefix (or suffix) matches.
5+
*
6+
* see: https://www.elastic.co/blog/you-complete-me
7+
*/
8+
9+
const Graph = require('./Graph')
10+
// ASCII end-of-text control character (code point 3); used as the key of the
// terminal edge which marks the end of an indexed token
const ETX = '\u0003'
11+
12+
/**
 * FST - finite-state transducer style index over tokens.
 *
 * Two graphs are maintained:
 *  - `head` is walked left-to-right using edges keyed `>` + character
 *  - `tail` is walked right-to-left using edges keyed `<` + character
 *
 * Every intermediate node carries a `_meta` record with a `>count` and a
 * `<count`; the counter for a direction is incremented each time a token
 * is indexed through that node and decremented when one is removed.
 * The end of a token is marked by an edge keyed direction + ETX which points
 * at the opposite root (`tail` for `>`, `head` for `<`).
 *
 * Tokens are indexed per Unicode code point. Note that a token containing the
 * ETX character itself would produce an edge key identical to the terminal
 * edge key.
 */
class FST {
  constructor () {
    this.head = new Graph()
    this.tail = new Graph()
  }

  /**
   * Add a new token to the index; a token already present is left untouched
   * so that the per-node counters are not incremented twice.
   *
   * @param {string} token
   */
  add (token) {
    if (this.has(token)) { return }
    this._index(this._split(token))
  }

  /**
   * Remove a token from the index.
   *
   * @param {string} token
   */
  delete (token) {
    this._deindex(this._split(token))
  }

  /**
   * Test whether the index contains exactly this token.
   *
   * @param {string} token
   * @returns {boolean}
   */
  has (token) {
    const node = this._walk(this.head, '>', this._split(token))
    return !!node && !!node.length(`>${ETX}`)
  }

  /**
   * Test whether the index contains a token which starts with this prefix
   * and continues beyond it.
   *
   * When the prefix is itself a complete token (its node has a terminal edge)
   * the result is only truthy if more than one token passes through the node.
   *
   * NOTE(review): an empty prefix resolves to the root node, which this class
   * never gives a `>ETX` edge, so the result looks truthy even when nothing
   * has been indexed - confirm whether callers rely on that.
   *
   * @param {string} prefix
   * @returns {boolean}
   */
  hasPrefix (prefix) {
    const node = this._walk(this.head, '>', this._split(prefix))
    return !!node && (!node.findOne(`>${ETX}`) || node.findOne('_meta')['>count'] > 1)
  }

  /**
   * Test whether the index contains a token which ends with this suffix
   * and is longer than it. Mirror image of hasPrefix, using the tail graph.
   *
   * @param {string} suffix
   * @returns {boolean}
   */
  hasSuffix (suffix) {
    const node = this._walk(this.tail, '<', this._split(suffix).reverse())
    return !!node && (!node.findOne(`<${ETX}`) || node.findOne('_meta')['<count'] > 1)
  }

  /**
   * Split a token in to characters.
   *
   * Array.from iterates by Unicode code point, so characters outside the BMP
   * are kept whole instead of being cut in to two surrogate halves.
   * Anything which is not a string (null, undefined, numbers...) yields an
   * empty list rather than throwing.
   *
   * @param {string} token
   * @returns {string[]}
   */
  _split (token) {
    if (typeof token !== 'string') { return [] }
    return Array.from(token)
  }

  /**
   * Walk the graph one character at a time & return the last node reached.
   *
   * @param {Graph} parent - node to start walking from
   * @param {string} direction - '<' to follow '<' edges, anything else means '>'
   * @param {string[]} chars - characters to follow, in walking order
   * @param {boolean} [create] - when strictly `true`, missing nodes are
   *   created, the direction counter of every visited node is incremented and
   *   the terminal edge is added to the last node
   * @param {function} [each] - called as each(child, parent, char) per step
   * @returns {Graph|null|undefined} the last node, `parent` itself when
   *   `chars` is empty, or the falsy lookup result when the path is missing
   */
  _walk (parent, direction, chars, create, each) {
    const DIR = (direction === '<') ? '<' : '>'
    // the terminal edge points at the root of the opposite graph
    const END = (DIR === '<') ? this.head : this.tail
    const LAST = (chars.length - 1)

    for (let i = 0; i < chars.length; i++) {
      const edge = `${DIR}${chars[i]}`
      let child = parent.findOne(edge)

      if (create === true) {
        // create new graph node
        if (!child) {
          child = new Graph()
          child.add('_meta', { '>count': 0, '<count': 0 })
          parent.add(edge, child)
        }

        // update count
        child.findOne('_meta')[`${DIR}count`]++

        // create final edge
        if (i === LAST && !child.length(`${DIR}${ETX}`)) {
          child.add(`${DIR}${ETX}`, END)
        }
      }

      // path does not exist
      if (!child) { return child }

      if (typeof each === 'function') {
        each(child, parent, chars[i])
      }
      parent = child
    }

    return parent
  }

  /**
   * Dump the FST to stderr, one line per visit formatted as
   * `[count] path`, eg:
   * [2] >e>x>a>m
   *
   * @param {Graph} node - graph to start from (eg. this.head)
   * @param {string} [direction] - '>' (default) or '<'
   */
  print (node, direction) {
    this._recurse(node, direction, (path, count) => {
      console.error(`[${count}] ${path}`)
    })
  }

  /**
   * Depth-first traversal of the edges matching `direction`.
   *
   * `each(prefix, count)` is invoked inside the edge loop, so a node is
   * reported once per matching outgoing edge rather than once per node.
   *
   * @param {Graph} node - node to traverse from
   * @param {string} [direction] - defaults to '>'
   * @param {function} each - called as each(path, count)
   * @param {string} [prefix] - concatenated edge keys leading to `node`
   */
  _recurse (node, direction, each, prefix) {
    if (!direction) { direction = '>' }
    if (!prefix) { prefix = '' }

    for (const key in node.edges) {
      // only follow non-empty edges for the requested direction
      if (key[0] !== direction) { continue }
      if (!node.length(key)) { continue }

      if (node !== this.head && node !== this.tail) {
        const count = node.findOne('_meta')[`${direction}count`]
        each(prefix, count)
      } else if (prefix.length > 0) {
        // arrived at a root via a terminal edge; stop here
        return
      }

      this._recurse(node.findOne(key), direction, each, prefix + key)
    }
  }

  /**
   * Walk both graphs & add the characters to them.
   *
   * @param {string[]} chars - characters in left-to-right order (not modified)
   */
  _index (chars) {
    this._walk(this.head, '>', chars, true)
    // reverse a copy so the caller's array keeps its order
    this._walk(this.tail, '<', chars.slice().reverse(), true)
  }

  /**
   * Walk both graphs & remove the characters from them.
   *
   * For each direction the terminal edge is removed first; only when that
   * removal reports success are the counters decremented along the path, and
   * any node whose counter drops below 1 is detached from its parent.
   *
   * @param {string[]} chars - characters in left-to-right order (not modified)
   */
  _deindex (chars) {
    // left-to-right
    let node = this._walk(this.head, '>', chars)
    if (node && node.remove(`>${ETX}`, this.tail)) {
      this._walk(this.head, '>', chars, false, (child, parent, char) => {
        const meta = child.findOne('_meta')
        if (meta && --meta['>count'] < 1) {
          parent.remove(`>${char}`, child)
        }
      })
    }

    // right-to-left
    const reversed = chars.slice().reverse()
    node = this._walk(this.tail, '<', reversed)
    if (node && node.remove(`<${ETX}`, this.head)) {
      this._walk(this.tail, '<', reversed, false, (child, parent, char) => {
        const meta = child.findOne('_meta')
        if (meta && --meta['<count'] < 1) {
          parent.remove(`<${char}`, child)
        }
      })
    }
  }
}
149+
150+
// CommonJS export: the FST class is the sole export of this module
module.exports = FST

0 commit comments

Comments
 (0)