/
index.ts
152 lines (127 loc) · 4.26 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import { createError } from '../../errors.js'
import { Stemmer, Tokenizer, DefaultTokenizerConfig } from '../../types.js'
import { replaceDiacritics } from './diacritics.js'
import { Language, SPLITTERS, SUPPORTED_LANGUAGES } from './languages.js'
import { stemmer as english } from './english-stemmer.js'
/**
 * Shape of the tokenizer object assembled by `createTokenizer`: the base
 * `Tokenizer` contract plus the state that `tokenize` and `normalizeToken`
 * read through `this`.
 */
interface DefaultTokenizer extends Tokenizer {
  /** Language the tokenizer was created for; `tokenize` rejects any other language. */
  language: Language
  /** Stemming function applied by `normalizeToken`; undefined when stemming is disabled. */
  stemmer?: Stemmer
  /** Property names whose values `tokenize` normalizes as one token instead of splitting. */
  tokenizeSkipProperties: Set<string>
  /** Property names for which `normalizeToken` skips the stemmer. */
  stemmerSkipProperties: Set<string>
  /** Tokens that normalize to the empty string; undefined when stop-word removal is disabled. */
  stopWords?: string[]
  /** When false, `tokenize` removes repeated tokens from its result. */
  allowDuplicates: boolean
  /** Memoized normalization results, keyed by `language:prop:token`. */
  normalizationCache: Map<string, string>
  /**
   * Normalizes a single token for the given property.
   *
   * The parameter order is `(prop, token)`: this is the order the
   * implementation declares and the order `tokenize` relies on when it
   * pre-binds `prop` and lets `Array.prototype.map` supply the token.
   */
  normalizeToken(this: DefaultTokenizer, prop: string, token: string): string
}
/**
 * Normalizes one token for a given property, memoizing the result.
 *
 * Stop words normalize to the empty string. Any other token is stemmed
 * (when a stemmer is configured and `prop` is not in
 * `stemmerSkipProperties`) and then passed through `replaceDiacritics`.
 *
 * @param prop - Name of the property the token belongs to; part of the cache key.
 * @param token - The token to normalize.
 * @returns The normalized token, or `''` when the token is a stop word.
 */
function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
  const cacheKey = `${this.language}:${prop}:${token}`
  const cached = this.normalizationCache.get(cacheKey)
  if (cached !== undefined) {
    return cached
  }

  // Stop words short-circuit to '' and never reach the stemmer.
  let normalized = ''
  const isStopWord = this.stopWords?.includes(token) ?? false
  if (!isStopWord) {
    const stemmed = this.stemmer && !this.stemmerSkipProperties.has(prop) ? this.stemmer(token) : token
    normalized = replaceDiacritics(stemmed)
  }

  this.normalizationCache.set(cacheKey, normalized)
  return normalized
}
/**
 * Strips empty-string entries from both ends of `text`.
 *
 * The array is modified in place and the same array instance is returned;
 * empty strings between non-empty entries are left untouched.
 *
 * @param text - Token list to trim.
 * @returns The same `text` array, without leading or trailing `''` entries.
 */
/* c8 ignore next 10 */
function trim(text: string[]): string[] {
  let end = text.length
  while (end > 0 && text[end - 1] === '') end--
  text.length = end
  let start = 0
  while (start < end && text[start] === '') start++
  text.splice(0, start)
  return text
}
/**
 * Splits `input` into normalized tokens.
 *
 * Values of properties listed in `tokenizeSkipProperties` are not split or
 * lower-cased: the whole input is normalized as a single token. Otherwise the
 * input is lower-cased, split with the language's splitter, and every piece
 * is normalized; pieces that normalize to an empty string are dropped.
 *
 * @param input - Text to tokenize. Non-string values are returned wrapped in an array.
 * @param language - Optional language; must equal the tokenizer's own language.
 * @param prop - Optional name of the property the text comes from.
 * @returns The tokens, deduplicated unless `allowDuplicates` is set.
 * @throws A `LANGUAGE_NOT_SUPPORTED` error when `language` differs from the tokenizer's language.
 */
function tokenize(this: DefaultTokenizer, input: string, language?: string, prop?: string): string[] {
  if (language && language !== this.language) {
    throw createError('LANGUAGE_NOT_SUPPORTED', language)
  }

  /* c8 ignore next 3 */
  if (typeof input !== 'string') {
    return [input]
  }

  const propName = prop ?? ''
  const normalize = (token: string): string => this.normalizeToken(propName, token)
  const keepWhole = prop ? this.tokenizeSkipProperties.has(prop) : false

  const rawTokens: string[] = keepWhole
    ? [normalize(input)]
    : input.toLowerCase().split(SPLITTERS[this.language]).map(normalize).filter(Boolean)

  // The single-token path can yield [''] (e.g. a stop word); trim clears it.
  const tokens = trim(rawTokens)
  return this.allowDuplicates ? tokens : Array.from(new Set(tokens))
}
/**
 * Builds the default tokenizer from an optional configuration.
 *
 * - Language: defaults to `'english'` when `config.language` is missing or
 *   falsy; otherwise it must be listed in `SUPPORTED_LANGUAGES`.
 * - Stemming: disabled by default. It is enabled when `config.stemming` is
 *   truthy, or when `config.stemmer` is given and the `stemming` key is absent
 *   from `config`. Without a custom stemmer only English has a bundled one.
 * - Stop words: `false` disables removal; an array is used as given; a
 *   function is awaited with an empty array and must resolve to an array of
 *   strings; with no setting an empty list is used.
 * - `stemmerSkipProperties` / `tokenizeSkipProperties` accept a single value
 *   or an array.
 *
 * The `config` object passed by the caller is not modified.
 *
 * @param config - Tokenizer options; all fields are optional.
 * @returns The tokenizer, with `tokenize` bound to the returned object.
 * @throws `LANGUAGE_NOT_SUPPORTED` when the language is not supported.
 * @throws `INVALID_STEMMER_FUNCTION_TYPE` when `config.stemmer` is not a function.
 * @throws `MISSING_STEMMER` when stemming is requested for a non-English
 *   language without a custom stemmer.
 * @throws `CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY` when the stop words
 *   are not (or do not resolve to) an array of strings.
 */
export async function createTokenizer(config: DefaultTokenizerConfig = {}): Promise<DefaultTokenizer> {
  // Resolve the language into a local rather than writing the default back
  // onto the caller's configuration object.
  const language = config.language || 'english'
  if (config.language && !SUPPORTED_LANGUAGES.includes(config.language)) {
    throw createError('LANGUAGE_NOT_SUPPORTED', config.language)
  }

  // Handle stemming - It is disabled by default
  let stemmer: Stemmer | undefined
  if (config.stemming || (config.stemmer && !('stemming' in config))) {
    if (config.stemmer) {
      if (typeof config.stemmer !== 'function') {
        throw createError('INVALID_STEMMER_FUNCTION_TYPE')
      }
      stemmer = config.stemmer
    } else if (language === 'english') {
      stemmer = english
    } else {
      throw createError('MISSING_STEMMER', language)
    }
  }

  // Handle stopwords
  let stopWords: string[] | undefined
  if (config.stopWords !== false) {
    stopWords = []
    if (Array.isArray(config.stopWords)) {
      stopWords = config.stopWords
    } else if (typeof config.stopWords === 'function') {
      stopWords = await config.stopWords(stopWords)
    } else if (config.stopWords) {
      throw createError('CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY')
    }

    // A user-supplied function may resolve to anything: make sure stopWords
    // is just an array of strings.
    if (!Array.isArray(stopWords)) {
      throw createError('CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY')
    }
    for (const s of stopWords) {
      if (typeof s !== 'string') {
        throw createError('CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY')
      }
    }
  }

  // Create the tokenizer
  const tokenizer: DefaultTokenizer = {
    tokenize,
    language,
    stemmer,
    stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []),
    tokenizeSkipProperties: new Set(config.tokenizeSkipProperties ? [config.tokenizeSkipProperties].flat() : []),
    stopWords,
    allowDuplicates: Boolean(config.allowDuplicates),
    normalizeToken,
    normalizationCache: new Map(),
  }

  // tokenize reads its state through `this`, so pin it to this instance; that
  // lets callers detach and pass the function around safely.
  tokenizer.tokenize = tokenize.bind(tokenizer)

  return tokenizer
}