forked from blevesearch/bleve
-
Notifications
You must be signed in to change notification settings - Fork 0
/
exception.go
136 lines (122 loc) · 4 KB
/
exception.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back into the token stream. Use
// it to preserve sequences which a regular tokenizer would alter or remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequence to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
package exception
import (
"fmt"
"regexp"
"strings"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// Name is the name under which this package's init function registers the
// tokenizer constructor with the registry.
const Name = "exception"
// ExceptionsTokenizer is a tokenizer that emits every match of a regular
// expression as a single token and hands the input between matches to
// another tokenizer.
type ExceptionsTokenizer struct {
// exception matches the byte sequences that must be kept as one token.
exception *regexp.Regexp
// remaining tokenizes the input not matched by exception.
remaining analysis.Tokenizer
}
// NewExceptionsTokenizer returns an ExceptionsTokenizer that preserves the
// matches of exception as single tokens and passes all other input to
// remaining. Both arguments are stored as given; neither is validated or
// copied.
func NewExceptionsTokenizer(exception *regexp.Regexp, remaining analysis.Tokenizer) *ExceptionsTokenizer {
	tokenizer := new(ExceptionsTokenizer)
	tokenizer.exception = exception
	tokenizer.remaining = remaining
	return tokenizer
}
// Tokenize splits input into a token stream.
//
// Every non-empty match of the exception regular expression becomes exactly
// one token whose Term is the matched bytes (a sub-slice of input, not a
// copy) and whose Start and End are the match's byte offsets in input. The
// stretches of input before, between and after the matches are handed to the
// remaining tokenizer; the tokens it returns are modified in place so that
// their Start and End are relative to input and their Position continues
// after the tokens already emitted.
//
// Zero-length matches (possible with patterns such as "x*") are ignored:
// they carry no text to preserve, and treating them as tokens would emit
// empty terms and split the surrounding text at arbitrary offsets before it
// reaches the remaining tokenizer.
//
// The returned stream is never nil; it is empty when no token is produced.
func (t *ExceptionsTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	currInput := 0 // offset of the first input byte not yet consumed
	lastPos := 0   // running position counter for the tokens emitted so far

	// delegate tokenizes input[currInput:end] with the remaining tokenizer,
	// appends the shifted tokens to rv and advances currInput to end.
	delegate := func(end int) {
		intermediate := t.remaining.Tokenize(input[currInput:end])
		for _, token := range intermediate {
			// offsets and positions come back relative to the sub-slice
			token.Position += lastPos
			token.Start += currInput
			token.End += currInput
			rv = append(rv, token)
		}
		// NOTE(review): advancing by the token count assumes the remaining
		// tokenizer numbers positions 1..n without gaps — confirm.
		lastPos += len(intermediate)
		currInput = end
	}

	for _, match := range t.exception.FindAllIndex(input, -1) {
		start, end := match[0], match[1]
		if start == end {
			// empty match: nothing to preserve, leave the text to delegate
			continue
		}
		if start > currInput {
			// unprocessed text precedes this match
			delegate(start)
		}

		// the whole match is emitted as a single token
		rv = append(rv, &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: lastPos + 1,
		})
		lastPos++
		currInput = end
	}

	if currInput < len(input) {
		// text after the last match (or the whole input when nothing matched)
		delegate(len(input))
	}
	return rv
}
// ExceptionsTokenizerConstructor builds an ExceptionsTokenizer from config.
//
// Recognized config keys:
//
//	"exceptions": a []string or []interface{} of Go regular expressions.
//	    In the []interface{} form, elements that are not strings are
//	    skipped. The expressions are joined with "|" and compiled as one
//	    pattern.
//	"tokenizer": a string naming the tokenizer, looked up in cache, that
//	    processes the input not matched by the exceptions.
//
// It returns an error when no expression is found under "exceptions", when
// the combined pattern does not compile, when "tokenizer" is missing or not
// a string, or when cache cannot provide the named tokenizer.
func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	exceptions := []string{}
	// The value can have only one dynamic type, so a single type switch
	// covers both accepted shapes.
	switch patterns := config["exceptions"].(type) {
	case []string:
		exceptions = append(exceptions, patterns...)
	case []interface{}:
		for _, pattern := range patterns {
			// non-string entries are skipped rather than rejected
			if s, ok := pattern.(string); ok {
				exceptions = append(exceptions, s)
			}
		}
	}
	if len(exceptions) == 0 {
		return nil, fmt.Errorf("no pattern found in 'exceptions' property")
	}

	r, err := regexp.Compile(strings.Join(exceptions, "|"))
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
	}

	remainingName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer for remaining input")
	}
	remaining, err := cache.TokenizerNamed(remainingName)
	if err != nil {
		return nil, err
	}

	return NewExceptionsTokenizer(r, remaining), nil
}
// init registers ExceptionsTokenizerConstructor with the registry under Name.
func init() {
registry.RegisterTokenizer(Name, ExceptionsTokenizerConstructor)
}