-
Notifications
You must be signed in to change notification settings - Fork 20
/
text.go
166 lines (141 loc) · 4.64 KB
/
text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
package utils
import (
"bytes"
"regexp"
"sort"
"strings"
"github.com/blevesearch/segment"
)
var (
	// snakedChars matches each run of characters that are not letters, digits or underscores
	snakedChars = regexp.MustCompile(`[^\p{L}\d_]+`)

	// wordTokenRegex matches either a run of marks, letters, numbers, underscores and apostrophes
	// (one word token) or a single symbol character (a token on its own)
	wordTokenRegex = regexp.MustCompile(`[\pM\pL\pN_']+|\pS`)
)
// Snakify turns the passed in string into a context reference. Surrounding whitespace is trimmed,
// each run of characters that are not letters, digits or underscores becomes a single _, and the
// result is lowercased. Underscores already present in the input are kept as they are.
func Snakify(text string) string {
	trimmed := strings.TrimSpace(text)
	underscored := snakedChars.ReplaceAllString(trimmed, "_")
	return strings.ToLower(underscored)
}
// TokenizeString returns every match of wordTokenRegex in the passed in string, in order of appearance.
// The result is nil if the string contains no tokens.
func TokenizeString(str string) []string {
	const noLimit = -1 // ask the regex for all matches rather than the first n

	tokens := wordTokenRegex.FindAllString(str, noLimit)
	return tokens
}
// TokenizeStringByChars splits the passed in string around each run of characters found in chars and
// returns the non-empty pieces in between
func TokenizeStringByChars(str string, chars string) []string {
	isSeparator := func(c rune) bool {
		return strings.ContainsRune(chars, c)
	}
	return strings.FieldsFunc(str, isSeparator)
}
// TokenizeStringByUnicodeSeg tokenizes the given string using the Unicode Text Segmentation standard
// described at http://www.unicode.org/reports/tr29/. Segments whose type is segment.None are left out
// of the result, and the result is never nil.
func TokenizeStringByUnicodeSeg(str string) []string {
	words := []string{}
	seg := segment.NewWordSegmenter(strings.NewReader(str))

	for seg.Segment() {
		if seg.Type() == segment.None {
			continue
		}
		words = append(words, string(seg.Bytes()))
	}
	return words
}
// PrefixOverlap returns the length, in characters (runes), of the longest prefix shared by s1 and s2
func PrefixOverlap(s1, s2 string) int {
	a, b := []rune(s1), []rune(s2)

	// the shared prefix can't be longer than the shorter of the two strings
	shortest := len(a)
	if len(b) < shortest {
		shortest = len(b)
	}

	for pos := 0; pos < shortest; pos++ {
		if a[pos] != b[pos] {
			return pos
		}
	}
	return shortest
}
// StringSlices returns the slices of s defined by pairs of indexes in indices. Each consecutive
// (start, end) pair of byte offsets yields s[start:end]. If indices has an odd length, the trailing
// unpaired index is ignored rather than causing an out of range panic. Offsets which aren't valid
// for s will still panic as with any slice expression.
func StringSlices(s string, indices []int) []string {
	numPairs := len(indices) / 2
	out := make([]string, 0, numPairs)

	for p := 0; p < numPairs; p++ {
		start, end := indices[2*p], indices[2*p+1]
		out = append(out, s[start:end])
	}
	return out
}
// StringSliceContains determines whether the given slice of strings contains the given string.
// If caseSensitive is false, strings are compared under Unicode case-folding (strings.EqualFold)
// which avoids allocating lowercased copies of both strings for every element.
func StringSliceContains(slice []string, str string, caseSensitive bool) bool {
	for _, s := range slice {
		// an exact match counts in both modes, so only fall back to folding when it's allowed
		if s == str || (!caseSensitive && strings.EqualFold(s, str)) {
			return true
		}
	}
	return false
}
// StringSet builds a set (a string > bool map) in which every string of the given slice maps to true
func StringSet(s []string) map[string]bool {
	set := make(map[string]bool, len(s))
	for i := 0; i < len(s); i++ {
		set[s[i]] = true
	}
	return set
}
// StringSetKeys returns all the keys of the given string set, whatever their values, sorted in lexical order
func StringSetKeys(m map[string]bool) []string {
	keys := make([]string, len(m))
	next := 0
	for key := range m {
		keys[next] = key
		next++
	}

	// map iteration order is random so sort for a stable result
	sort.Strings(keys)
	return keys
}
// Indent writes prefix in front of every line of the given string that has at least one character
// before its newline. Lines consisting only of a newline are left as they are.
func Indent(s string, prefix string) string {
	var sb strings.Builder
	atLineStart := true

	for _, r := range s {
		isNewline := r == '\n'
		if atLineStart && !isNewline {
			sb.WriteString(prefix)
		}
		sb.WriteRune(r)

		// whatever follows a newline is the start of the next line
		atLineStart = isNewline
	}
	return sb.String()
}
// Truncate returns the given string unchanged if it has no more than limit characters, otherwise
// it returns a cut down version with nothing appended where the input is cut
func Truncate(s string, limit int) string {
	const ending = "" // plain cut, no marker

	return truncate(s, limit, ending)
}
// TruncateEllipsis returns the given string unchanged if it has no more than limit characters,
// otherwise it returns a cut down version with ... appended where the input is cut
func TruncateEllipsis(s string, limit int) string {
	const ending = "..."

	return truncate(s, limit, ending)
}
// truncate returns s unchanged if it has no more than limit characters (runes). Otherwise it cuts s
// so that the result, including ending, is limit characters long. Lengths are all measured in runes
// so a multi-byte ending such as "…" only takes up one character of the limit.
//
// A negative limit is treated as zero, and if limit is too small to even fit ending, the input is
// simply cut to limit characters with no ending appended, instead of panicking on a negative slice bound.
func truncate(s string, limit int, ending string) string {
	if limit < 0 {
		limit = 0
	}

	runes := []rune(s)
	if len(runes) <= limit {
		return s
	}

	endingLen := len([]rune(ending))
	if endingLen > limit {
		// no room for the ending so fall back to a plain cut
		return string(runes[:limit])
	}
	return string(runes[:limit-endingLen]) + ending
}
// Redactor is a function which can redact the given string
type Redactor func(s string) string

// NewRedactor creates a new redaction function which replaces every occurrence of the given values
// with mask. Empty values are ignored: passed to strings.NewReplacer as an old string, an empty value
// would match between characters and scatter the mask through the entire input. If no non-empty
// values are given, the returned function returns its input unchanged.
func NewRedactor(mask string, values ...string) Redactor {
	// strings.NewReplacer takes a flat list of old, new pairs
	replacements := make([]string, 0, len(values)*2)
	for _, v := range values {
		if v == "" {
			continue
		}
		replacements = append(replacements, v, mask)
	}
	return strings.NewReplacer(replacements...).Replace
}
// nullEscapeRegex matches one or more backslashes followed by u0000
var nullEscapeRegex = regexp.MustCompile(`\\+u0{4}`)

// ReplaceEscapedNulls replaces any `\u0000` sequences with the given replacement sequence which may be empty.
// A sequence such as `\\u0000` is preserved as it is an escaped slash followed by the sequence `u0000`.
func ReplaceEscapedNulls(data []byte, repl []byte) []byte {
	backslash := []byte(`\`)

	return nullEscapeRegex.ReplaceAllFunc(data, func(match []byte) []byte {
		numSlashes := bytes.Count(match, backslash)

		// an even number of slashes means they all pair up as escaped slashes, so this isn't an escaped null
		if numSlashes%2 == 0 {
			return match
		}

		// otherwise keep the paired up slashes and swap the final slash and u0000 for the replacement
		kept := bytes.Repeat(backslash, numSlashes-1)
		return append(kept, repl...)
	})
}