-
Notifications
You must be signed in to change notification settings - Fork 20
/
text.go
98 lines (83 loc) · 2.68 KB
/
text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package utils
import (
"regexp"
"strings"
"github.com/blevesearch/segment"
)
var (
	// snakedChars matches each run of characters which are not Unicode letters, ASCII digits or underscores
	snakedChars = regexp.MustCompile(`[^\p{L}\d_]+`)

	// wordTokenRegex matches either a run of marks/letters/numbers/underscores/apostrophes (one token),
	// or a single symbol character (a token on its own)
	wordTokenRegex = regexp.MustCompile(`[\pM\pL\pN_']+|\pS`)
)
// Snakify turns the passed in string into a context reference. Leading and trailing whitespace is
// trimmed, each run of characters matched by snakedChars (anything that isn't a letter, digit or
// underscore) is replaced by a single _ and the result is lowercased. Underscores already present
// in the text are kept as they are.
func Snakify(text string) string {
	trimmed := strings.TrimSpace(text)
	snaked := snakedChars.ReplaceAllString(trimmed, "_")
	return strings.ToLower(snaked)
}
// TokenizeString returns every non-overlapping match of wordTokenRegex in the passed in string, in order
// of appearance: runs of word characters become tokens, symbols (which includes emojis) become individual
// tokens, and everything else acts as a separator
func TokenizeString(str string) []string {
	const noLimit = -1 // a negative count asks for all matches
	tokens := wordTokenRegex.FindAllString(str, noLimit)
	return tokens
}
// TokenizeStringByChars splits the passed in string wherever one or more of the characters in chars
// occur, and returns the non-empty pieces in between
func TokenizeStringByChars(str string, chars string) []string {
	isSeparator := func(c rune) bool {
		return strings.ContainsRune(chars, c)
	}
	return strings.FieldsFunc(str, isSeparator)
}
// TokenizeStringByUnicodeSeg tokenizes the given string with a word segmenter which follows the Unicode Text
// Segmentation standard described at http://www.unicode.org/reports/tr29/. Segments which the segmenter
// reports with type segment.None are left out of the result.
func TokenizeStringByUnicodeSeg(str string) []string {
	words := []string{} // always non-nil, even when no segment is kept

	seg := segment.NewWordSegmenter(strings.NewReader(str))
	for seg.Segment() {
		if seg.Type() == segment.None {
			continue
		}
		words = append(words, string(seg.Bytes()))
	}
	return words
}
// PrefixOverlap returns the length, counted in characters (runes) rather than bytes, of the longest
// prefix which s1 and s2 have in common
func PrefixOverlap(s1, s2 string) int {
	other := []rune(s2)
	shared := 0
	for _, r := range s1 {
		// stop at the end of s2 or at the first character which differs
		if shared >= len(other) || other[shared] != r {
			break
		}
		shared++
	}
	return shared
}
// StringSlices returns the slices of s defined by consecutive pairs of indexes in indices, so that the
// nth returned string is s[indices[2n]:indices[2n+1]]. The indexes are byte offsets into s.
//
// If indices has an odd length, the trailing index has no partner and is ignored instead of causing an
// index out of range panic. Each complete pair must still be a valid slice range for s. The returned
// slice is never nil.
func StringSlices(s string, indices []int) []string {
	parts := make([]string, 0, len(indices)/2)
	// i+1 must be in range too, so that an unpaired trailing index is never dereferenced
	for i := 0; i+1 < len(indices); i += 2 {
		parts = append(parts, s[indices[i]:indices[i+1]])
	}
	return parts
}
// StringSliceContains reports whether any element of the given slice of strings equals the given string.
// When caseSensitive is false the comparison is done with Unicode case folding.
func StringSliceContains(slice []string, str string, caseSensitive bool) bool {
	// pick the comparison once rather than re-testing the flag for every element
	matches := strings.EqualFold
	if caseSensitive {
		matches = func(a, b string) bool { return a == b }
	}

	for i := range slice {
		if matches(slice[i], str) {
			return true
		}
	}
	return false
}
// Indent returns s with prefix inserted at the start of every line that has at least one character
// before its newline. Empty lines are left empty.
func Indent(s string, prefix string) string {
	var sb strings.Builder

	atLineStart := true
	for _, r := range s {
		isNewline := r == '\n'
		if atLineStart && !isNewline {
			sb.WriteString(prefix)
		}
		sb.WriteRune(r)
		// the character after a newline begins the next line
		atLineStart = isNewline
	}
	return sb.String()
}