-
Notifications
You must be signed in to change notification settings - Fork 20
/
text.go
136 lines (116 loc) · 3.63 KB
/
text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package utils
import (
"regexp"
"sort"
"strings"
"github.com/blevesearch/segment"
)
var (
	// snakedChars matches every run of characters that are not a Unicode letter, an ASCII digit
	// or an underscore.
	snakedChars = regexp.MustCompile(`[^\p{L}\d_]+`)

	// wordTokenRegex matches either a run of marks, letters, numbers, underscores and apostrophes
	// (one word token) or a single symbol character (its own token).
	wordTokenRegex = regexp.MustCompile(`[\pM\pL\pN_']+|\pS`)
)
// Snakify turns the passed in string into a context reference. Leading and trailing whitespace is
// trimmed, each run of characters matched by snakedChars (anything other than letters, ASCII digits
// and underscores) becomes a single underscore, and the result is lowercased. Underscores already
// present in the text are left as they are.
func Snakify(text string) string {
	trimmed := strings.TrimSpace(text)
	snaked := snakedChars.ReplaceAllString(trimmed, "_")
	return strings.ToLower(snaked)
}
// TokenizeString returns the tokens found in the passed in string by wordTokenRegex: each run of
// word characters is one token, each symbol character is a token on its own, and anything else
// acts as a separator. The result is nil when the string contains no tokens.
func TokenizeString(str string) []string {
	const noLimit = -1 // ask the regexp for every match rather than the first n
	tokens := wordTokenRegex.FindAllString(str, noLimit)
	return tokens
}
// TokenizeStringByChars returns the words in the passed in string, using every character (rune)
// of chars as a separator. Runs of adjacent separators are treated as one, so the result never
// contains empty tokens; an empty slice is returned when str is empty or made only of separators.
func TokenizeStringByChars(str string, chars string) []string {
	isSeparator := func(c rune) bool {
		// membership test directly on the string, no intermediate rune slice needed
		return strings.ContainsRune(chars, c)
	}
	return strings.FieldsFunc(str, isSeparator)
}
// TokenizeStringByUnicodeSeg tokenizes the given string using the Unicode Text Segmentation standard
// described at http://www.unicode.org/reports/tr29/. Segments which the segmenter reports with type
// segment.None are dropped (presumably whitespace and punctuation - confirm against the segment
// package). The returned slice is never nil.
func TokenizeStringByUnicodeSeg(str string) []string {
	words := []string{}
	seg := segment.NewWordSegmenter(strings.NewReader(str))

	for seg.Segment() {
		if seg.Type() == segment.None {
			continue
		}
		words = append(words, string(seg.Bytes()))
	}
	return words
}
// PrefixOverlap returns the length, counted in characters (runes) rather than bytes, of the longest
// prefix shared by s1 and s2.
func PrefixOverlap(s1, s2 string) int {
	a, b := []rune(s1), []rune(s2)

	// the overlap can never be longer than the shorter input
	shortest := len(a)
	if len(b) < shortest {
		shortest = len(b)
	}

	for n := 0; n < shortest; n++ {
		if a[n] != b[n] {
			return n
		}
	}
	return shortest
}
// StringSlices returns the slices of s defined by consecutive pairs of byte offsets in indices,
// i.e. s[indices[0]:indices[1]], s[indices[2]:indices[3]] and so on.
//
// A trailing index that has no partner is ignored instead of causing an index out of range panic.
// The offsets themselves are not validated: a pair that is out of range for s or inverted still
// panics exactly like the equivalent slice expression.
func StringSlices(s string, indices []int) []string {
	slices := make([]string, 0, len(indices)/2)
	// i+1 must exist, so stop as soon as a full (start, end) pair is no longer available
	for i := 0; i+1 < len(indices); i += 2 {
		slices = append(slices, s[indices[i]:indices[i+1]])
	}
	return slices
}
// StringSliceContains determines whether the given slice of strings contains the given string.
// When caseSensitive is false both sides are compared after lowercasing with strings.ToLower.
func StringSliceContains(slice []string, str string, caseSensitive bool) bool {
	if caseSensitive {
		for _, s := range slice {
			if s == str {
				return true
			}
		}
		return false
	}

	// Lowercase the needle once instead of on every iteration. Lowercasing (rather than
	// strings.EqualFold) is kept on purpose so that results stay the same for code points where
	// simple case folding and lowercasing disagree.
	lowered := strings.ToLower(str)
	for _, s := range slice {
		if strings.ToLower(s) == lowered {
			return true
		}
	}
	return false
}
// StringSet builds a set from the given strings, represented as a map in which every distinct
// input value is a key mapped to true. The returned map is never nil.
func StringSet(s []string) map[string]bool {
	set := make(map[string]bool, len(s))
	for i := range s {
		set[s[i]] = true
	}
	return set
}
// StringSetKeys returns the keys of the given string set in lexical order. Every key of the map is
// included regardless of the bool it maps to. The returned slice is never nil.
func StringSetKeys(m map[string]bool) []string {
	keys := make([]string, len(m))
	next := 0
	for key := range m {
		keys[next] = key
		next++
	}

	// map iteration order is random, so sort to give callers a stable result
	sort.Strings(keys)
	return keys
}
// Indent inserts prefix at the start of every line of s that contains at least one character
// before its newline. Lines are delimited by '\n' only, and completely empty lines are left
// without a prefix. The input is copied rune by rune.
func Indent(s string, prefix string) string {
	var sb strings.Builder
	atLineStart := true

	for _, r := range s {
		switch {
		case r == '\n':
			// a newline never gets a prefix and the next rune begins a new line
			sb.WriteRune(r)
			atLineStart = true
		default:
			if atLineStart {
				sb.WriteString(prefix)
				atLineStart = false
			}
			sb.WriteRune(r)
		}
	}
	return sb.String()
}
// Truncate shortens the given string so that it is at most limit characters (runes) long. Strings
// which already fit are returned unchanged and nothing is appended where the input is cut.
func Truncate(s string, limit int) string {
	const ending = ""
	return truncate(s, limit, ending)
}
// TruncateEllipsis shortens the given string to limit characters (runes), ending it with "..."
// where the input is cut. The ellipsis counts towards the limit, and strings which already fit are
// returned unchanged.
func TruncateEllipsis(s string, limit int) string {
	const ending = "..."
	return truncate(s, limit, ending)
}
// truncate returns s unchanged if it is at most limit characters (runes) long. Otherwise it returns
// the start of s followed by ending, sized so that the whole result is limit characters long.
//
// A negative limit is treated as zero. If limit is too small to hold ending at all, s is simply cut
// to limit characters without the ending, rather than slicing with a negative bound and panicking.
func truncate(s string, limit int, ending string) string {
	if limit < 0 {
		limit = 0
	}

	runes := []rune(s)
	if len(runes) <= limit {
		return s
	}

	// measure the ending in runes so that it is counted in the same unit as limit
	keep := limit - len([]rune(ending))
	if keep < 0 {
		return string(runes[:limit])
	}
	return string(runes[:keep]) + ending
}