-
Notifications
You must be signed in to change notification settings - Fork 20
/
text.go
46 lines (39 loc) · 1.28 KB
/
text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package utils
import (
"regexp"
"strings"
)
// Regular expressions used by the text helpers in this file; each is compiled once at package load.
var (
	// snakedChars matches every run of one or more characters that are not
	// Unicode letters, ASCII digits or underscores.
	snakedChars = regexp.MustCompile(`[^\p{L}\d_]+`)

	// wordTokenRegex matches either a run of letters, numbers, underscores and
	// apostrophes (a word token) or a single symbol character (its own token).
	wordTokenRegex = regexp.MustCompile(`[\pL\pN_']+|\pS`)
)
// Snakify turns the passed in string into a snake_case style reference name.
//
// Every run of characters other than letters, digits and underscores (as matched
// by snakedChars) is replaced with a single underscore, the result is lowercased,
// and any leading or trailing underscores are stripped. Underscores already present
// in the input are kept as they are, so consecutive underscores can still appear in
// the output.
func Snakify(text string) string {
	underscored := snakedChars.ReplaceAllString(text, "_")
	lowered := strings.ToLower(underscored)
	return strings.Trim(lowered, "_")
}
// TokenizeString breaks the passed in string into tokens using wordTokenRegex.
//
// Runs of letters, numbers, underscores and apostrophes are returned as word tokens,
// and each symbol character is returned as a token of its own. All other characters
// (e.g. whitespace and punctuation) act as separators and are dropped. The result is
// nil when the string contains no tokens.
func TokenizeString(str string) []string {
	tokens := wordTokenRegex.FindAllString(str, -1)
	return tokens
}
// TokenizeStringByChars returns the words in the passed in string, split by the chars in the given string.
//
// Every rune that occurs in chars is treated as a separator. Consecutive separators
// are collapsed and leading/trailing separators are ignored, so no empty tokens are
// returned. If chars is empty, a non-empty str is returned as a single token.
func TokenizeStringByChars(str string, chars string) []string {
	// strings.ContainsRune checks membership directly on the string, avoiding the
	// []rune copy and hand-written scan on every call.
	isSeparator := func(c rune) bool {
		return strings.ContainsRune(chars, c)
	}
	return strings.FieldsFunc(str, isSeparator)
}
// PrefixOverlap counts how many leading characters s1 and s2 share. The comparison
// is done rune by rune (not byte by byte), and the count is in runes.
func PrefixOverlap(s1, s2 string) int {
	left, right := []rune(s1), []rune(s2)

	// The common prefix can be no longer than the shorter of the two inputs.
	limit := len(left)
	if len(right) < limit {
		limit = len(right)
	}

	for i := 0; i < limit; i++ {
		if left[i] != right[i] {
			return i
		}
	}
	return limit
}