/
strutil.go
161 lines (141 loc) · 4.58 KB
/
strutil.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
package strutil
import (
"encoding/json"
"strings"
"unicode"
"github.com/markusmobius/go-dateparser/internal/regexp"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"golang.org/x/text/unicode/rangetable"
)
var (
nfkcTransformer = transform.Chain(norm.NFKD, norm.NFKC)
unicodeTransformer = transform.Chain(norm.NFKD, runes.Remove(runes.In(unicode.Mn)), norm.NFKC)
aposthropeTransformer = runes.Map(func(r rune) rune {
switch r {
case '\u2019', // right single quotation mark
'\u02bc', // modifier letter apostrophe
'\u02bb', // modifier letter turned comma
'\u055a', // armenian apostrophe
'\ua78c', // latin small letter saltillo
'\u2032', // prime
'\u2035', // reversed prime
'\u02b9', // modifier letter prime
'\uff07': // fullwidth apostrophe
return '\''
default:
return r
}
})
braceTransformer = runes.Remove(runes.In(rangetable.New(
'{', '}',
'(', ')',
'<', '>',
'[', ']',
)))
rxSanitizeSkip = regexp.MustCompile(`\t|\n|\r|\x{00bb}|,\s\x{0432}\b|\x{200e}|\x{b7}|\x{200f}|\x{064e}|\x{064f}`)
rxSanitizeRussian = regexp.MustCompile(`(?i)([\PL\pN])\x{0433}\.`)
rxSanitizeCroatian = regexp.MustCompile(`(?i)(\pN+)\.\s?(\pN+)\.\s?(\pN+)\.( u)?`)
rxSanitizePeriod = regexp.MustCompile(`(?i)([^\pN\.\s])\.`)
rxSanitizeOn = regexp.MustCompile(`(?i)^.*?on:\s+(.*)`)
)
// SanitizeSpaces replaces non-breaking spaces into a normal one,
// then remove any excess whitespaces.
func SanitizeSpaces(s string) string {
s = strings.ReplaceAll(s, "\u00A0", "")
s = strings.Join(strings.Fields(s), " ")
return s
}
// NormalizeUnicode removes Nonspacing Mark (Mn) characters then
// use NFKC as the normal forms.
func NormalizeUnicode(str string) string {
normalized, _, err := transform.String(unicodeTransformer, str)
if err != nil {
return str
}
return normalized
}
// NormalizeAposthrope replace apostrope lookalike chars into the ordinary aposthrope.
func NormalizeAposthrope(s string) string {
normalized, _, _ := transform.String(aposthropeTransformer, s)
return normalized
}
// NormalizeCharset is used to normalize charset in a string. Used before
// detecting language of a string.
func NormalizeCharset(str string) string {
normalized, _, err := transform.String(nfkcTransformer, str)
if err == nil {
str = normalized
}
str = NormalizeAposthrope(str)
str = strings.ReplaceAll(str, ".", "")
str = strings.ToLower(str)
str = SanitizeSpaces(str)
return str
}
// NormalizeString is used to normalize the string before translated
// into another language. This function will normalizes the unicode,
// convert the string to lowercase then remove any excess whitespaces.
func NormalizeString(str string) string {
str = NormalizeUnicode(str)
str = strings.ToLower(str)
str = SanitizeSpaces(str)
return str
}
// SanitizeDate is used to sanitize the specified date string before
// it parsed by the parser.
func SanitizeDate(s string) string {
s = rxSanitizeSkip.ReplaceAllString(s, " ")
s = rxSanitizeRussian.ReplaceAllString(s, "$1 ") // remove 'г.' (Russian for year) but not in words
s = rxSanitizeCroatian.ReplaceAllString(s, "$1.$2.$3 ") // extra '.' and 'u' interferes with parsing relative fractional dates
s = SanitizeSpaces(s)
s = rxSanitizePeriod.ReplaceAllString(s, "$1")
s = rxSanitizeOn.ReplaceAllString(s, "$1")
s = strings.TrimSuffix(s, ":")
s = NormalizeAposthrope(s)
s = strings.TrimSpace(s)
return s
}
// StripBraces, as it name implies, will remove any braces from the string.
func StripBraces(s string) string {
stripped, _, _ := transform.String(braceTransformer, s)
return stripped
}
// RemoveRegion removes region code from a locale, returning the language of locale.
func RemoveRegion(locale string) string {
lastIdx := strings.LastIndex(locale, "-")
if lastIdx >= 0 {
suffix := locale[lastIdx+1:]
if suffix != "" && suffix == strings.ToUpper(suffix) {
return locale[:lastIdx]
}
}
return locale
}
// TrimChars will trim all character in beginning and end of the string
// that matched with the specified characters.
func TrimChars(s string, chars string) string {
charMap := map[rune]struct{}{}
for _, c := range chars {
charMap[c] = struct{}{}
}
return strings.TrimFunc(s, func(r rune) bool {
_, exist := charMap[r]
return exist
})
}
// IsNumberOnly check if string only consist of number.
func IsNumberOnly(s string) bool {
for _, r := range s {
if !unicode.IsDigit(r) {
return false
}
}
return true
}
// Jsonify is used to get a JSON rpresentation of specified data.
func Jsonify(data interface{}) string {
bt, _ := json.Marshal(data)
return string(bt)
}