olivia-ai · Deng-Xian-Sheng · Oct 3, 2022 · Oct 3, 2022 · Oct 3, 2022 · Oct 5, 2022
diff --git a/analysis/format.go b/analysis/format.go
@@ -2,21 +2,21 @@ package analysis
 
 import (
 	"fmt"
+	"github.com/go-ego/gse"
 	"regexp"
 	"strings"
 
 	"github.com/olivia-ai/olivia/locales"
-
 	"github.com/olivia-ai/olivia/util"
 	"github.com/tebeka/snowball"
 )
 
 // arrange checks the format of a string to normalize it, remove ignored characters
 func (sentence *Sentence) arrange() {
 	// Remove punctuation after letters
-	punctuationRegex := regexp.MustCompile(`[a-zA-Z]( )?(\.|\?|!|¿|¡)`)
+	punctuationRegex := regexp.MustCompile("[a-zA-Z\u4e00-\u9fa5]( )?([.?!¿¡，。？！])")
 	sentence.Content = punctuationRegex.ReplaceAllStringFunc(sentence.Content, func(s string) string {
-		punctuation := regexp.MustCompile(`(\.|\?|!)`)
+		punctuation := regexp.MustCompile(`([.?!¿¡，。？！])`)
 		return punctuation.ReplaceAllString(s, "")
 	})
 
@@ -77,16 +77,30 @@ func (sentence Sentence) stem() (tokenizeWords []string) {
 
 	tokens := sentence.tokenize()
 
-	stemmer, err := snowball.New(locale)
-	if err != nil {
-		fmt.Println("Stemmer error", err)
-		return
-	}
-
-	// Get the string token and push it to tokenizeWord
-	for _, tokenizeWord := range tokens {
-		word := stemmer.Stem(tokenizeWord)
-		tokenizeWords = append(tokenizeWords, word)
+	// Do not change the way of word segmentation in other languages
+	if locale == "chinese" {
+		var seg gse.Segmenter
+		err := seg.LoadDict()
+		if err != nil {
+			fmt.Println("Stemmer error", err)
+			return
+		}
+		// Get the string token and push it to tokenizeWord
+		for _, tokenizeWord := range tokens {
+			word := seg.Cut(tokenizeWord, true)
+			tokenizeWords = append(tokenizeWords, word...)
+		}
+	} else {
+		stemmer, err := snowball.New(locale)
+		if err != nil {
+			fmt.Println("Stemmer error", err)
+			return
+		}
+		// Get the string token and push it to tokenizeWord
+		for _, tokenizeWord := range tokens {
+			word := stemmer.Stem(tokenizeWord)
+			tokenizeWords = append(tokenizeWords, word)
+		}
 	}
 
 	return

diff --git a/go.mod b/go.mod
@@ -4,14 +4,12 @@ go 1.12
 
 require (
 	github.com/fatih/color v1.12.0 // indirect
-	github.com/goml/gobrain v0.0.0-20201212123421-2e2d98ca8249 // indirect
+	github.com/go-ego/gse v0.70.2
 	github.com/gookit/color v1.4.2
 	github.com/gorilla/mux v1.8.0
 	github.com/gorilla/websocket v1.4.2
-	github.com/mattn/go-colorable v0.1.8 // indirect
-	github.com/mattn/go-runewidth v0.0.13 // indirect
 	github.com/patrickmn/go-cache v2.1.0+incompatible
-	github.com/schollz/progressbar/v3 v3.8.3 // indirect
+	github.com/schollz/progressbar/v3 v3.8.3
 	github.com/soudy/mathcat v0.0.0-20201027222343-588f3d377cb9
 	github.com/tebeka/snowball v0.4.2
 	github.com/zmb3/spotify v1.3.0

diff --git a/go.sum b/go.sum
diff --git a/language/date/date.go b/language/date/date.go
@@ -36,6 +36,10 @@ var PatternTranslation = map[string]PatternTranslations{
 		DateRegex: `(από )?(το )?((μεθ )?αύριο|((σήμερα|απόψε)|(επόμενη )?(δευτέρα|τρίτη|τετάρτη|πέμπτη|παρασκευή|σάββατο|κυριακή))|(\d{2}|\d)(η)? (of )?(ιανουάριος|φεβρουάριος|μάρτιος|απρίλιος|μάιος|ιούνιος|ιούλιος|αύγουστος|σεπτέμβριος|οκτώβριος|νοέμβριος|δεκέμβριος)|((\d{2}|\d)/(\d{2}|\d)))`,
 		TimeRegex: `(at )?(\d{2}|\d)(:\d{2}|\d)?( )?(μμ|πμ|μ\.μ|π\.μ)`,
 	},
+	"zh": {
+		DateRegex: `(从 )?(明天|后天|((今天|今晚)|(下个 )?(星期一|星期二|星期三|星期四|星期五|星期六|星期日))|(\d{2}|\d)(号|日)? (的 )?(一月|二月|三月|四月|五月|六月|七月|八月|九月|十月|十一月|十二月)|((\d{2}|\d)/(\d{2}|\d)))`, // 后天 = the day after tomorrow
+		TimeRegex: `(在 )?(\d{2}|\d)(:\d{2}|\d)?( )?(下午|上午)`,
+	},
 }
 
 // PatternTranslations are the translations of the regexs for dates

diff --git a/language/date/rules.go b/language/date/rules.go
@@ -119,6 +119,21 @@ var RuleTranslations = map[string]RuleTranslation{
 		RuleNextDayOfWeek: "επόμενη",
 		RuleNaturalDate:   `ιανουάριος|φεβρουάριος|μάρτιος|απρίλιος|μάιος|ιούνιος|ιούλιος|αύγουστος|σεπτέμβριος|οκτώβριος|νοέμβριος|δεκέμβριος`,
 	},
+	"zh": {
+		DaysOfWeek: []string{
+			"星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日",
+		},
+		Months: []string{
+			"一月", "二月", "三月", "四月", "五月", "六月", "七月",
+			"八月", "九月", "十月", "十一月", "十二月",
+		},
+		RuleToday:         `今天|今晚`,
+		RuleTomorrow:      `明天|后天`, // 明天 = tomorrow, 后天 = the day after tomorrow
+		RuleAfterTomorrow: "后天",    // must differ from 明天; presumably looked up inside the RuleTomorrow match — confirm against the rule implementation
+		RuleDayOfWeek:     `(下个 )?(星期一|星期二|星期三|星期四|星期五|星期六|星期日)`,
+		RuleNextDayOfWeek: "下个",
+		RuleNaturalDate:   `一月|二月|三月|四月|五月|六月|七月|八月|九月|十月|十一月|十二月`,
+	},
 }
 
 // A RuleTranslation is all the texts/regexs to match the dates

diff --git a/language/math.go b/language/math.go
@@ -17,6 +17,7 @@ var MathDecimals = map[string]string{
 	"tr": `(\d+( |-)desimal(s)?)|(numara (dan )?desimal(s)? (mı )?\d+)`,
 	"nl": `(\d+( |-)decimal(en)?)|(nummer (van )?decimal(en)? (is )?\d+)`,
 	"el": `(\d+( |-)δεκαδικ(ό|ά)?)|(αριθμός (από )?δεκαδικ(ό|ά)? (είναι )?\d+)`,
+	"zh": `(\d+( |-)小数(s)?)|(数字 (是 )?小数(s)? (是 )?\d+)`,
 }
 
 // FindMathOperation finds a math operation in a string an returns it

diff --git a/language/movies.go b/language/movies.go
@@ -57,6 +57,10 @@ var (
 			"Δράση", "Περιπέτεια", "Κινούμενα Σχέδια", "Παιδικά", "Κωμωδία", "Έγκλημα", "Ντοκιμαντέρ", "Δράμα", "Φαντασία",
 			"Film-Noir", "Τρόμου", "Μουσική", "Μυστηρίου", "Ρομαντική", "Επιστημονική Φαντασία", "Θρίλλερ", "Πολέμου", "Western",
 		},
+		"zh": {
+			"动作", "冒险", "动画", "儿童", "喜剧", "犯罪", "纪录片", "剧情", "奇幻",
+			"黑色", "恐怖", "音乐", "神秘", "浪漫", "科幻", "惊悚", "战争", "西部",
+		},
 	}
 	movies = SerializeMovies()
 )

diff --git a/language/music.go b/language/music.go
@@ -51,6 +51,11 @@ var SpotifyKeyword = map[string]SpotifyKeywords{
 		From: "από",
 		On:   "στο",
 	},
+	"zh": {
+		Play: "播放",
+		From: "来自",
+		On:   "在",
+	},
 }
 
 // SpotifyKeywords are the keywords used to get music name

diff --git a/language/reason.go b/language/reason.go
@@ -42,6 +42,10 @@ var ReasonKeywords = map[string]ReasonKeyword{
 		That: "το οποίο",
 		To:   "στο",
 	},
+	"zh": {
+		That: "那",
+		To:   "到",
+	},
 }
 
 // ReasonKeyword are used to find reason for different languages

diff --git a/locales/locales.go b/locales/locales.go
@@ -11,13 +11,12 @@ import (
 	_ "github.com/olivia-ai/olivia/res/locales/it"
 	_ "github.com/olivia-ai/olivia/res/locales/nl"
 	_ "github.com/olivia-ai/olivia/res/locales/tr"
-	_ "github.com/olivia-ai/olivia/res/locales/el"
-
 )
 
 // Locales is the list of locales's tags and names
 // Please check if the language is supported in https://github.com/tebeka/snowball,
 // if it is please add the correct language name.
+// Note: Chinese ("chinese") is the exception: it is segmented with gse (github.com/go-ego/gse) instead of being stemmed with snowball.
 var Locales = []Locale{
 	{
 		Tag:  "en",
@@ -55,6 +54,10 @@ var Locales = []Locale{
 		Tag:  "el",
 		Name: "greek",
 	},
+	{
+		Tag:  "zh",
+		Name: "chinese",
+	},
 }
 
 // A Locale is a registered locale in the file