Skip to content
This repository has been archived by the owner on Apr 2, 2024. It is now read-only.

Add Chinese support #171

Open
wants to merge 4 commits into
base: v3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions analysis/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ package analysis

import (
"fmt"
"github.com/go-ego/gse"
"regexp"
"strings"

"github.com/olivia-ai/olivia/locales"

"github.com/olivia-ai/olivia/util"
"github.com/tebeka/snowball"
)

// arrange checks the format of a string to normalize it and removes ignored characters
func (sentence *Sentence) arrange() {
// Remove punctuation after letters
punctuationRegex := regexp.MustCompile(`[a-zA-Z]( )?(\.|\?|!|¿|¡)`)
punctuationRegex := regexp.MustCompile("[a-zA-Z\u4e00-\u9fa5]( )?([.?!¿¡,。?!])")
sentence.Content = punctuationRegex.ReplaceAllStringFunc(sentence.Content, func(s string) string {
punctuation := regexp.MustCompile(`(\.|\?|!)`)
punctuation := regexp.MustCompile(`([.?!¿¡,。?!])`)
return punctuation.ReplaceAllString(s, "")
})

Expand Down Expand Up @@ -77,16 +77,30 @@ func (sentence Sentence) stem() (tokenizeWords []string) {

tokens := sentence.tokenize()

stemmer, err := snowball.New(locale)
if err != nil {
fmt.Println("Stemmer error", err)
return
}

// Get the string token and push it to tokenizeWord
for _, tokenizeWord := range tokens {
word := stemmer.Stem(tokenizeWord)
tokenizeWords = append(tokenizeWords, word)
// Do not change the way of word segmentation in other languages
if locale == "chinese" {
var seg gse.Segmenter
err := seg.LoadDict()
if err != nil {
fmt.Println("Stemmer error", err)
return
}
// Get the string token and push it to tokenizeWord
for _, tokenizeWord := range tokens {
word := seg.Cut(tokenizeWord, true)
tokenizeWords = append(tokenizeWords, word...)
}
} else {
stemmer, err := snowball.New(locale)
if err != nil {
fmt.Println("Stemmer error", err)
return
}
// Get the string token and push it to tokenizeWord
for _, tokenizeWord := range tokens {
word := stemmer.Stem(tokenizeWord)
tokenizeWords = append(tokenizeWords, word)
}
}

return
Expand Down
6 changes: 2 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@ go 1.12

require (
github.com/fatih/color v1.12.0 // indirect
github.com/goml/gobrain v0.0.0-20201212123421-2e2d98ca8249 // indirect
github.com/go-ego/gse v0.70.2
github.com/gookit/color v1.4.2
github.com/gorilla/mux v1.8.0
github.com/gorilla/websocket v1.4.2
github.com/mattn/go-colorable v0.1.8 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/schollz/progressbar/v3 v3.8.3 // indirect
github.com/schollz/progressbar/v3 v3.8.3
github.com/soudy/mathcat v0.0.0-20201027222343-588f3d377cb9
github.com/tebeka/snowball v0.4.2
github.com/zmb3/spotify v1.3.0
Expand Down
68 changes: 15 additions & 53 deletions go.sum

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions language/date/date.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ var PatternTranslation = map[string]PatternTranslations{
DateRegex: `(από )?(το )?((μεθ )?αύριο|((σήμερα|απόψε)|(επόμενη )?(δευτέρα|τρίτη|τετάρτη|πέμπτη|παρασκευή|σάββατο|κυριακή))|(\d{2}|\d)(η)? (of )?(ιανουάριος|φεβρουάριος|μάρτιος|απρίλιος|μάιος|ιούνιος|ιούλιος|αύγουστος|σεπτέμβριος|οκτώβριος|νοέμβριος|δεκέμβριος)|((\d{2}|\d)/(\d{2}|\d)))`,
TimeRegex: `(at )?(\d{2}|\d)(:\d{2}|\d)?( )?(μμ|πμ|μ\.μ|π\.μ)`,
},
"zh": {
DateRegex: `(从 )?(明天|((今天|今晚)|(下个 )?(星期一|星期二|星期三|星期四|星期五|星期六|星期日))|(\d{2}|\d)(号|日)? (的 )?(一月|二月|三月|四月|五月|六月|七月|八月|九月|十月|十一月|十二月)|((\d{2}|\d)/(\d{2}|\d)))`,
TimeRegex: `(在 )?(\d{2}|\d)(:\d{2}|\d)?( )?(下午|上午)`,
},
}

// PatternTranslations are the translations of the regexes for dates
Expand Down
15 changes: 15 additions & 0 deletions language/date/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,21 @@ var RuleTranslations = map[string]RuleTranslation{
RuleNextDayOfWeek: "επόμενη",
RuleNaturalDate: `ιανουάριος|φεβρουάριος|μάρτιος|απρίλιος|μάιος|ιούνιος|ιούλιος|αύγουστος|σεπτέμβριος|οκτώβριος|νοέμβριος|δεκέμβριος`,
},
"zh": {
DaysOfWeek: []string{
"星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日",
},
Months: []string{
"一月", "二月", "三月", "四月", "五月", "六月", "七月",
"八月", "九月", "十月", "十一月", "十二月",
},
RuleToday: `今天|今晚`,
RuleTomorrow: `明天`,
RuleAfterTomorrow: "明天",
RuleDayOfWeek: `(下个 )?(星期一|星期二|星期三|星期四|星期五|星期六|星期日)`,
RuleNextDayOfWeek: "下个",
RuleNaturalDate: `一月|二月|三月|四月|五月|六月|七月|八月|九月|十月|十一月|十二月`,
},
}

// A RuleTranslation holds all the texts/regexes used to match the dates
Expand Down
1 change: 1 addition & 0 deletions language/math.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ var MathDecimals = map[string]string{
"tr": `(\d+( |-)desimal(s)?)|(numara (dan )?desimal(s)? (mı )?\d+)`,
"nl": `(\d+( |-)decimal(en)?)|(nummer (van )?decimal(en)? (is )?\d+)`,
"el": `(\d+( |-)δεκαδικ(ό|ά)?)|(αριθμός (από )?δεκαδικ(ό|ά)? (είναι )?\d+)`,
"zh": `(\d+( |-)小数(s)?)|(数字 (是 )?小数(s)? (是 )?\d+)`,
}

// FindMathOperation finds a math operation in a string and returns it
Expand Down
4 changes: 4 additions & 0 deletions language/movies.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ var (
"Δράση", "Περιπέτεια", "Κινούμενα Σχέδια", "Παιδικά", "Κωμωδία", "Έγκλημα", "Ντοκιμαντέρ", "Δράμα", "Φαντασία",
"Film-Noir", "Τρόμου", "Μουσική", "Μυστηρίου", "Ρομαντική", "Επιστημονική Φαντασία", "Θρίλλερ", "Πολέμου", "Western",
},
"zh": {
"动作", "冒险", "动画", "儿童", "喜剧", "犯罪", "纪录片", "剧情", "奇幻",
"黑色", "恐怖", "音乐", "神秘", "浪漫", "科幻", "惊悚", "战争", "西部",
},
}
movies = SerializeMovies()
)
Expand Down
5 changes: 5 additions & 0 deletions language/music.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ var SpotifyKeyword = map[string]SpotifyKeywords{
From: "από",
On: "στο",
},
"zh": {
Play: "播放",
From: "来自",
On: "在",
},
}

// SpotifyKeywords are the keywords used to get the music name
Expand Down
4 changes: 4 additions & 0 deletions language/reason.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ var ReasonKeywords = map[string]ReasonKeyword{
That: "το οποίο",
To: "στο",
},
"zh": {
That: "那",
To: "到",
},
}

// ReasonKeyword holds the keywords used to find the reason in different languages
Expand Down
7 changes: 5 additions & 2 deletions locales/locales.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@ import (
_ "github.com/olivia-ai/olivia/res/locales/it"
_ "github.com/olivia-ai/olivia/res/locales/nl"
_ "github.com/olivia-ai/olivia/res/locales/tr"
_ "github.com/olivia-ai/olivia/res/locales/el"

)

// Locales is the list of the locales' tags and names.
// Please check whether the language is supported by https://github.com/tebeka/snowball;
// if it is, please add the correct language name.
// Note that GSE is used for Chinese, but not for other languages.
var Locales = []Locale{
{
Tag: "en",
Expand Down Expand Up @@ -55,6 +54,10 @@ var Locales = []Locale{
Tag: "el",
Name: "greek",
},
{
Tag: "zh",
Name: "chinese",
},
}

// A Locale is a registered locale in the file
Expand Down
Loading