From 6e8a24c3408fb334563806afe73e8740d8b4a441 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Sun, 2 May 2021 12:05:55 +0200 Subject: [PATCH] Fix errors in rule engine for Latvian (#92) --- .../github/pemistahl/lingua/api/LanguageDetector.kt | 10 +++++++--- .../com/github/pemistahl/lingua/internal/Constant.kt | 11 +++++------ .../pemistahl/lingua/api/LanguageDetectorTest.kt | 12 ++++++------ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt index a3126b06..400d2c0c 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt @@ -22,7 +22,6 @@ import com.github.pemistahl.lingua.api.Language.UNKNOWN import com.github.pemistahl.lingua.internal.Alphabet import com.github.pemistahl.lingua.internal.Constant.CHARS_TO_LANGUAGES_MAPPING import com.github.pemistahl.lingua.internal.Constant.JAPANESE_CHARACTER_SET -import com.github.pemistahl.lingua.internal.Constant.MULTIPLE_SPACE_CHARACTERS import com.github.pemistahl.lingua.internal.Constant.MULTIPLE_WHITESPACE import com.github.pemistahl.lingua.internal.Constant.NO_LETTER import com.github.pemistahl.lingua.internal.Constant.NUMBERS @@ -178,7 +177,7 @@ class LanguageDetector internal constructor( } val normalizedText = normalizedTextBuilder.toString() return if (normalizedText.contains(' ')) { - normalizedText.split(MULTIPLE_SPACE_CHARACTERS) + normalizedText.split(' ').filter { it.isNotBlank() } } else { listOf(normalizedText) } @@ -284,7 +283,12 @@ class LanguageDetector internal constructor( if (filteredLanguageCounts.size == 1) { return filteredLanguageCounts.toList().first().first } - + if (filteredLanguageCounts.size == 2 && + filteredLanguageCounts.containsKey(CHINESE) && + filteredLanguageCounts.containsKey(JAPANESE) + ) { + return JAPANESE + } val sortedTotalLanguageCounts = filteredLanguageCounts.toList().sortedByDescending { it.second } val (mostFrequentLanguage, firstCharCount) = sortedTotalLanguageCounts[0] val (_, secondCharCount) = sortedTotalLanguageCounts[1] diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt index 0b663e65..d0bcb5e7 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt @@ -75,12 +75,12 @@ internal object Constant { "ЈјЉљЊњ" to setOf(MACEDONIAN, SERBIAN), "ĀāĒēĪī" to setOf(LATVIAN, YORUBA), "ẸẹỌọ" to setOf(VIETNAMESE, YORUBA), + "ÐðÞþ" to setOf(ICELANDIC, TURKISH), + "Ûû" to setOf(FRENCH, HUNGARIAN), "Ūū" to setOf(LATVIAN, LITHUANIAN, YORUBA), "Şş" to setOf(AZERBAIJANI, ROMANIAN, TURKISH), "Ďď" to setOf(CZECH, ROMANIAN, SLOVAK), - "ÐðÞþ" to setOf(ICELANDIC, LATVIAN, TURKISH), - "Ûû" to setOf(FRENCH, HUNGARIAN, LATVIAN), "Ćć" to setOf(BOSNIAN, CROATIAN, POLISH), "Đđ" to setOf(BOSNIAN, CROATIAN, VIETNAMESE), "Іі" to setOf(BELARUSIAN, KAZAKH, UKRAINIAN), @@ -94,19 +94,19 @@ internal object Constant { "Øø" to setOf(BOKMAL, DANISH, NYNORSK), "ЁёЫыЭэ" to setOf(BELARUSIAN, KAZAKH, MONGOLIAN, RUSSIAN), "ЩщЪъ" to setOf(BULGARIAN, KAZAKH, MONGOLIAN, RUSSIAN), + "Òò" to setOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA), + "Ââ" to setOf(PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE), - "Òò" to setOf(CATALAN, ITALIAN, LATVIAN, VIETNAMESE, YORUBA), "Ýý" to setOf(CZECH, ICELANDIC, SLOVAK, TURKISH, VIETNAMESE), "Ää" to setOf(ESTONIAN, FINNISH, GERMAN, SLOVAK, SWEDISH), - "Ââ" to setOf(LATVIAN, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE), "Àà" to setOf(CATALAN, FRENCH, ITALIAN, PORTUGUESE, VIETNAMESE), "Ææ" to setOf(BOKMAL, DANISH, ICELANDIC, NYNORSK), "Åå" to setOf(BOKMAL, DANISH, NYNORSK, SWEDISH), "Üü" to setOf(AZERBAIJANI, CATALAN, ESTONIAN, GERMAN, HUNGARIAN, SPANISH, TURKISH), "Č芚Žž" to setOf(BOSNIAN, CZECH, CROATIAN, LATVIAN, LITHUANIAN, SLOVAK, SLOVENE), + "Çç" to setOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH), - "Çç" to setOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, LATVIAN, PORTUGUESE, TURKISH), "Öö" to setOf(AZERBAIJANI, ESTONIAN, FINNISH, GERMAN, HUNGARIAN, ICELANDIC, SWEDISH, TURKISH), "Óó" to setOf(CATALAN, HUNGARIAN, ICELANDIC, IRISH, POLISH, PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA), @@ -124,7 +124,6 @@ internal object Constant { } val LANGUAGES_SUPPORTING_LOGOGRAMS = setOf(CHINESE, JAPANESE, KOREAN) val MULTIPLE_WHITESPACE = Regex("\\s+") - val MULTIPLE_SPACE_CHARACTERS = Regex(" +") val NO_LETTER = Regex("^[^\\p{L}]+$") val NUMBERS = Regex("\\p{N}") val PUNCTUATION = Regex("\\p{P}") diff --git a/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt b/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt index 6595e952..5da63aaa 100644 --- a/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt +++ b/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt @@ -610,15 +610,15 @@ class LanguageDetectorTest { ), arguments( "minjaverðir", - listOf(ICELANDIC, LATVIAN, TURKISH) + listOf(ICELANDIC, TURKISH) ), arguments( "þagnarskyldu", - listOf(ICELANDIC, LATVIAN, TURKISH) + listOf(ICELANDIC, TURKISH) ), arguments( "nebûtu", - listOf(FRENCH, HUNGARIAN, LATVIAN) + listOf(FRENCH, HUNGARIAN) ), arguments( "hashemidëve", @@ -642,7 +642,7 @@ class LanguageDetectorTest { ), arguments( "viòiem", - listOf(CATALAN, ITALIAN, LATVIAN, VIETNAMESE, YORUBA) + listOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA) ), arguments( "contrôle", @@ -662,7 +662,7 @@ class LanguageDetectorTest { ), arguments( "labâk", - listOf(LATVIAN, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE) + listOf(PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE) ), arguments( "pràctiques", @@ -694,7 +694,7 @@ class LanguageDetectorTest { ), arguments( "façonnage", - listOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, LATVIAN, PORTUGUESE, TURKISH) + listOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH) ), arguments( "höher",