Skip to content

Commit

Permalink
Fix errors in rule engine for Latvian (#92)
Browse files: browse the repository at this point in the history
  • Loading branch information
pemistahl committed May 2, 2021
1 parent 91c22a3 commit 6e8a24c
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import com.github.pemistahl.lingua.api.Language.UNKNOWN
import com.github.pemistahl.lingua.internal.Alphabet
import com.github.pemistahl.lingua.internal.Constant.CHARS_TO_LANGUAGES_MAPPING
import com.github.pemistahl.lingua.internal.Constant.JAPANESE_CHARACTER_SET
- import com.github.pemistahl.lingua.internal.Constant.MULTIPLE_SPACE_CHARACTERS
import com.github.pemistahl.lingua.internal.Constant.MULTIPLE_WHITESPACE
import com.github.pemistahl.lingua.internal.Constant.NO_LETTER
import com.github.pemistahl.lingua.internal.Constant.NUMBERS
Expand Down Expand Up @@ -178,7 +177,7 @@ class LanguageDetector internal constructor(
}
val normalizedText = normalizedTextBuilder.toString()
return if (normalizedText.contains(' ')) {
- normalizedText.split(MULTIPLE_SPACE_CHARACTERS)
+ normalizedText.split(' ').filter { it.isNotBlank() }
} else {
listOf(normalizedText)
}
Expand Down Expand Up @@ -284,7 +283,12 @@ class LanguageDetector internal constructor(
if (filteredLanguageCounts.size == 1) {
return filteredLanguageCounts.toList().first().first
}

+ if (filteredLanguageCounts.size == 2 &&
+ filteredLanguageCounts.containsKey(CHINESE) &&
+ filteredLanguageCounts.containsKey(JAPANESE)
+ ) {
+ return JAPANESE
+ }
val sortedTotalLanguageCounts = filteredLanguageCounts.toList().sortedByDescending { it.second }
val (mostFrequentLanguage, firstCharCount) = sortedTotalLanguageCounts[0]
val (_, secondCharCount) = sortedTotalLanguageCounts[1]
Expand Down
11 changes: 5 additions & 6 deletions src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ internal object Constant {
"ЈјЉљЊњ" to setOf(MACEDONIAN, SERBIAN),
"ĀāĒēĪī" to setOf(LATVIAN, YORUBA),
"ẸẹỌọ" to setOf(VIETNAMESE, YORUBA),
+ "ÐðÞþ" to setOf(ICELANDIC, TURKISH),
+ "Ûû" to setOf(FRENCH, HUNGARIAN),

"Ūū" to setOf(LATVIAN, LITHUANIAN, YORUBA),
"Şş" to setOf(AZERBAIJANI, ROMANIAN, TURKISH),
"Ďď" to setOf(CZECH, ROMANIAN, SLOVAK),
- "ÐðÞþ" to setOf(ICELANDIC, LATVIAN, TURKISH),
- "Ûû" to setOf(FRENCH, HUNGARIAN, LATVIAN),
"Ćć" to setOf(BOSNIAN, CROATIAN, POLISH),
"Đđ" to setOf(BOSNIAN, CROATIAN, VIETNAMESE),
"Іі" to setOf(BELARUSIAN, KAZAKH, UKRAINIAN),
Expand All @@ -94,19 +94,19 @@ internal object Constant {
"Øø" to setOf(BOKMAL, DANISH, NYNORSK),
"ЁёЫыЭэ" to setOf(BELARUSIAN, KAZAKH, MONGOLIAN, RUSSIAN),
"ЩщЪъ" to setOf(BULGARIAN, KAZAKH, MONGOLIAN, RUSSIAN),
+ "Òò" to setOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA),
+ "Ââ" to setOf(PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE),

- "Òò" to setOf(CATALAN, ITALIAN, LATVIAN, VIETNAMESE, YORUBA),
"Ýý" to setOf(CZECH, ICELANDIC, SLOVAK, TURKISH, VIETNAMESE),
"Ää" to setOf(ESTONIAN, FINNISH, GERMAN, SLOVAK, SWEDISH),
- "Ââ" to setOf(LATVIAN, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE),
"Àà" to setOf(CATALAN, FRENCH, ITALIAN, PORTUGUESE, VIETNAMESE),
"Ææ" to setOf(BOKMAL, DANISH, ICELANDIC, NYNORSK),
"Åå" to setOf(BOKMAL, DANISH, NYNORSK, SWEDISH),

"Üü" to setOf(AZERBAIJANI, CATALAN, ESTONIAN, GERMAN, HUNGARIAN, SPANISH, TURKISH),
"Č芚Žž" to setOf(BOSNIAN, CZECH, CROATIAN, LATVIAN, LITHUANIAN, SLOVAK, SLOVENE),
+ "Çç" to setOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH),

- "Çç" to setOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, LATVIAN, PORTUGUESE, TURKISH),
"Öö" to setOf(AZERBAIJANI, ESTONIAN, FINNISH, GERMAN, HUNGARIAN, ICELANDIC, SWEDISH, TURKISH),

"Óó" to setOf(CATALAN, HUNGARIAN, ICELANDIC, IRISH, POLISH, PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA),
Expand All @@ -124,7 +124,6 @@ internal object Constant {
}
val LANGUAGES_SUPPORTING_LOGOGRAMS = setOf(CHINESE, JAPANESE, KOREAN)
val MULTIPLE_WHITESPACE = Regex("\\s+")
- val MULTIPLE_SPACE_CHARACTERS = Regex(" +")
val NO_LETTER = Regex("^[^\\p{L}]+$")
val NUMBERS = Regex("\\p{N}")
val PUNCTUATION = Regex("\\p{P}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -610,15 +610,15 @@ class LanguageDetectorTest {
),
arguments(
"minjaverðir",
- listOf(ICELANDIC, LATVIAN, TURKISH)
+ listOf(ICELANDIC, TURKISH)
),
arguments(
"þagnarskyldu",
- listOf(ICELANDIC, LATVIAN, TURKISH)
+ listOf(ICELANDIC, TURKISH)
),
arguments(
"nebûtu",
- listOf(FRENCH, HUNGARIAN, LATVIAN)
+ listOf(FRENCH, HUNGARIAN)
),
arguments(
"hashemidëve",
Expand All @@ -642,7 +642,7 @@ class LanguageDetectorTest {
),
arguments(
"viòiem",
- listOf(CATALAN, ITALIAN, LATVIAN, VIETNAMESE, YORUBA)
+ listOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA)
),
arguments(
"contrôle",
Expand All @@ -662,7 +662,7 @@ class LanguageDetectorTest {
),
arguments(
"labâk",
- listOf(LATVIAN, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE)
+ listOf(PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE)
),
arguments(
"pràctiques",
Expand Down Expand Up @@ -694,7 +694,7 @@ class LanguageDetectorTest {
),
arguments(
"façonnage",
- listOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, LATVIAN, PORTUGUESE, TURKISH)
+ listOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH)
),
arguments(
"höher",
Expand Down

0 comments on commit 6e8a24c

Please sign in to comment.