Skip to content

Commit

Permalink
Use configs to drive language resources
Browse files Browse the repository at this point in the history
Closes #11
  • Loading branch information
eponvert committed Dec 8, 2015
1 parent 6f6e5b3 commit 84475b2
Show file tree
Hide file tree
Showing 15 changed files with 106 additions and 99 deletions.
@@ -1,3 +1,4 @@
lang.ar.stopwords = [
ب
ا
أ
Expand Down Expand Up @@ -160,3 +161,4 @@
يكون
يمكن
مليون
]
@@ -1,3 +1,4 @@
lang.de.stopwords = [
a
ab
aber
Expand Down Expand Up @@ -492,7 +493,7 @@ u
uhr
um
und
und?
"und?"
uns
unser
unsere
Expand All @@ -514,7 +515,7 @@ vom
von
vor
w
wahr?
"wahr?"
während
währenddem
währenddessen
Expand Down Expand Up @@ -601,3 +602,4 @@ zweiter
zweites
zwischen
zwölf
]
@@ -1,3 +1,4 @@
lang.en.stopwords = [
a
a's
able
Expand Down Expand Up @@ -520,3 +521,4 @@ amp
ly
bit
didn
]
@@ -1,3 +1,4 @@
lang.es.stopwords = [
él
ésta
éstas
Expand Down Expand Up @@ -349,3 +350,4 @@ vez
y
ya
yo
]
@@ -1,3 +1,4 @@
lang.fr.stopwords = [
a
à
â
Expand Down Expand Up @@ -461,3 +462,4 @@ x
y
z
zut
]
@@ -1,3 +1,4 @@
lang.in.stopwords = [
yang
di
dan
Expand Down Expand Up @@ -327,3 +328,4 @@ keterangan
penggunaan
masing-masing
menghadapi
]
@@ -1,3 +1,4 @@
lang.ja.stopwords = [
これ
それ
あれ
Expand Down Expand Up @@ -41,4 +42,5 @@
それで
しかし
しかし
]
@@ -1,3 +1,4 @@
lang.ms.stopwords = [
abdul
abdullah
acara
Expand Down Expand Up @@ -473,3 +474,4 @@ wang
wanita
wilayah
yang
]
@@ -1,3 +1,4 @@
lang.nl.stopwords = [
aan
af
al
Expand Down Expand Up @@ -34,7 +35,7 @@ heeft
hem
het
hier
hij
hij
hoe
hun
iemand
Expand All @@ -43,7 +44,7 @@ ik
in
is
ja
je
je
kan
kon
kunnen
Expand Down Expand Up @@ -101,4 +102,5 @@ zij
zijn
zo
zonder
zou
zou
]
@@ -1,3 +1,4 @@
lang.pt.stopwords = [
a
à
adeus
Expand Down Expand Up @@ -391,3 +392,4 @@ vosso
vossos
zero
vc
]
15 changes: 15 additions & 0 deletions src/main/resources/reference.conf
@@ -1,3 +1,18 @@
# One include per language resource; each included file defines lang.<code>.stopwords.
# NOTE(review): nl/pt/sv/tr file names are assumed to follow the same "/<code>.conf"
# pattern as the other languages -- confirm against src/main/resources.
include "/ar.conf"
include "/de.conf"
include "/en.conf"
include "/es.conf"
include "/fr.conf"
include "/in.conf"
include "/ja.conf"
include "/ms.conf"
include "/nl.conf"
include "/pt.conf"
include "/sv.conf"
include "/tr.conf"

lang.classify {
default_threshold = 0.85
default_frequency = 0.1
Expand Down
@@ -1,3 +1,4 @@
lang.sv.stopwords = [
aderton
adertonde
adjö
Expand Down Expand Up @@ -82,10 +83,10 @@ era
ert
ett
ettusen
fanns
får
fått
fått
fem
femte
femtio
Expand Down Expand Up @@ -384,3 +385,4 @@ vilka
vilken
vilket
vill
]
@@ -1,3 +1,4 @@
lang.tr.stopwords = [
acaba
altmış
altı
Expand Down Expand Up @@ -112,3 +113,4 @@ yüz
şunda
şundan
şunu
]
126 changes: 47 additions & 79 deletions src/main/scala/com/peoplepattern/text/LangBundle.scala
Expand Up @@ -215,82 +215,40 @@ trait LangBundle {
object LangBundle {

import scala.io.Source
import com.typesafe.config.ConfigFactory
import scala.collection.JavaConversions._

private def srcFromResource(path: String) = {
Source.fromInputStream(getClass.getResourceAsStream(path))
/** Configuration obtained from `ConfigFactory.load`; the lookups in this object read keys under `lang`. */
val conf = ConfigFactory.load

/** Pre-compiled pattern accepting exactly two lowercase ASCII letters. */
private val LangCodePattern = "^[a-z]{2}$".r.pattern

/**
 * Validate the shape of a language code.
 *
 * @param code candidate language code
 * @throws IllegalArgumentException if `code` is not exactly two lowercase ASCII letters
 */
private def chkLangCode(code: String): Unit = {
  // The pattern is compiled once above rather than on every call.
  require(LangCodePattern.matcher(code).matches, s"invalid language code: '$code'")
}

private def stopwords(lang: String) = {
val src = srcFromResource(s"/$lang/stopwords.txt")
try {
src.getLines.toSet
} finally {
src.close()
}
/**
 * Load the stopword set configured for a language.
 *
 * Reads the string list stored at `lang.<lang>.stopwords` in `conf`.
 *
 * @param lang language code; checked with `chkLangCode` before the lookup
 * @return the configured stopwords as an immutable set
 */
def stopwords(lang: String): Set[String] = {
  // Local import: convert the Java list explicitly instead of relying on the
  // implicit JavaConversions view.
  import scala.collection.JavaConverters._
  chkLangCode(lang)
  conf.getStringList(s"lang.$lang.stopwords").asScala.toSet
}

private def mkBundle(lang: String) = {
val stops = stopwords(lang)
def mkBundle(lang: String) = {
new LangBundle {
val stopwords = stops
lazy val stopwords = LangBundle.stopwords(lang)
}
}

/** The [[LangBundle]] for German */
lazy val de = mkBundle("de")

/** The [[LangBundle]] for English */
lazy val en = mkBundle("en")

/** The [[LangBundle]] for Spanish */
lazy val es = mkBundle("es")

/** The [[LangBundle]] for French */
lazy val fr = mkBundle("fr")

/** The [[LangBundle]] for Indonesian */
lazy val in = mkBundle("in")

/**
* The [[LangBundle]] for Japanese
*
* TODO improved tokenizer
*/
lazy val ja = mkBundle("ja")

/** The [[LangBundle]] for Malay */
lazy val ms = mkBundle("ms")

/** The [[LangBundle]] for Dutch */
lazy val nl = mkBundle("nl")

/** The [[LangBundle]] for Portuguese */
lazy val pt = mkBundle("pt")

/** The [[LangBundle]] for Swedish */
lazy val sv = mkBundle("sv")

/** The [[LangBundle]] for Turkish */
lazy val tr = mkBundle("tr")

/** The [[LangBundle]] for Arabic */
lazy val ar = mkBundle("ar")
/** Matches keys of the form `xx.stopwords` (two lowercase ASCII letters) and captures the two-letter prefix. */
private val LangRegx = """^([a-z][a-z])\.stopwords$""".r

/** The set of supported languages */
def langs = Set(
"de",
"en",
"es",
"fr",
"in",
"ja",
"ms",
"nl",
"pt",
"sv",
"tr",
"ar"
)
// Derived from the configuration: every key under `lang` that matches
// `LangRegx` (i.e. `<two lowercase letters>.stopwords`) contributes its
// two-letter prefix; all other keys are ignored.
lazy val langs: Set[String] = {
  // Local import: convert the Java set explicitly instead of relying on the
  // implicit JavaConversions view.
  import scala.collection.JavaConverters._
  conf.getConfig("lang")
    .entrySet
    .asScala
    .iterator
    .map(_.getKey)
    .collect { case LangRegx(lang) => lang }
    .toSet
}

/**
* A language bundle for text for which we don't have an identified language
Expand All @@ -304,24 +262,34 @@ object LangBundle {
}
}

// To update with custom language processing for, e.g. Japanese tokenization,
// do something like:
//
// val jaBundle = new LangBundle { /* custom stuff */ }
//
// private val langBundles: Map[String, LangBundle] =
// langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> jaBundle)

// One bundle per entry of `langs`, keyed by its language code.
private val langBundles: Map[String, LangBundle] =
  (for (code <- langs) yield code -> mkBundle(code)).toMap

/**
 * Look up the [[LangBundle]] by language code
 *
 * @param langCode two-letter ISO 639-1 language code
 * @return the bundle registered for `langCode`, or `unk` when none is registered
 * @throws IllegalArgumentException if `langCode` fails the `chkLangCode` check
 */
def apply(langCode: String): LangBundle = {
  // Validate first so a malformed code fails loudly instead of silently mapping to `unk`.
  chkLangCode(langCode)
  langBundles.getOrElse(langCode, unk)
}

/**
* Look up the [[LangBundle]] by language code
*
* @param lang two-letter ISO 639-1 language code
* @param lang two-letter ISO 639-1 language code or None
*/
def bundleForLang(lang: Option[String]): LangBundle = lang match {
case Some("de") => de
case Some("en") => en
case Some("es") => es
case Some("fr") => fr
case Some("in") => in
case Some("ja") => ja
case Some("ms") => ms
case Some("nl") => nl
case Some("pt") => pt
case Some("sv") => sv
case Some("tr") => tr
case Some("ar") => ar
case _ => unk
case Some(lang) => apply(lang)
case None => unk
}
}

0 comments on commit 84475b2

Please sign in to comment.