Use configs to drive language resources

Closes #11
peoplepattern · Dec 8, 2015 · 84475b2 · 84475b2
1 parent 6f6e5b3
commit 84475b2
Show file tree

Hide file tree

Showing 15 changed files with 106 additions and 99 deletions.
diff --git a/src/main/resources/ar/stopwords.txt → src/main/resources/ar.conf b/src/main/resources/ar/stopwords.txt → src/main/resources/ar.conf
@@ -1,3 +1,4 @@
+lang.ar.stopwords = [
 ب
 ا
 أ
@@ -160,3 +161,4 @@
 يكون
 يمكن
 مليون
+]
diff --git a/src/main/resources/de/stopwords.txt → src/main/resources/de.conf b/src/main/resources/de/stopwords.txt → src/main/resources/de.conf
@@ -1,3 +1,4 @@
+lang.de.stopwords = [
 a
 ab
 aber
@@ -492,7 +493,7 @@ u
 uhr
 um
 und
-und?
+"und?"
 uns
 unser
 unsere
@@ -514,7 +515,7 @@ vom
 von
 vor
 w
-wahr?
+"wahr?"
 während
 währenddem
 währenddessen
@@ -601,3 +602,4 @@ zweiter
 zweites
 zwischen
 zwölf
+]
diff --git a/src/main/resources/en/stopwords.txt → src/main/resources/en.conf b/src/main/resources/en/stopwords.txt → src/main/resources/en.conf
@@ -1,3 +1,4 @@
+lang.en.stopwords = [
 a
 a's
 able
@@ -520,3 +521,4 @@ amp
 ly
 bit
 didn
+]
diff --git a/src/main/resources/es/stopwords.txt → src/main/resources/es.conf b/src/main/resources/es/stopwords.txt → src/main/resources/es.conf
@@ -1,3 +1,4 @@
+lang.es.stopwords = [
 él
 ésta
 éstas
@@ -349,3 +350,4 @@ vez
 y
 ya
 yo
+]
diff --git a/src/main/resources/fr/stopwords.txt → src/main/resources/fr.conf b/src/main/resources/fr/stopwords.txt → src/main/resources/fr.conf
@@ -1,3 +1,4 @@
+lang.fr.stopwords = [
 a
 à
 â
@@ -461,3 +462,4 @@ x
 y
 z
 zut
+]
diff --git a/src/main/resources/in/stopwords.txt → src/main/resources/in.conf b/src/main/resources/in/stopwords.txt → src/main/resources/in.conf
@@ -1,3 +1,4 @@
+lang.in.stopwords = [
 yang
 di
 dan
@@ -327,3 +328,4 @@ keterangan
 penggunaan
 masing-masing
 menghadapi
+]
diff --git a/src/main/resources/ja/stopwords.txt → src/main/resources/ja.conf b/src/main/resources/ja/stopwords.txt → src/main/resources/ja.conf
@@ -1,3 +1,4 @@
+lang.ja.stopwords = [
 これ
 それ
 あれ
@@ -41,4 +42,5 @@
 と
 し
 それで
-しかし
+しかし
+]
diff --git a/src/main/resources/ms/stopwords.txt → src/main/resources/ms.conf b/src/main/resources/ms/stopwords.txt → src/main/resources/ms.conf
@@ -1,3 +1,4 @@
+lang.ms.stopwords = [
 abdul
 abdullah
 acara
@@ -473,3 +474,4 @@ wang
 wanita
 wilayah
 yang
+]
diff --git a/src/main/resources/nl/stopwords.txt → src/main/resources/nl.conf b/src/main/resources/nl/stopwords.txt → src/main/resources/nl.conf
@@ -1,3 +1,4 @@
+lang.nl.stopwords = [
 aan
 af
 al
@@ -34,7 +35,7 @@ heeft
 hem
 het
 hier
-hij 
+hij
 hoe
 hun
 iemand
@@ -43,7 +44,7 @@ ik
 in
 is
 ja
-je 
+je
 kan
 kon
 kunnen
@@ -101,4 +102,5 @@ zij
 zijn
 zo
 zonder
-zou
+zou
+]
diff --git a/src/main/resources/pt/stopwords.txt → src/main/resources/pt.conf b/src/main/resources/pt/stopwords.txt → src/main/resources/pt.conf
@@ -1,3 +1,4 @@
+lang.pt.stopwords = [
 a
 à
 adeus
@@ -391,3 +392,4 @@ vosso
 vossos
 zero
 vc
+]
diff --git a/src/main/resources/reference.conf b/src/main/resources/reference.conf
@@ -1,3 +1,16 @@
+include "/ar.conf"
+include "/de.conf"
+include "/en.conf"
+include "/es.conf"
+include "/fr.conf"
+include "/in.conf"
+include "/ja.conf"
+include "/ms.conf"
+include "/nl.conf"
+include "/pt.conf"
+include "/sv.conf"
+include "/tr.conf"
+
 lang.classify {
   default_threshold = 0.85
   default_frequency = 0.1

diff --git a/src/main/resources/sv/stopwords.txt → src/main/resources/sv.conf b/src/main/resources/sv/stopwords.txt → src/main/resources/sv.conf
@@ -1,3 +1,4 @@
+lang.sv.stopwords = [
 aderton
 adertonde
 adjö
@@ -82,10 +83,10 @@ era
 ert
 ett
 ettusen
-få 
+få
 fanns
 får
-fått 
+fått
 fem
 femte
 femtio
@@ -384,3 +385,4 @@ vilka
 vilken
 vilket
 vill
+]
diff --git a/src/main/resources/tr/stopwords.txt → src/main/resources/tr.conf b/src/main/resources/tr/stopwords.txt → src/main/resources/tr.conf
@@ -1,3 +1,4 @@
+lang.tr.stopwords = [
 acaba
 altmış
 altı
@@ -112,3 +113,4 @@ yüz
 şunda
 şundan
 şunu
+]
diff --git a/src/main/scala/com/peoplepattern/text/LangBundle.scala b/src/main/scala/com/peoplepattern/text/LangBundle.scala
@@ -215,82 +215,40 @@ trait LangBundle {
 object LangBundle {
 
   import scala.io.Source
+  import com.typesafe.config.ConfigFactory
+  import scala.collection.JavaConversions._
 
-  private def srcFromResource(path: String) = {
-    Source.fromInputStream(getClass.getResourceAsStream(path))
+  val conf = ConfigFactory.load
+
+  /**
+   * Validate that a language code is exactly two lowercase ASCII letters
+   *
+   * @param code the candidate language code
+   * @throws IllegalArgumentException if `code` does not match `^[a-z]{2}$`
+   */
+  private def chkLangCode(code: String): Unit = {
+    require(code.matches("^[a-z]{2}$"), s"invalid language code: '$code'")
   }
 
-  private def stopwords(lang: String) = {
-    val src = srcFromResource(s"/$lang/stopwords.txt")
-    try {
-      src.getLines.toSet
-    } finally {
-      src.close()
-    }
+  /**
+   * Read the stopword list for a language from the `lang.<code>.stopwords`
+   * config key
+   *
+   * @param lang language code; checked with `chkLangCode` before the lookup
+   * @return the configured stopwords as a set
+   */
+  def stopwords(lang: String): Set[String] = {
+    chkLangCode(lang)
+    Set(conf.getStringList(s"lang.$lang.stopwords"): _*)
   }
 
-  private def mkBundle(lang: String) = {
-    val stops = stopwords(lang)
+  /**
+   * Build a [[LangBundle]] whose stopwords come from the config for `lang`
+   *
+   * The stopword lookup is deferred until `stopwords` is first read, so any
+   * failure from that lookup surfaces at first use rather than here.
+   *
+   * @param lang language code handed to `LangBundle.stopwords`
+   */
+  def mkBundle(lang: String): LangBundle = {
     new LangBundle {
-      val stopwords = stops
+      lazy val stopwords = LangBundle.stopwords(lang)
     }
   }
 
-  /** The [[LangBundle]] for German */
-  lazy val de = mkBundle("de")
-
-  /** The [[LangBundle]] for English */
-  lazy val en = mkBundle("en")
-
-  /** The [[LangBundle]] for Spanish */
-  lazy val es = mkBundle("es")
-
-  /** The [[LangBundle]] for French */
-  lazy val fr = mkBundle("fr")
-
-  /** The [[LangBundle]] for Indonesian */
-  lazy val in = mkBundle("in")
-
-  /**
-   * The [[LangBundle]] for Japanese
-   *
-   * TODO improved tokenizer
-   */
-  lazy val ja = mkBundle("ja")
-
-  /** The [[LangBundle]] for Malay */
-  lazy val ms = mkBundle("ms")
-
-  /** The [[LangBundle]] for Dutch */
-  lazy val nl = mkBundle("nl")
-
-  /** The [[LangBundle]] for Portuguese */
-  lazy val pt = mkBundle("pt")
-
-  /** The [[LangBundle]] for Swedish */
-  lazy val sv = mkBundle("sv")
-
-  /** The [[LangBundle]] for Turkish */
-  lazy val tr = mkBundle("tr")
-
-  /** The [[LangBundle]] for Armenian */
-  lazy val ar = mkBundle("ar")
+  // Matches keys of the form "<two lowercase letters>.stopwords" and captures
+  // the two-letter prefix as the language code.
+  private val LangRegx = """^([a-z][a-z])\.stopwords$""".r
 
   /** The set of supported languages */
-  def langs = Set(
-    "de",
-    "en",
-    "es",
-    "fr",
-    "in",
-    "ja",
-    "ms",
-    "nl",
-    "pt",
-    "sv",
-    "tr",
-    "ar"
-  )
+  lazy val langs: Set[String] = conf.getConfig("lang")
+    .entrySet
+    .map(_.getKey)
+    // keep only keys shaped like "<code>.stopwords", extracting the code
+    .collect { case LangRegx(lang) => lang }
+    .toSet
 
   /**
    * A language bundle for text for which we don't have an identified language
@@ -304,24 +262,34 @@ object LangBundle {
     }
   }
 
+  // To update with custom language processing for, e.g. Japanese tokenization,
+  // do something like:
+  //
+  // val jaBundle = new LangBundle { /* custom stuff */ }
+  //
+  // private val langBundles: Map[String, LangBundle] =
+  //   langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> jaBundle)
+
+  // Built eagerly when this object initializes, which forces the lazy `langs`;
+  // each bundle's stopwords stay unloaded until first read (see mkBundle).
+  private val langBundles: Map[String, LangBundle] =
+    langs.map { lang => lang -> mkBundle(lang) }.toMap
+
+  /**
+   * Look up the [[LangBundle]] by language code
+   *
+   * @param langCode two-letter ISO 639-1 language code
+   * @return the bundle for `langCode`, or `unk` if none is registered
+   * @throws IllegalArgumentException if `langCode` is not two lowercase letters
+   */
+  def apply(langCode: String): LangBundle = {
+    chkLangCode(langCode)
+    langBundles.getOrElse(langCode, unk)
+  }
+
   /**
    * Look up the [[LangBundle]] by language code
    *
-   * @param lang two-letter ISO 639-1 language code
+   * @param lang two-letter ISO 639-1 language code or None
+   * @return the matching bundle; `unk` for None or any unrecognized code
    */
   def bundleForLang(lang: Option[String]): LangBundle = lang match {
-    case Some("de") => de
-    case Some("en") => en
-    case Some("es") => es
-    case Some("fr") => fr
-    case Some("in") => in
-    case Some("ja") => ja
-    case Some("ms") => ms
-    case Some("nl") => nl
-    case Some("pt") => pt
-    case Some("sv") => sv
-    case Some("tr") => tr
-    case Some("ar") => ar
-    case _ => unk
+    // Look up directly instead of via apply: apply rejects codes that are not
+    // two lowercase letters, whereas this method has always fallen back to unk.
+    case Some(code) => langBundles.getOrElse(code, unk)
+    case None => unk
   }
 }