Skip to content

Commit

Permalink
Accept PR #16
Browse files Browse the repository at this point in the history
  • Loading branch information
eponvert committed Dec 21, 2015
2 parents bb70fad + 9c1b390 commit 42c2ed0
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Added
- Russian support
- Japanese support

### Changed

- Fixed configuration error, restored support for Dutch, Portuguese, Swedish,
Expand Down
9 changes: 6 additions & 3 deletions build.sbt
Expand Up @@ -18,11 +18,14 @@ scalacOptions in ThisBuild ++= Seq(
)

libraryDependencies ++= Seq(
"com.carrotsearch" % "langid-java" % "1.0.0",
"com.typesafe" % "config" % "1.3.0",
"org.scalatest" %% "scalatest" % "2.2.5" % "test"
"com.carrotsearch" % "langid-java" % "1.0.0",
"com.typesafe" % "config" % "1.3.0",
"com.atilika.kuromoji" % "kuromoji-ipadic" % "0.9.0",
"org.scalatest" %% "scalatest" % "2.2.5" % "test"
)

initialCommands := "import com.peoplepattern.text._"

scalariformSettings

licenses := Seq("Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0"))
Expand Down
81 changes: 81 additions & 0 deletions src/main/scala/com/peoplepattern/text/JaLangBundle.scala
@@ -0,0 +1,81 @@
package com.peoplepattern.text

import scala.collection.mutable.Buffer
import scala.collection.JavaConverters._
import com.atilika.kuromoji.ipadic.Tokenizer

/**
* Custom language bundle for Japanese
*
* Uses Kuromoji for tokenization https://github.com/atilika/kuromoji
*/
object JaLangBundle extends LangBundle {

  val stopwords = LangBundle.stopwords("ja")

  /**
   * Shared Kuromoji tokenizer.
   *
   * Built lazily because construction loads the IPADIC dictionary, which
   * is expensive; sharing one instance avoids paying that cost on every
   * call to [[jatokens]]. NOTE(review): assumes the Kuromoji Tokenizer is
   * thread-safe as documented upstream — confirm for the pinned version.
   */
  private lazy val tokenizer = new Tokenizer

  /**
   * Whether the char is in the single-byte (0–255) range.
   *
   * Note: despite the name, this admits Latin-1 characters beyond 7-bit
   * ASCII; its only job is to separate "western" runs (handled by the
   * default tokenizer) from CJK runs (handled by Kuromoji). The lower
   * bound check is omitted because Char is unsigned, so `c >= 0` always
   * holds.
   */
  private def asciichar(c: Char): Boolean = c <= 255.toChar

  /**
   * Split text into maximal runs of ASCII vs non-ASCII characters.
   *
   * @param text the input string
   * @return runs in original order, each paired with true iff the run is ASCII
   */
  private def asciiSplit(text: String): Vector[(String, Boolean)] = {
    val buf = Buffer.empty[(String, Boolean)]
    var rest = text
    while (rest.nonEmpty) {
      val runIsAscii = asciichar(rest.head)
      // span takes the longest prefix of same-class chars, so each
      // iteration strictly shrinks rest and the loop terminates
      val (run, tail) = rest.span(c => asciichar(c) == runIsAscii)
      buf += run -> runIsAscii
      rest = tail
    }
    buf.toVector
  }

  /**
   * Pure Japanese text tokenization using Kuromoji
   *
   * @param text Japanese text to tokenize
   * @return the surface form of each token, in order
   */
  def jatokens(text: String): Vector[String] =
    tokenizer.tokenize(text).asScala.toVector.map(_.getSurface)

  override def tokens(text: String) = {
    /* Split the text into blocks of ASCII and non-ASCII
     * - Tokenize the ASCII with the default method to get hashtags, URLs, emails etc
     * - Tokenize the non-ASCII with Kuromoji
     */
    val tokens = asciiSplit(text).flatMap {
      case (block, isAscii) =>
        if (isAscii) super.tokens(block) else jatokens(block)
    }

    if (tokens.contains("#")) {
      // Post-process to merge a bare "#" with the following word into a
      // single hashtag token: the ASCII tokenizer only recognizes ASCII
      // hashtags, so "#" followed by a Japanese word arrives as two tokens.
      val buf = Buffer.empty[String]
      var j = 0
      while (j < tokens.size) {
        if (tokens(j) == "#" && j + 1 < tokens.size && tokens(j + 1).nonEmpty && tokens(j + 1).head.isLetter) {
          buf += (tokens(j) + tokens(j + 1))
          j += 2 // consumed both the "#" and the word after it
        } else {
          buf += tokens(j)
          j += 1
        }
      }
      buf.toVector
    } else {
      tokens
    }
  }
}
11 changes: 2 additions & 9 deletions src/main/scala/com/peoplepattern/text/LangBundle.scala
Expand Up @@ -262,16 +262,9 @@ object LangBundle {
}
}

// To update with custom language processing for, e.g. Japanese tokenization,
// do something like:
//
// val jaBundle = new LangBundle { /* custom stuff */ }
//
// private val langBundles: Map[String, LangBundle] =
// langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> jaBundle)

// Note how we're appending custom bundles
private val langBundles: Map[String, LangBundle] =
langs.map { lang => lang -> mkBundle(lang) }.toMap
langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> JaLangBundle)

/**
* Look up the [[LangBundle]] by language code
Expand Down
42 changes: 42 additions & 0 deletions src/test/scala/com/peoplepattern/text/JaLangBundleSpec.scala
@@ -0,0 +1,42 @@
package com.peoplepattern.text

import org.scalatest._

class JaLangBundleSpec extends FlatSpec {

  // NOTE(review): several expected vectors below contain empty-string
  // tokens ("") exactly where single CJK characters (particles such as
  // が/の, CJK punctuation such as 。/「) would be expected from Kuromoji
  // output. This looks like single-character tokens were lost when the
  // file was copied/extracted — verify against the original source and
  // restore the missing characters; as written these expectations assert
  // literal empty-string tokens.

  // Basic sentence: mixed particles, a noun, and verb inflection.
  "JaLangBundle" should "tokenize basic text" in {
    val text = "お寿司が食べたい。"
    val expected = Vector("", "寿司", "", "食べ", "たい", "")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // Edge case: empty input must yield no tokens, not a crash.
  it should "handle empty strings" in {
    val text = ""
    val expected = Vector.empty[String]
    assert(JaLangBundle.tokens(text) == expected)
  }

  // An ASCII "." ends the sentence, so the ASCII/non-ASCII split must
  // hand it to the default tokenizer while Kuromoji handles the rest.
  it should "handle western punctuation" in {
    val text = "お寿司が食べたい."
    val expected = Vector("", "寿司", "", "食べ", "たい", ".")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // ASCII hashtags and URLs should survive intact as single tokens,
  // interleaved with Kuromoji-tokenized Japanese text.
  it should "handle hashtags and URLs" in {
    val text = "スターバックス今年最後の限定ドリンクは、まるで”飲むデザート”?!http://bit.ly/1k464Fq #starbucks #deseart"
    val expected = Vector("スター", "バックス", "今年", "最後", "", "限定", "ドリンク", "", "", "まるで", "", "飲む", "デザート", "", "", "", "http://bit.ly/1k464Fq", "#starbucks", "#deseart")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // A Japanese hashtag: the "#" and the following Kuromoji token must be
  // merged into one "#..." token by the post-processing pass.
  it should "get Japanese hashtags" in {
    val text = "究極サンタサクヤちゃんの画面写真です!!!#パズドラ "
    val expected = Vector("究極", "サンタサクヤ", "ちゃん", "", "画面", "写真", "です", "", "", "", "#パズドラ")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // A hashtag mid-text (followed by ":"), plus a URL at the end —
  // exercises the merge pass when "#" is not the final token.
  it should "handle a hashtag in the middle of a text" in {
    val text = "ブログ更新☆ 「第5回東京ガールギークディナー」で女性の役に立つお話を聞いてきたよ #TGGD: テック系女子のイベントで女性のキャリアや考え方に役立つお話を聞いてきましたよ。ふむふむ&あるある!という感じでとても楽しい... http://bit.ly/16bZHC5 "
    val expected = Vector("ブログ", "更新", "", "", "", "5", "", "東京", "ガールギークディナー", "", "", "女性", "", "役に立つ", "お話", "", "聞い", "", "", "", "", "#TGGD", ":", "テック", "", "女子", "", "イベント", "", "女性", "", "キャリア", "", "考え方", "", "役立つ", "お話", "", "聞い", "", "", "まし", "", "", "", "ふむふむ", "", "ある", "ある", "", "という", "感じ", "", "とても", "楽しい", "...", "http://bit.ly/16bZHC5")
    assert(JaLangBundle.tokens(text) == expected)
  }
}

0 comments on commit 42c2ed0

Please sign in to comment.