Skip to content

Commit

Permalink
Accept PR #16
Browse files Browse the repository at this point in the history
  • Loading branch information
eponvert committed Dec 21, 2015
2 parents bb70fad + 9c1b390 commit 42c2ed0
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Added
- Russian support
- Japanese support

### Changed

- Fixed configuration error, restored support for Dutch, Portuguese, Swedish,
Expand Down
9 changes: 6 additions & 3 deletions build.sbt
Expand Up @@ -18,11 +18,14 @@ scalacOptions in ThisBuild ++= Seq(
)

libraryDependencies ++= Seq(
"com.carrotsearch" % "langid-java" % "1.0.0",
"com.typesafe" % "config" % "1.3.0",
"org.scalatest" %% "scalatest" % "2.2.5" % "test"
"com.carrotsearch" % "langid-java" % "1.0.0",
"com.typesafe" % "config" % "1.3.0",
"com.atilika.kuromoji" % "kuromoji-ipadic" % "0.9.0",
"org.scalatest" %% "scalatest" % "2.2.5" % "test"
)

initialCommands := "import com.peoplepattern.text._"

scalariformSettings

licenses := Seq("Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0"))
Expand Down
81 changes: 81 additions & 0 deletions src/main/scala/com/peoplepattern/text/JaLangBundle.scala
@@ -0,0 +1,81 @@
package com.peoplepattern.text

import scala.collection.mutable.Buffer
import scala.collection.JavaConverters._
import com.atilika.kuromoji.ipadic.Tokenizer

/**
* Custom language bundle for Japanese
*
* Uses Kuromoji for tokenization https://github.com/atilika/kuromoji
*/
object JaLangBundle extends LangBundle {

  val stopwords = LangBundle.stopwords("ja")

  /**
   * Shared Kuromoji tokenizer.
   *
   * Built lazily because construction loads the IPADIC dictionary, which
   * is expensive; sharing one instance avoids paying that cost on every
   * call to [[jatokens]]. NOTE(review): assumes the Kuromoji Tokenizer is
   * thread-safe as documented upstream — confirm for the pinned version.
   */
  private lazy val tokenizer = new Tokenizer

  /**
   * Whether the char is in the single-byte (0–255) range.
   *
   * Note: despite the name, this admits Latin-1 characters beyond 7-bit
   * ASCII; its only job is to separate "western" runs (handled by the
   * default tokenizer) from CJK runs (handled by Kuromoji). The lower
   * bound check is omitted because Char is unsigned, so `c >= 0` always
   * holds.
   */
  private def asciichar(c: Char): Boolean = c <= 255.toChar

  /**
   * Split text into maximal runs of ASCII vs non-ASCII characters.
   *
   * @param text the input string
   * @return runs in original order, each paired with true iff the run is ASCII
   */
  private def asciiSplit(text: String): Vector[(String, Boolean)] = {
    val buf = Buffer.empty[(String, Boolean)]
    var rest = text
    while (rest.nonEmpty) {
      val runIsAscii = asciichar(rest.head)
      // span takes the longest prefix of same-class chars, so each
      // iteration strictly shrinks rest and the loop terminates
      val (run, tail) = rest.span(c => asciichar(c) == runIsAscii)
      buf += run -> runIsAscii
      rest = tail
    }
    buf.toVector
  }

  /**
   * Pure Japanese text tokenization using Kuromoji
   *
   * @param text Japanese text to tokenize
   * @return the surface form of each token, in order
   */
  def jatokens(text: String): Vector[String] =
    tokenizer.tokenize(text).asScala.toVector.map(_.getSurface)

  override def tokens(text: String) = {
    /* Split the text into blocks of ASCII and non-ASCII
     * - Tokenize the ASCII with the default method to get hashtags, URLs, emails etc
     * - Tokenize the non-ASCII with Kuromoji
     */
    val tokens = asciiSplit(text).flatMap {
      case (block, isAscii) =>
        if (isAscii) super.tokens(block) else jatokens(block)
    }

    if (tokens.contains("#")) {
      // Post-process to merge a bare "#" with the following word into a
      // single hashtag token: the ASCII tokenizer only recognizes ASCII
      // hashtags, so "#" followed by a Japanese word arrives as two tokens.
      val buf = Buffer.empty[String]
      var j = 0
      while (j < tokens.size) {
        if (tokens(j) == "#" && j + 1 < tokens.size && tokens(j + 1).nonEmpty && tokens(j + 1).head.isLetter) {
          buf += (tokens(j) + tokens(j + 1))
          j += 2 // consumed both the "#" and the word after it
        } else {
          buf += tokens(j)
          j += 1
        }
      }
      buf.toVector
    } else {
      tokens
    }
  }
}
11 changes: 2 additions & 9 deletions src/main/scala/com/peoplepattern/text/LangBundle.scala
Expand Up @@ -262,16 +262,9 @@ object LangBundle {
}
}

// To update with custom language processing for, e.g. Japanese tokenization,
// do something like:
//
// val jaBundle = new LangBundle { /* custom stuff */ }
//
// private val langBundles: Map[String, LangBundle] =
// langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> jaBundle)

// Note how we're appending custom bundles
private val langBundles: Map[String, LangBundle] =
langs.map { lang => lang -> mkBundle(lang) }.toMap
langs.map { lang => lang -> mkBundle(lang) }.toMap + ("ja" -> JaLangBundle)

/**
* Look up the [[LangBundle]] by language code
Expand Down
42 changes: 42 additions & 0 deletions src/test/scala/com/peoplepattern/text/JaLangBundleSpec.scala
@@ -0,0 +1,42 @@
package com.peoplepattern.text

import org.scalatest._

class JaLangBundleSpec extends FlatSpec {

  // NOTE(review): several expected vectors below contain empty-string
  // tokens ("") exactly where single CJK characters (particles such as
  // が/の, CJK punctuation such as 。/「) would be expected from Kuromoji
  // output. This looks like single-character tokens were lost when the
  // file was copied/extracted — verify against the original source and
  // restore the missing characters; as written these expectations assert
  // literal empty-string tokens.

  // Basic sentence: mixed particles, a noun, and verb inflection.
  "JaLangBundle" should "tokenize basic text" in {
    val text = "お寿司が食べたい。"
    val expected = Vector("", "寿司", "", "食べ", "たい", "")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // Edge case: empty input must yield no tokens, not a crash.
  it should "handle empty strings" in {
    val text = ""
    val expected = Vector.empty[String]
    assert(JaLangBundle.tokens(text) == expected)
  }

  // An ASCII "." ends the sentence, so the ASCII/non-ASCII split must
  // hand it to the default tokenizer while Kuromoji handles the rest.
  it should "handle western punctuation" in {
    val text = "お寿司が食べたい."
    val expected = Vector("", "寿司", "", "食べ", "たい", ".")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // ASCII hashtags and URLs should survive intact as single tokens,
  // interleaved with Kuromoji-tokenized Japanese text.
  it should "handle hashtags and URLs" in {
    val text = "スターバックス今年最後の限定ドリンクは、まるで”飲むデザート”?!http://bit.ly/1k464Fq #starbucks #deseart"
    val expected = Vector("スター", "バックス", "今年", "最後", "", "限定", "ドリンク", "", "", "まるで", "", "飲む", "デザート", "", "", "", "http://bit.ly/1k464Fq", "#starbucks", "#deseart")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // A Japanese hashtag: the "#" and the following Kuromoji token must be
  // merged into one "#..." token by the post-processing pass.
  it should "get Japanese hashtags" in {
    val text = "究極サンタサクヤちゃんの画面写真です!!!#パズドラ "
    val expected = Vector("究極", "サンタサクヤ", "ちゃん", "", "画面", "写真", "です", "", "", "", "#パズドラ")
    assert(JaLangBundle.tokens(text) == expected)
  }

  // A hashtag mid-text (followed by ":"), plus a URL at the end —
  // exercises the merge pass when "#" is not the final token.
  it should "handle a hashtag in the middle of a text" in {
    val text = "ブログ更新☆ 「第5回東京ガールギークディナー」で女性の役に立つお話を聞いてきたよ #TGGD: テック系女子のイベントで女性のキャリアや考え方に役立つお話を聞いてきましたよ。ふむふむ&あるある!という感じでとても楽しい... http://bit.ly/16bZHC5 "
    val expected = Vector("ブログ", "更新", "", "", "", "5", "", "東京", "ガールギークディナー", "", "", "女性", "", "役に立つ", "お話", "", "聞い", "", "", "", "", "#TGGD", ":", "テック", "", "女子", "", "イベント", "", "女性", "", "キャリア", "", "考え方", "", "役立つ", "お話", "", "聞い", "", "", "まし", "", "", "", "ふむふむ", "", "ある", "ある", "", "という", "感じ", "", "とても", "楽しい", "...", "http://bit.ly/16bZHC5")
    assert(JaLangBundle.tokens(text) == expected)
  }
}

0 comments on commit 42c2ed0

Please sign in to comment.