Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
135 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package com.peoplepattern.text | ||
|
||
import scala.collection.mutable.Buffer | ||
import scala.collection.JavaConverters._ | ||
import com.atilika.kuromoji.ipadic.Tokenizer | ||
|
||
/**
 * Custom language bundle for Japanese
 *
 * Uses Kuromoji for tokenization https://github.com/atilika/kuromoji
 *
 * Input is cut into alternating runs of Latin-1 and non-Latin-1 characters.
 * Latin-1 runs are handed to the inherited tokenizer, everything else to
 * Kuromoji, and a final pass glues a bare "#" onto the token that follows it.
 */
object JaLangBundle extends LangBundle {

  val stopwords = LangBundle.stopwords("ja")

  /**
   * Shared Kuromoji tokenizer, built on first use.
   *
   * Constructing a Tokenizer is costly, so a single instance is reused
   * instead of creating one per call.
   * NOTE(review): relies on Kuromoji's `tokenize` being safe to call
   * concurrently on one instance -- confirm against the Kuromoji version in use.
   */
  private lazy val tokenizer = new Tokenizer

  /**
   * Whether the char belongs to a block handled by the inherited tokenizer.
   *
   * Despite the name this accepts every char up to code point 255 (Latin-1),
   * not only 7-bit ASCII. A Char is unsigned, so no lower bound is needed.
   */
  private def asciichar(c: Char): Boolean = c <= 255.toChar

  /**
   * Split the text into maximal runs of chars that agree on `asciichar`.
   *
   * @param text the text to split; may be empty
   * @return the runs in order of appearance, each paired with `true` when the
   *         run satisfies `asciichar` and `false` otherwise; empty for empty text
   */
  private def asciiSplit(text: String): Vector[(String, Boolean)] = {
    val blocks = Buffer.empty[(String, Boolean)]
    var start = 0
    while (start < text.length) {
      val isAscii = asciichar(text.charAt(start))
      // Advance to the first char whose class differs from the run's first char
      var end = start + 1
      while (end < text.length && asciichar(text.charAt(end)) == isAscii) {
        end += 1
      }
      blocks += ((text.substring(start, end), isAscii))
      start = end
    }
    blocks.toVector
  }

  /**
   * Pure japanese text tokenization using Kuromoji
   *
   * @param text the text to tokenize
   * @return the surface form of every token Kuromoji produced, in order
   */
  def jatokens(text: String): Vector[String] =
    tokenizer.tokenize(text).asScala.toVector.map(_.getSurface)

  /**
   * Tokenize text that may mix Japanese with Latin-1 content.
   *
   * @param text the text to tokenize
   * @return the tokens in order of appearance
   */
  override def tokens(text: String): Vector[String] = {
    /* Split the text into blocks of Latin-1 and non-Latin-1 chars
     * - Tokenize the Latin-1 blocks with the default method to get hashtags, URLs, emails etc
     * - Tokenize the remaining blocks with Kuromoji
     */
    val raw = asciiSplit(text).flatMap {
      case (block, isAscii) =>
        if (isAscii) super.tokens(block) else jatokens(block)
    }

    if (!raw.contains("#")) {
      raw
    } else {
      // A "#" left on its own sat directly before a non-Latin-1 block, so
      // re-attach it to the next token to recover Japanese hashtags.
      val merged = Buffer.empty[String]
      var i = 0
      while (i < raw.size) {
        val current = raw(i)
        // Only merge when the following token exists and starts with a letter
        val startsHashtag =
          current == "#" && i + 1 < raw.size && raw(i + 1).headOption.exists(_.isLetter)
        if (startsHashtag) {
          merged += current + raw(i + 1)
          i += 2
        } else {
          merged += current
          i += 1
        }
      }
      merged.toVector
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
src/test/scala/com/peoplepattern/text/JaLangBundleSpec.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package com.peoplepattern.text | ||
|
||
import org.scalatest._ | ||
|
||
class JaLangBundleSpec extends FlatSpec {

  /** Asserts that tokenizing `input` with JaLangBundle yields exactly `wanted`. */
  private def expectTokens(input: String, wanted: Vector[String]) =
    assert(JaLangBundle.tokens(input) == wanted)

  "JaLangBundle" should "tokenize basic text" in {
    expectTokens(
      "お寿司が食べたい。",
      Vector("お", "寿司", "が", "食べ", "たい", "。")
    )
  }

  it should "handle empty strings" in {
    // Empty input must produce no tokens at all
    expectTokens("", Vector.empty[String])
  }

  it should "handle western punctuation" in {
    expectTokens(
      "お寿司が食べたい.",
      Vector("お", "寿司", "が", "食べ", "たい", ".")
    )
  }

  it should "handle hashtags and URLs" in {
    val input = "スターバックス今年最後の限定ドリンクは、まるで”飲むデザート”?!http://bit.ly/1k464Fq #starbucks #deseart"
    val wanted = Vector("スター", "バックス", "今年", "最後", "の", "限定", "ドリンク", "は", "、", "まるで", "”", "飲む", "デザート", "”", "?", "!", "http://bit.ly/1k464Fq", "#starbucks", "#deseart")
    expectTokens(input, wanted)
  }

  it should "get Japanese hashtags" in {
    val input = "究極サンタサクヤちゃんの画面写真です!!!#パズドラ "
    val wanted = Vector("究極", "サンタサクヤ", "ちゃん", "の", "画面", "写真", "です", "!", "!", "!", "#パズドラ")
    expectTokens(input, wanted)
  }

  it should "handle a hashtag in the middle of a text" in {
    val input = "ブログ更新☆ 「第5回東京ガールギークディナー」で女性の役に立つお話を聞いてきたよ #TGGD: テック系女子のイベントで女性のキャリアや考え方に役立つお話を聞いてきましたよ。ふむふむ&あるある!という感じでとても楽しい... http://bit.ly/16bZHC5 "
    val wanted = Vector("ブログ", "更新", "☆", "「", "第", "5", "回", "東京", "ガールギークディナー", "」", "で", "女性", "の", "役に立つ", "お話", "を", "聞い", "て", "き", "た", "よ", "#TGGD", ":", "テック", "系", "女子", "の", "イベント", "で", "女性", "の", "キャリア", "や", "考え方", "に", "役立つ", "お話", "を", "聞い", "て", "き", "まし", "た", "よ", "。", "ふむふむ", "&", "ある", "ある", "!", "という", "感じ", "で", "とても", "楽しい", "...", "http://bit.ly/16bZHC5")
    expectTokens(input, wanted)
  }
}