Skip to content

Commit

Permalink
StringUtils, more tests and Scaladoc
Browse files Browse the repository at this point in the history
Including a StringUtils package of convenience methods for strings
  • Loading branch information
eponvert committed Dec 7, 2015
1 parent a1c47a8 commit a54844f
Show file tree
Hide file tree
Showing 5 changed files with 427 additions and 6 deletions.
49 changes: 49 additions & 0 deletions src/main/scala/com/peoplepattern/text/Implicits.scala
@@ -1,23 +1,72 @@
package com.peoplepattern.text

import java.net.{ URL, MalformedURLException }

object Implicits {

  /** Helpers to provide access to [[LangBundle]] and [[StringUtil]] stuff on strings */
  implicit class StringWithAnalysis(str: String) {

    /** The detected language of the string, if predicted */
    lazy val lang: Option[String] = LanguageIdentifier.classify(str).map(_._1)

    /** The language bundle for the string based on its predicted language */
    lazy val bundle: LangBundle = LangBundle.bundleForLang(lang)

    /** The tokens of the string using language specific tokenization */
    lazy val tokens: Vector[String] = bundle.tokens(str)

    /** The terms of the string using language specific tokens & stopwords */
    lazy val terms: Set[String] = bundle.terms(tokens)

    /** The terms + hashtags + @-mentions of the string */
    lazy val termsPlus: Set[String] = bundle.termsPlus(tokens)

    /** The term bigrams of the string */
    lazy val termBigrams: Set[String] = termNgrams(2, 2)

    /** The term 3-grams of the string */
    lazy val termTrigrams: Set[String] = termNgrams(3, 3)

    /**
     * Extract term n-grams from the string
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def termNgrams(min: Int, max: Int): Set[String] = bundle.termNgrams(tokens, min, max)

    /** Whether the string is empty or only white-space */
    def isBlank: Boolean = StringUtil.isBlank(str)

    /** Whether the string is *not* empty or only white-space */
    def nonBlank: Boolean = !isBlank

    /**
     * Extract counts of the char n-grams in the string
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def charNgrams(min: Int, max: Int): Map[String, Int] = StringUtil.charNgrams(str, min, max)

    /** Extract counts of the char bigrams in the string */
    def charBigrams: Map[String, Int] = charNgrams(2, 2)

    /**
     * Extract counts of the char bigrams in the string
     *
     * @param s ignored; retained only for source compatibility with earlier callers
     */
    @deprecated("the argument is ignored; use the parameterless charBigrams")
    def charBigrams(s: String): Map[String, Int] = charNgrams(2, 2)

    /** The term as a URL, if it can be parsed as such */
    lazy val asUrl: Option[URL] = try {
      Some(new URL(str))
    } catch {
      case _: MalformedURLException => None
    }

    /** The string as a simplified URL, if it can be parsed as a URL */
    lazy val simplifiedUrl: Option[String] = asUrl.map(StringUtil.simplifyUrl)

    /** Whether the string can be parsed as a URL */
    def isURL: Boolean = asUrl.nonEmpty

    /** Some(this) if the string is non-empty and doesn't contain only white-space */
    def asOpt: Option[String] = StringUtil.asOpt(str)
  }
}
109 changes: 103 additions & 6 deletions src/main/scala/com/peoplepattern/text/LangBundle.scala
Expand Up @@ -81,14 +81,18 @@ trait LangBundle {
b.toVector
}

/** Language specific stopwords; implemented by each concrete bundle */
def stopwords: Set[String]

/**
 * Whether the string is probably a linguistic term with meaning
 *
 * A term qualifies when it is made up entirely of letters and its
 * lower-cased form is not a stopword.
 */
def isContentTerm(term: String) = {
  val lettersOnly = term.forall(_.isLetter)
  lettersOnly && !stopwords.contains(term.toLowerCase)
}

/**
 * Tokenize the string and extract the set of terms
 *
 * @param text the text to tokenize and extract terms from
 * @return the lower-cased content terms found in the text
 */
def terms(text: String): Set[String] = terms(tokens(text))

/**
 * Extract terms from the sequence of tokens
 *
 * @param tokens the tokens to filter
 * @return the distinct lower-cased tokens that are content terms
 */
def terms(tokens: Seq[String]): Set[String] =
  tokens.collect { case t if isContentTerm(t) => t.toLowerCase }.toSet
Expand All @@ -97,32 +101,54 @@ trait LangBundle {
c.isLetter || c.isDigit || c == '_'
}

/** Whether the string could be a social media hashtag */
def isHashtag(term: String) = {
  // at least "#" plus one character, and everything after '#' is a word/social char
  term.length >= 2 && term.head == '#' && term.drop(1).forall(isSocialThingChar)
}

/** Whether the string could be a social media @-mention */
def isMention(term: String) = {
  // at least "@" plus one character, and everything after '@' is a word/social char
  term.length >= 2 && term.head == '@' && term.drop(1).forall(isSocialThingChar)
}

/**
 * Tokenize the string and extract terms plus hashtags, emoji, @-mentions
 *
 * @param text the text to tokenize and extract from
 */
def termsPlus(text: String): Set[String] = termsPlus(tokens(text))

/**
 * Extract terms plus hashtags, emoji, @-mentions from the token sequence
 *
 * @param tokens the tokens to filter
 */
def termsPlus(tokens: Seq[String]): Set[String] = {
  // keep regular content terms as well as social tokens (@-mentions, #hashtags)
  def keep(w: String) = isContentTerm(w) || isMention(w) || isHashtag(w)
  tokens.collect { case w if keep(w) => w.toLowerCase }.toSet
}

/**
 * Extract the set of term-only n-grams from the text
 *
 * For example from the text "this is the winning team" only the bigram
 * "winning team" would be extracted
 *
 * @param text the text to extract n-grams from
 * @param min the minimum length of extracted n-grams
 * @param max the maximum length of extracted n-grams
 */
def termNgrams(text: String, min: Int, max: Int): Set[String] =
  termNgrams(tokens(text), min, max)

/**
* Extract the set of term-only n-grams from the token sequence
*
* For example from the text "this is the winning team" only the bigram
* "winning team" would be extracted
*
* @param tokens the token sequence to extract n-grams from
* @param min the minimum length of extracted n-grams
* @param max the maximum length of extracted n-grams
*/
def termNgrams(tokens: Seq[String], min: Int, max: Int): Set[String] = {
val seqs = Buffer.empty[Vector[String]]
val thisbf = Buffer.empty[String]
Expand All @@ -144,24 +170,57 @@ trait LangBundle {
termNgrams.toSet
}

/**
 * Extract the set of term-only bigrams from the text
 *
 * For example from the text "this is the winning team" only the bigram
 * "winning team" would be extracted
 *
 * @param text the text to extract bigrams from
 */
def termBigrams(text: String): Set[String] = termNgrams(text, 2, 2)

/**
 * Extract the set of term-only bigrams from the token sequence
 *
 * For example from the tokens of "this is the winning team" only the
 * bigram "winning team" would be extracted
 *
 * @param tokens the token sequence to extract bigrams from
 */
def termBigrams(tokens: Seq[String]): Set[String] = termNgrams(tokens, 2, 2)

/**
 * Extract the set of term-only trigrams from the text
 *
 * For example from the text "this is red sox nation" only the trigram
 * "red sox nation" would be extracted
 *
 * @param text the text to extract n-grams from
 */
def termTrigrams(text: String) = termNgrams(text, 3, 3)

/**
 * Extract the set of term-only trigrams from the token sequence
 *
 * For example from the tokens of "this is red sox nation" only the
 * trigram "red sox nation" would be extracted
 *
 * @param tokens the token sequence to extract n-grams from
 */
def termTrigrams(tokens: Seq[String]) = termNgrams(tokens, 3, 3)
}

/** Helpers and language-specific [[LangBundle]]s */
object LangBundle {

import scala.io.Source

def srcFromResource(path: String) = {
/**
 * Open a [[scala.io.Source]] over a classpath resource; the caller is
 * responsible for closing it.
 *
 * NOTE(review): getResourceAsStream returns null for a missing resource,
 * which would make Source throw an NPE — confirm every supported lang has
 * its resource file.
 */
private def srcFromResource(path: String) = {
  Source.fromInputStream(getClass.getResourceAsStream(path))
}

def stopwords(lang: String) = {
private def stopwords(lang: String) = {
val src = srcFromResource(s"/$lang/stopwords.txt")
try {
src.getLines.toSet
Expand All @@ -170,26 +229,54 @@ object LangBundle {
}
}

def mkBundle(lang: String) = {
/** Build a [[LangBundle]] with the stopwords loaded for the given language */
private def mkBundle(lang: String) = {
  // Bind outside the anonymous class: inside it, `stopwords` would refer
  // to the member being defined rather than the loader helper.
  val loaded = stopwords(lang)
  new LangBundle {
    val stopwords = loaded
  }
}

/** The [[LangBundle]] for German */
lazy val de = mkBundle("de")

/** The [[LangBundle]] for English */
lazy val en = mkBundle("en")

/** The [[LangBundle]] for Spanish */
lazy val es = mkBundle("es")

/** The [[LangBundle]] for French */
lazy val fr = mkBundle("fr")

/** The [[LangBundle]] for Indonesian */
lazy val in = mkBundle("in")
lazy val ja = mkBundle("ja") // TODO improved tokenizer

/**
* The [[LangBundle]] for Japanese
*
* TODO improved tokenizer
*/
lazy val ja = mkBundle("ja")

/** The [[LangBundle]] for Malay */
lazy val ms = mkBundle("ms")

/** The [[LangBundle]] for Dutch */
lazy val nl = mkBundle("nl")

/** The [[LangBundle]] for Portuguese */
lazy val pt = mkBundle("pt")

/** The [[LangBundle]] for Swedish */
lazy val sv = mkBundle("sv")

/** The [[LangBundle]] for Turkish */
lazy val tr = mkBundle("tr")

/** The [[LangBundle]] for Arabic ("ar" is ISO 639-1 Arabic; Armenian would be "hy") */
lazy val ar = mkBundle("ar")

/** The set of supported languages */
def langs = Set(
"de",
"en",
Expand All @@ -205,13 +292,23 @@ object LangBundle {
"ar"
)

/**
 * A language bundle for text for which we don't have an identified language
 *
 * Its stopword set is the union of the stopword lists of all supported
 * languages.
 */
lazy val unk = {
  val allStops: Set[String] = langs.flatMap(stopwords)
  new LangBundle {
    val stopwords = allStops
  }
}

/**
* Look up the [[LangBundle]] by language code
*
* @param lang two-letter ISO 639-1 language code
*/
def bundleForLang(lang: Option[String]): LangBundle = lang match {
case Some("de") => de
case Some("en") => en
Expand Down
78 changes: 78 additions & 0 deletions src/main/scala/com/peoplepattern/text/StringUtil.scala
@@ -0,0 +1,78 @@
package com.peoplepattern.text

import scala.collection.mutable
import java.net.{ URL, MalformedURLException }

object StringUtil {

  /** Determine if the string is null, empty, or contains only white-space */
  def isBlank(s: String): Boolean = Option(s) match {
    case Some(str) => str.trim.isEmpty
    case _ => true // null maps to None
  }

  /**
   * Counts of the character n-grams in the string
   *
   * Only full-length n-grams are counted: if the string is shorter than a
   * requested n-gram size, no partial n-gram is emitted for that size.
   *
   * @param s the string to extract n-grams from
   * @param min the minimum n-gram length to extract
   * @param max the maximum n-gram length to extract
   */
  def charNgrams(s: String, min: Int, max: Int): Map[String, Int] = {
    val mmap = mutable.Map.empty[String, Int].withDefaultValue(0)
    for {
      len <- min to max
      // sliding emits a short final window when len > s.length; skip those
      ngram <- s.sliding(len) if ngram.length == len
    } mmap(ngram) += 1
    mmap.toMap
  }

  /** Counts of the character 2-grams in the string */
  def charBigrams(s: String) = charNgrams(s, 2, 2)

  /** Whether the string parses as a well-formed URL */
  def isUrl(s: String): Boolean = try {
    // The URL constructor either succeeds or throws; it never returns null
    new URL(s)
    true
  } catch {
    case _: MalformedURLException => false
  }

  /**
   * Simplify a URL into something less unique
   *
   * Eg simplifyUrl(http://money.cnn.com/video/technology/2015/11/30/tech-gift-guide-selfie-gadgets.cnnmoney/index.html)
   * is simplified to money.cnn.com/video
   */
  def simplifyUrl(u: URL): String = simplifyUrl(u, None, 1, false)

  /**
   * Simplify a URL into something less unique
   *
   * @param u the URL
   * @param prefix prefix to append to the simplified URL, e.g. "url:"
   * @param path the number of parts of the path to keep, e.g. with path = 2 "video/technology/2015" -> "video/technology"
   * @param keepProtocol if true, keep the "http" or "https" part of the URL
   */
  def simplifyUrl(
    u: URL,
    prefix: Option[String],
    path: Int,
    keepProtocol: Boolean): String = {
    val sb = new StringBuilder
    prefix foreach { p =>
      sb ++= p
    }
    if (keepProtocol) {
      sb ++= u.getProtocol ++= "://"
    }
    sb ++= u.getHost
    if (path > 0) {
      val urlPath = u.getPath
      // drop the leading empty segment produced by the path's leading '/'
      val simplifiedPath = urlPath.split("/").dropWhile(_.isEmpty).take(path)
      if (simplifiedPath.nonEmpty) {
        sb ++= "/" ++= simplifiedPath.mkString("/")
      }
    }
    sb.toString
  }

  /** Convert a string to Some(s) if it's not blank or null, otherwise None */
  def asOpt(s: String): Option[String] = {
    if (isBlank(s)) None else Some(s)
  }
}
13 changes: 13 additions & 0 deletions src/main/scala/com/peoplepattern/text/package.scala
@@ -0,0 +1,13 @@
package com.peoplepattern

/**
 * Text processing utilities for language prediction, tokenization, term extraction etc.
 *
 * <ul>
 * <li>See [[LangBundle]] for tokenization and term extraction</li>
 * <li>See [[LanguageIdentifier]] for language identification</li>
 * <li>See [[StringUtil]] for low-level string helpers</li>
 * <li>See [[Implicits.StringWithAnalysis]] for implicit helper methods for strings</li>
 * </ul>
 */
package object text

0 comments on commit a54844f

Please sign in to comment.