Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
StringUtils, more tests and Scaladoc
Including a StringUtils package of convenience methods for strings
- Loading branch information
Showing
5 changed files
with
427 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,72 @@ | ||
package com.peoplepattern.text | ||
|
||
import java.net.{ URL, MalformedURLException } | ||
|
||
object Implicits {

  /**
   * Enriches a plain `String` with helpers backed by [[LanguageIdentifier]],
   * [[LangBundle]] and [[StringUtil]]
   *
   * The analysis results are `lazy val`s, so each is computed at most once per
   * wrapper instance and only when first requested.
   */
  implicit class StringWithAnalysis(str: String) {

    /** The detected language of the string, if one was predicted */
    lazy val lang: Option[String] = LanguageIdentifier.classify(str).map(_._1)

    /** The language bundle selected from the string's predicted language */
    lazy val bundle: LangBundle = LangBundle.bundleForLang(lang)

    /** The tokens of the string, as produced by the bundle's tokenizer */
    lazy val tokens: Vector[String] = bundle.tokens(str)

    /** The terms the bundle extracts from the string's tokens */
    lazy val terms: Set[String] = bundle.terms(tokens)

    /** The terms + hashtags + @-mentions of the string */
    lazy val termsPlus: Set[String] = bundle.termsPlus(tokens)

    /** The term 2-grams of the string */
    lazy val termBigrams: Set[String] = termNgrams(min = 2, max = 2)

    /** The term 3-grams of the string */
    lazy val termTrigrams: Set[String] = termNgrams(min = 3, max = 3)

    /**
     * Extract term n-grams from the string's tokens
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def termNgrams(min: Int, max: Int) = bundle.termNgrams(tokens, min, max)

    /** Whether the string is blank according to [[StringUtil.isBlank]] */
    def isBlank: Boolean = StringUtil.isBlank(str)

    /** The negation of [[isBlank]] */
    def nonBlank: Boolean = !isBlank

    /**
     * Extract counts of the char n-grams in the string
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def charNgrams(min: Int, max: Int): Map[String, Int] =
      StringUtil.charNgrams(str, min, max)

    /**
     * Extract counts of the char bigrams in the wrapped string
     *
     * @param s not used: the bigrams are always computed from the wrapped string.
     *          The parameter is kept only so existing callers keep compiling.
     */
    def charBigrams(s: String): Map[String, Int] = charNgrams(min = 2, max = 2)

    /** The string as a URL, or None if `java.net.URL` rejects it as malformed */
    lazy val asUrl: Option[URL] =
      try Some(new URL(str))
      catch { case _: MalformedURLException => None }

    /** The string as a simplified URL, if it can be parsed as a URL */
    lazy val simplifiedUrl: Option[String] =
      asUrl.map(url => StringUtil.simplifyUrl(url))

    /** Whether the string can be parsed as a URL */
    def isURL: Boolean = asUrl.isDefined

    /** The result of [[StringUtil.asOpt]] applied to the string */
    def asOpt: Option[String] = StringUtil.asOpt(str)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
package com.peoplepattern.text | ||
|
||
import scala.collection.mutable | ||
import java.net.{ URL, MalformedURLException } | ||
|
||
object StringUtil {

  /**
   * Determine if the string is null, empty, or only contains white-space
   *
   * @param s the string to test, may be null
   * @return true if `s` is null or is empty after trimming
   */
  def isBlank(s: String): Boolean = Option(s).forall(_.trim.isEmpty)

  /**
   * Counts of the character n-grams in the string
   *
   * Only complete n-grams are counted: a string shorter than a requested
   * length contributes nothing for that length.
   *
   * @param s the string to extract n-grams from
   * @param min the minimum n-gram length to extract
   * @param max the maximum n-gram length to extract
   * @return a map from each n-gram to the number of times it occurs in `s`
   */
  def charNgrams(s: String, min: Int, max: Int): Map[String, Int] = {
    val counts = mutable.Map.empty[String, Int].withDefaultValue(0)
    for {
      len <- min to max
      ngram <- s.sliding(len)
      // `sliding` emits a single short window when `s` is shorter than `len`;
      // that fragment is not a real n-gram of the requested length, so skip it
      if ngram.length == len
    } counts(ngram) += 1
    counts.toMap
  }

  /** Counts of the character 2-grams in the string */
  def charBigrams(s: String): Map[String, Int] = charNgrams(s, 2, 2)

  /**
   * Whether the whole string can be parsed as a URL by `java.net.URL`
   *
   * @param s the candidate URL string
   * @return false if constructing the URL throws `MalformedURLException`, true otherwise
   */
  def isUrl(s: String): Boolean = try {
    val u = new URL(s)
    u != null
  } catch {
    case _: MalformedURLException => false
  }

  /**
   * Simplify a URL into something less unique
   *
   * Keeps the host and the first path segment, with no prefix and no protocol.
   *
   * Eg simplifyUrl(http://money.cnn.com/video/technology/2015/11/30/tech-gift-guide-selfie-gadgets.cnnmoney/index.html)
   * is simplified to money.cnn.com/video
   */
  def simplifyUrl(u: URL): String =
    simplifyUrl(u, prefix = None, path = 1, keepProtocol = false)

  /**
   * Simplify a URL into something less unique
   *
   * The result is built as: prefix (if any), protocol + "://" (if requested),
   * host, then "/" and the kept path segments (if any).
   *
   * @param u the URL
   * @param prefix prefix to prepend to the simplified URL, e.g. "url:"
   * @param path the number of parts of the path to keep, e.g. with path = 2 "video/technology/2015" -> "video/technology";
   *             no path is kept when this is not positive
   * @param keepProtocol if true, keep the "http" or "https" part of the URL
   */
  def simplifyUrl(
    u: URL,
    prefix: Option[String],
    path: Int,
    keepProtocol: Boolean): String = {
    val sb = new StringBuilder
    prefix foreach { p =>
      sb ++= p
    }
    if (keepProtocol) {
      sb ++= u.getProtocol ++= "://"
    }
    sb ++= u.getHost
    if (path > 0) {
      val urlPath = u.getPath
      // Drop the empty segment(s) produced by the leading "/" before counting
      val simplifiedPath = urlPath.split("/").dropWhile(_.isEmpty).take(path)
      if (simplifiedPath.nonEmpty) {
        sb ++= "/" ++= simplifiedPath.mkString("/")
      }
    }
    sb.toString
  }

  /** Convert a string to Some(s) if it's not blank or null, otherwise None */
  def asOpt(s: String): Option[String] = {
    if (isBlank(s)) None else Some(s)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package com.peoplepattern | ||
|
||
/**
 * Text processing utilities for language prediction, tokenization, term extraction etc
 *
 *  - See [[LangBundle]] for tokenization and term extraction
 *  - See [[LanguageIdentifier]] for language identification
 *  - See [[StringUtil]] for low-level string helpers
 *  - See [[Implicits.StringWithAnalysis]] for implicit helper methods for strings
 */
package object text
Oops, something went wrong.