Skip to content

Commit

Permalink
StringUtils, more tests and Scaladoc
Browse files Browse the repository at this point in the history
Including a StringUtils package of convenience methods for strings
  • Loading branch information
eponvert committed Dec 7, 2015
1 parent a1c47a8 commit a54844f
Show file tree
Hide file tree
Showing 5 changed files with 427 additions and 6 deletions.
49 changes: 49 additions & 0 deletions src/main/scala/com/peoplepattern/text/Implicits.scala
@@ -1,23 +1,72 @@
package com.peoplepattern.text

import java.net.{ URL, MalformedURLException }

object Implicits {

  /** Helpers to provide access to [[LangBundle]] and [[StringUtil]] stuff on strings */
  implicit class StringWithAnalysis(str: String) {

    /** The detected language of the string, if predicted */
    lazy val lang: Option[String] = LanguageIdentifier.classify(str).map(_._1)

    /** The language bundle for the string based on its predicted language */
    lazy val bundle: LangBundle = LangBundle.bundleForLang(lang)

    /** The tokens of the string using language specific tokenization */
    lazy val tokens: Vector[String] = bundle.tokens(str)

    /** The terms of the string using language specific tokens & stopwords */
    lazy val terms: Set[String] = bundle.terms(tokens)

    /** The terms + hashtags + @-mentions of the string */
    lazy val termsPlus: Set[String] = bundle.termsPlus(tokens)

    /** The term bigrams of the string */
    lazy val termBigrams: Set[String] = termNgrams(2, 2)

    /** The term 3-grams of the string */
    lazy val termTrigrams: Set[String] = termNgrams(3, 3)

    /**
     * Extract term n-grams from the string
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def termNgrams(min: Int, max: Int): Set[String] = bundle.termNgrams(tokens, min, max)

    /** Whether the string is empty or only white-space */
    def isBlank: Boolean = StringUtil.isBlank(str)

    /** Whether the string is *not* empty or only white-space */
    def nonBlank: Boolean = !isBlank

    /**
     * Extract counts of the char n-grams in the string
     *
     * @param min the minimum n-gram length to extract
     * @param max the maximum n-gram length to extract
     */
    def charNgrams(min: Int, max: Int): Map[String, Int] = StringUtil.charNgrams(str, min, max)

    /** Extract counts of the char bigrams in the string */
    def charBigrams: Map[String, Int] = charNgrams(2, 2)

    /**
     * Extract counts of the char bigrams in the string
     *
     * @param s ignored; retained only for source compatibility with earlier callers
     */
    @deprecated("the argument is ignored; use the parameterless charBigrams")
    def charBigrams(s: String): Map[String, Int] = charNgrams(2, 2)

    /** The term as a URL, if it can be parsed as such */
    lazy val asUrl: Option[URL] = try {
      Some(new URL(str))
    } catch {
      case _: MalformedURLException => None
    }

    /** The string as a simplified URL, if it can be parsed as a URL */
    lazy val simplifiedUrl: Option[String] = asUrl.map(StringUtil.simplifyUrl)

    /** Whether the string can be parsed as a URL */
    def isURL: Boolean = asUrl.nonEmpty

    /** Some(this) if the string is non-empty and doesn't contain only white-space */
    def asOpt: Option[String] = StringUtil.asOpt(str)
  }
}
109 changes: 103 additions & 6 deletions src/main/scala/com/peoplepattern/text/LangBundle.scala
Expand Up @@ -81,14 +81,18 @@ trait LangBundle {
b.toVector
}

/** Language specific stopwords; implemented by each concrete bundle */
def stopwords: Set[String]

/**
 * Whether the string is probably a linguistic term with meaning
 *
 * A term qualifies when it is made up entirely of letters and its
 * lower-cased form is not a stopword.
 */
def isContentTerm(term: String) = {
  val lettersOnly = term.forall(_.isLetter)
  lettersOnly && !stopwords.contains(term.toLowerCase)
}

/**
 * Tokenize the string and extract the set of terms
 *
 * @param text the text to tokenize and extract terms from
 * @return the lower-cased content terms found in the text
 */
def terms(text: String): Set[String] = terms(tokens(text))

/**
 * Extract terms from the sequence of tokens
 *
 * @param tokens the tokens to filter
 * @return the distinct lower-cased tokens that are content terms
 */
def terms(tokens: Seq[String]): Set[String] =
  tokens.collect { case t if isContentTerm(t) => t.toLowerCase }.toSet
Expand All @@ -97,32 +101,54 @@ trait LangBundle {
c.isLetter || c.isDigit || c == '_'
}

/** Whether the string could be a social media hashtag */
def isHashtag(term: String) = {
  // at least "#" plus one character, and everything after '#' is a word/social char
  term.length >= 2 && term.head == '#' && term.drop(1).forall(isSocialThingChar)
}

/** Whether the string could be a social media @-mention */
def isMention(term: String) = {
  // at least "@" plus one character, and everything after '@' is a word/social char
  term.length >= 2 && term.head == '@' && term.drop(1).forall(isSocialThingChar)
}

/**
 * Tokenize the string and extract terms plus hashtags, emoji, @-mentions
 *
 * @param text the text to tokenize and extract from
 */
def termsPlus(text: String): Set[String] = termsPlus(tokens(text))

/**
 * Extract terms plus hashtags, emoji, @-mentions from the token sequence
 *
 * @param tokens the tokens to filter
 */
def termsPlus(tokens: Seq[String]): Set[String] = {
  // keep regular content terms as well as social tokens (@-mentions, #hashtags)
  def keep(w: String) = isContentTerm(w) || isMention(w) || isHashtag(w)
  tokens.collect { case w if keep(w) => w.toLowerCase }.toSet
}

/**
 * Extract the set of term-only n-grams from the text
 *
 * For example from the text "this is the winning team" only the bigram
 * "winning team" would be extracted
 *
 * @param text the text to extract n-grams from
 * @param min the minimum length of extracted n-grams
 * @param max the maximum length of extracted n-grams
 */
def termNgrams(text: String, min: Int, max: Int): Set[String] =
  termNgrams(tokens(text), min, max)

/**
* Extract the set of term-only n-grams from the token sequence
*
* For example from the text "this is the winning team" only the bigram
* "winning team" would be extracted
*
* @param tokens the token sequence to extract n-grams from
* @param min the minimum length of extracted n-grams
* @param max the maximum length of extracted n-grams
*/
def termNgrams(tokens: Seq[String], min: Int, max: Int): Set[String] = {
val seqs = Buffer.empty[Vector[String]]
val thisbf = Buffer.empty[String]
Expand All @@ -144,24 +170,57 @@ trait LangBundle {
termNgrams.toSet
}

/**
 * Extract the set of term-only bigrams from the text
 *
 * For example from the text "this is the winning team" only the bigram
 * "winning team" would be extracted
 *
 * @param text the text to extract bigrams from
 */
def termBigrams(text: String): Set[String] = termNgrams(text, 2, 2)

/**
 * Extract the set of term-only bigrams from the token sequence
 *
 * For example from the tokens of "this is the winning team" only the
 * bigram "winning team" would be extracted
 *
 * @param tokens the token sequence to extract bigrams from
 */
def termBigrams(tokens: Seq[String]): Set[String] = termNgrams(tokens, 2, 2)

/**
 * Extract the set of term-only trigrams from the text
 *
 * For example from the text "this is red sox nation" only the trigram
 * "red sox nation" would be extracted
 *
 * @param text the text to extract n-grams from
 */
def termTrigrams(text: String) = termNgrams(text, 3, 3)

/**
 * Extract the set of term-only trigrams from the token sequence
 *
 * For example from the tokens of "this is red sox nation" only the
 * trigram "red sox nation" would be extracted
 *
 * @param tokens the token sequence to extract n-grams from
 */
def termTrigrams(tokens: Seq[String]) = termNgrams(tokens, 3, 3)
}

/** Helpers and language-specific [[LangBundle]]s */
object LangBundle {

import scala.io.Source

def srcFromResource(path: String) = {
/**
 * Open a [[scala.io.Source]] over a classpath resource; the caller is
 * responsible for closing it.
 *
 * NOTE(review): getResourceAsStream returns null for a missing resource,
 * which would make Source throw an NPE — confirm every supported lang has
 * its resource file.
 */
private def srcFromResource(path: String) = {
  Source.fromInputStream(getClass.getResourceAsStream(path))
}

def stopwords(lang: String) = {
private def stopwords(lang: String) = {
val src = srcFromResource(s"/$lang/stopwords.txt")
try {
src.getLines.toSet
Expand All @@ -170,26 +229,54 @@ object LangBundle {
}
}

def mkBundle(lang: String) = {
/** Build a [[LangBundle]] with the stopwords loaded for the given language */
private def mkBundle(lang: String) = {
  // Bind outside the anonymous class: inside it, `stopwords` would refer
  // to the member being defined rather than the loader helper.
  val loaded = stopwords(lang)
  new LangBundle {
    val stopwords = loaded
  }
}

/** The [[LangBundle]] for German */
lazy val de = mkBundle("de")

/** The [[LangBundle]] for English */
lazy val en = mkBundle("en")

/** The [[LangBundle]] for Spanish */
lazy val es = mkBundle("es")

/** The [[LangBundle]] for French */
lazy val fr = mkBundle("fr")

/** The [[LangBundle]] for Indonesian */
lazy val in = mkBundle("in")
lazy val ja = mkBundle("ja") // TODO improved tokenizer

/**
* The [[LangBundle]] for Japanese
*
* TODO improved tokenizer
*/
lazy val ja = mkBundle("ja")

/** The [[LangBundle]] for Malay */
lazy val ms = mkBundle("ms")

/** The [[LangBundle]] for Dutch */
lazy val nl = mkBundle("nl")

/** The [[LangBundle]] for Portuguese */
lazy val pt = mkBundle("pt")

/** The [[LangBundle]] for Swedish */
lazy val sv = mkBundle("sv")

/** The [[LangBundle]] for Turkish */
lazy val tr = mkBundle("tr")

/** The [[LangBundle]] for Arabic ("ar" is ISO 639-1 Arabic; Armenian would be "hy") */
lazy val ar = mkBundle("ar")

/** The set of supported languages */
def langs = Set(
"de",
"en",
Expand All @@ -205,13 +292,23 @@ object LangBundle {
"ar"
)

/**
 * A language bundle for text for which we don't have an identified language
 *
 * Its stopword set is the union of the stopword lists of all supported
 * languages.
 */
lazy val unk = {
  val allStops: Set[String] = langs.flatMap(stopwords)
  new LangBundle {
    val stopwords = allStops
  }
}

/**
* Look up the [[LangBundle]] by language code
*
* @param lang two-letter ISO 639-1 language code
*/
def bundleForLang(lang: Option[String]): LangBundle = lang match {
case Some("de") => de
case Some("en") => en
Expand Down
78 changes: 78 additions & 0 deletions src/main/scala/com/peoplepattern/text/StringUtil.scala
@@ -0,0 +1,78 @@
package com.peoplepattern.text

import scala.collection.mutable
import java.net.{ URL, MalformedURLException }

object StringUtil {

  /** Determine if the string is null, empty, or contains only white-space */
  def isBlank(s: String): Boolean = Option(s) match {
    case Some(str) => str.trim.isEmpty
    case _ => true // null maps to None
  }

  /**
   * Counts of the character n-grams in the string
   *
   * Only full-length n-grams are counted: if the string is shorter than a
   * requested n-gram size, no partial n-gram is emitted for that size.
   *
   * @param s the string to extract n-grams from
   * @param min the minimum n-gram length to extract
   * @param max the maximum n-gram length to extract
   */
  def charNgrams(s: String, min: Int, max: Int): Map[String, Int] = {
    val mmap = mutable.Map.empty[String, Int].withDefaultValue(0)
    for {
      len <- min to max
      // sliding emits a short final window when len > s.length; skip those
      ngram <- s.sliding(len) if ngram.length == len
    } mmap(ngram) += 1
    mmap.toMap
  }

  /** Counts of the character 2-grams in the string */
  def charBigrams(s: String) = charNgrams(s, 2, 2)

  /** Whether the string parses as a well-formed URL */
  def isUrl(s: String): Boolean = try {
    // The URL constructor either succeeds or throws; it never returns null
    new URL(s)
    true
  } catch {
    case _: MalformedURLException => false
  }

  /**
   * Simplify a URL into something less unique
   *
   * Eg simplifyUrl(http://money.cnn.com/video/technology/2015/11/30/tech-gift-guide-selfie-gadgets.cnnmoney/index.html)
   * is simplified to money.cnn.com/video
   */
  def simplifyUrl(u: URL): String = simplifyUrl(u, None, 1, false)

  /**
   * Simplify a URL into something less unique
   *
   * @param u the URL
   * @param prefix prefix to append to the simplified URL, e.g. "url:"
   * @param path the number of parts of the path to keep, e.g. with path = 2 "video/technology/2015" -> "video/technology"
   * @param keepProtocol if true, keep the "http" or "https" part of the URL
   */
  def simplifyUrl(
    u: URL,
    prefix: Option[String],
    path: Int,
    keepProtocol: Boolean): String = {
    val sb = new StringBuilder
    prefix foreach { p =>
      sb ++= p
    }
    if (keepProtocol) {
      sb ++= u.getProtocol ++= "://"
    }
    sb ++= u.getHost
    if (path > 0) {
      val urlPath = u.getPath
      // drop the leading empty segment produced by the path's leading '/'
      val simplifiedPath = urlPath.split("/").dropWhile(_.isEmpty).take(path)
      if (simplifiedPath.nonEmpty) {
        sb ++= "/" ++= simplifiedPath.mkString("/")
      }
    }
    sb.toString
  }

  /** Convert a string to Some(s) if it's not blank or null, otherwise None */
  def asOpt(s: String): Option[String] = {
    if (isBlank(s)) None else Some(s)
  }
}
13 changes: 13 additions & 0 deletions src/main/scala/com/peoplepattern/text/package.scala
@@ -0,0 +1,13 @@
package com.peoplepattern

/**
 * Text processing utilities for language prediction, tokenization, term extraction etc.
 *
 * <ul>
 * <li>See [[LangBundle]] for tokenization and term extraction</li>
 * <li>See [[LanguageIdentifier]] for language identification</li>
 * <li>See [[StringUtil]] for low-level string helpers</li>
 * <li>See [[Implicits.StringWithAnalysis]] for implicit helper methods for strings</li>
 * </ul>
 */
package object text

0 comments on commit a54844f

Please sign in to comment.