# n-gram based language ML classifier
### This notebook implements the method described by Cavnar and Trenkle in the paper [N-Gram-Based Text Categorization](http://odur.let.rug.nl/~vannoord/TextCat/textcat.pdf)
n-grams are contiguous segments of size 'n' taken from a given string. Given the sentence _"cavnar and trenkle"_, we get:
- bi-grams: `ca,av,vn,na,ar,r_,_a,an,nd,d_,_t,tr,re,en,nk,kl,le,e_`
- tri-grams: `cav,avn,vna,nar,ar_,r_a,_an,and,nd_,d_t,_tr,tre,ren,enk,nkl,kle,le_`
- quad-grams: `cavn,...`

In addition to the single-letter (one-gram) frequencies that we explored in [A naive approach to language classification](/notebooks/languageclassification/language-detection-letter-freq.snb), n-grams also capture common letter combinations that are typical of a language. n-grams with n>1 also record the start and end of words, adding further features to the language classification model that we can build from them.

## Define data location

In [ ]:
// Root folder of the notebooks, read from the NOTEBOOKS_DIR environment variable.
// Fails with an explicit message instead of the bare "key not found" raised by Map.apply;
// the exception type (NoSuchElementException) is unchanged.
val notebooksFolder = sys.env.getOrElse(
  "NOTEBOOKS_DIR",
  throw new NoSuchElementException(
    "Environment variable NOTEBOOKS_DIR is not set; it must point to the notebooks root folder")
)
// Data folder of the language classification notebooks, relative to the notebooks root.
val baseFolder = s"$notebooksFolder/languageclassification/data"

notebooksFolder: String = /home/maasg/playground/sparkfun/spark-notebooks
baseFolder: String = /home/maasg/playground/sparkfun/spark-notebooks/languageclassification/data


### `NgramLanguageClassifier` becomes a `Model[NgramLanguageClassifier]` to use with an `Estimator`

In [ ]:
import org.apache.spark.ml.Model
import org.apache.spark.sql._
import org.apache.spark.ml.param._
import org.apache.spark.sql.types._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.sql.functions.udf

/**
 * Spark ML `Model` that labels a text column with a language, using ranked n-gram profiles and
 * an "out-of-place" distance between the text profile and each reference profile.
 *
 * @param uid   unique identifier of this model instance
 * @param model reference profiles, one `(language, n-gram -> index)` pair per language; the index
 *              is compared against the n-gram's rank in the text profile (presumably it is the
 *              n-gram's rank in that language's profile -- confirm against the training code)
 */
class NgramLanguageClassifier(override val uid: String, model: Seq[(String, Map[String,Int])]) extends 
Model[NgramLanguageClassifier] with Serializable {

  /** Maximum number of ranked n-grams kept in a text profile; also the penalty for an n-gram missing from a reference profile. */
  val ProfileCutPoint = 300

  /** Largest n-gram size extracted from a text. */
  val MaxNgramSize = 5

  /**
   * Extracts all n-grams of size 1 to `MaxNgramSize` (capped by the cleaned text length).
   *
   * The text is lower-cased, every run of characters that is not a Latin letter (basic or
   * Latin-1 accented), `'` or `’` is replaced by a single `_`, and trailing `_` are removed.
   * For n > 1 the cleaned text is padded with one leading `_` and `n - 1` trailing `_`.
   * One-grams skip the `_` separator.
   */
  val ngrams: String => Seq[String] = str => {
    val cleaned = str
      // Locale.ROOT keeps lower-casing independent of the JVM default locale (e.g. Turkish dotless i).
      .toLowerCase(java.util.Locale.ROOT)
      // Only lower-case letter ranges are listed: the text is already lower-cased, and wider ranges
      // such as `A-z` / `À-ÿ` would also let `[ \ ] ^ _` and the backtick, `×` and `÷` through.
      .replaceAll("[^a-zß-öø-ÿ'’]+", "_")
      .replaceAll("_+$", "")

    def ngramsOfSize(n: Int): Seq[String] =
      if (n == 1) {
        cleaned.collect { case c if c != '_' => c.toString }
      } else {
        val padded = "_" + cleaned + ("_" * (n - 1))
        padded.sliding(n, 1).toVector
      }

    (1 to Math.min(cleaned.length, MaxNgramSize)).flatMap(ngramsOfSize)
  }

  /**
   * Builds the ranked profile of a list of n-grams: the `ProfileCutPoint` most frequent distinct
   * n-grams, each paired with its 0-based rank (0 = most frequent). The order among n-grams with
   * equal counts follows the iteration order of the grouping map.
   */
  val ngramProfile: Seq[String] => Seq[(Int, String)] = ngrams => {
    ngrams
      .groupBy(identity)
      // Strict `map` on purpose: `mapValues` yields a lazy view in Scala 2.11/2.12.
      .map { case (ngram, occurrences) => ngram -> occurrences.size }
      .toSeq
      .sortBy { case (_, count) => -count }
      .take(ProfileCutPoint)
      .zipWithIndex
      .map { case ((ngram, _), rank) => (rank, ngram) }
  }

  /**
   * Classifies a text: for every language in `model`, sums over the text profile the absolute
   * difference between the n-gram's rank in the text and its index in the reference profile
   * (`ProfileCutPoint` when the n-gram is absent), and returns the language with the lowest sum.
   * Throws if `model` is empty (`minBy` on an empty sequence) or if the text is null.
   */
  val classifier: String => String = txt => {
    val profile = (ngrams andThen ngramProfile)(txt)
    val scores = model.map { case (lang, referenceRanks) =>
      val distance = profile.map { case (rank, ngram) =>
        referenceRanks.get(ngram)
          .map(referenceRank => Math.abs(referenceRank - rank))
          .getOrElse(ProfileCutPoint)
      }.sum
      lang -> distance
    }
    scores.minBy(_._2)._1
  }

  /** Function applied to each value of the input column. */
  def createTransformFunc: String => String = classifier

  /** Data type of the output column. */
  def outputDataType: DataType = DataTypes.StringType

  /**
   * Creates a copy of this model with the same uid, reference profiles and parent, with the
   * current params plus `extra` applied.
   *
   * `defaultCopy` cannot be used here: it instantiates the copy reflectively through a
   * constructor taking only the uid `String`, which this class does not have.
   */
  override def copy(extra: org.apache.spark.ml.param.ParamMap): NgramLanguageClassifier =
    copyValues(new NgramLanguageClassifier(uid, model), extra).setParent(parent)

  /**
   * Validates that the input column is a string column and that the output column does not
   * exist yet, then returns the schema with the (non-nullable) output column appended.
   */
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  /** Appends the output column, computed by applying the classifier to the input column. */
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  /** Requires the input column type to be `StringType`. */
  def validateInputType(inputType: DataType): Unit = {
    require(inputType == DataTypes.StringType, s"Bad input type: $inputType. Requires String.")
  }

  /**
   * Param for input column name.
   * @group param
   */
  final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name")

  /** @group getParam */
  final def getInputCol: String = $(inputCol)

  /** @group setParam */
  def setInputCol(value: String): this.type = set(inputCol, value)

  /**
   * Param for input column names. Declared for API compatibility; not read by this model.
   * @group param
   */
  final val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "input column names")

  /** @group getParam */
  final def getInputCols: Array[String] = $(inputCols)

  /**
   * Param for output column name.
   * @group param
   */
  final val outputCol: Param[String] = new Param[String](this, "outputCol", "output column name")

  setDefault(outputCol, uid + "__output")

  /** @group getParam */
  final def getOutputCol: String = $(outputCol)

  /** @group setParam */
  def setOutputCol(value: String): this.type = set(outputCol, value)

}

import org.apache.spark.ml.Model
import org.apache.spark.sql._
import org.apache.spark.ml.param._
import org.apache.spark.sql.types._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.sql.functions.udf
defined class NgramLanguageClassifier


In [ ]:
import org.apache.spark.ml.Estimator
import org.apache.spark.sql._
/**
 * Estimator skeleton for [[NgramLanguageClassifier]].
 *
 * Training is not implemented yet: `fit` and `transformSchema` still throw
 * `NotImplementedError`. The estimator itself can be instantiated and copied.
 *
 * @param uid unique identifier of this estimator instance
 */
class NgramEstimator(override val uid: String) extends Estimator[NgramLanguageClassifier] {

  /** Creates an estimator with a freshly generated uid (the original no-arg constructor). */
  def this() = this(org.apache.spark.ml.util.Identifiable.randomUID("ngramEstimator"))

  /** Not implemented yet. */
  override def fit(ds:Dataset[_]):NgramLanguageClassifier = ???

  /** Creates a copy with the same uid and the current params plus `extra`. */
  override def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark.ml.Estimator[NgramLanguageClassifier] =
    copyValues(new NgramEstimator(uid), extra)

  /** Not implemented yet. */
  def transformSchema(schema: org.apache.spark.sql.types.StructType): org.apache.spark.sql.types.StructType = ???
}

import org.apache.spark.ml.Estimator
import org.apache.spark.sql._
defined class NgramEstimator
