# Amazon-reviews predictions with Spark ML

In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf

val conf = new SparkConf()
      .setMaster("yarn")
      .set("spark.executor.memory", "4g")
      .set("spark.driver.memory", "4g")
      .set("spark.driver.maxResultSize", "2g")
      .set("spark.executor.instances", "5")
      .set("spark.executor.cores", "4")
      .set("spark.default.parallelism", "20")

// Initialize SparkSession
val sc = SparkSession.builder.config(conf).getOrCreate()

Intitializing Scala interpreter ...

## Read data and transform to RDD

In [None]:
val K = 75
val file_path_stopwords = "../data/stopwords.txt"
val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
// val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json"

val tokenizePattern = "[^a-zA-Z<>^|]+"

In [None]:
%%time
val df = sc.read.json(file_path_reviews).select("category", "reviewText")

In [None]:
import scala.io.Source.fromFile

val stopWords = fromFile(file_path_stopwords).getLines.toArray

### Helpers

In [None]:
import java.io.PrintWriter
import org.apache.spark.rdd.RDD
import scala.collection.immutable.TreeSet

def writeRDDToFile(rdd: RDD[(String, Seq[(String, Double)])], filePath: String): Unit = {
    var mergedTerms = TreeSet[String]()
    val writer = new PrintWriter(filePath)
    
    val collectedData = rdd.sortByKey().collect()

    for ((category, topk) <- collectedData) {
        val topKStr = topk.map { case (term, chiSquared) => 
            mergedTerms += term
            s"$term:$chiSquared"
        }.mkString(" ")
        writer.println(s"<$category> $topKStr")
    }
    
    writer.print(mergedTerms.mkString(" "))
    
    writer.close()
}


In [None]:
import java.io.PrintWriter

def writeArrToFile(arr: Array[String], filePath: String) = {
    val writer = new PrintWriter(filePath)
    
    writer.println(arr.mkString(" "))
    writer.close()
}

In [None]:
import java.io.PrintWriter

def evaluateGridSearch(paramMaps: Array[org.apache.spark.ml.param.ParamMap], validationMetrics: Array[Double], filePath: String) = {
    val paramsAndMetrics = paramMaps.zip(validationMetrics)
    val writer = new PrintWriter(filePath)
    
    writer.println("maxIter, NumTopFeatures, regParam, standardization, f1-score")

    paramsAndMetrics.foreach { case (paramMap, metric) =>
        val maxIter = paramMap.get(lscv.maxIter).head
        val NumTopFeatures = paramMap.get(selector.numTopFeatures).head
        val regParam = paramMap.get(lscv.regParam).head
        val standardization = paramMap.get(lscv.standardization).head

        writer.println(s"$maxIter, $NumTopFeatures, $regParam, $standardization, ${metric}")
    }

    writer.close()
}

## Calculate Chi-Square
This approaches, first calculates the different number of documents per category and afterward calculates the chi-squared values per term per category.

In [None]:
val rdd = df.rdd

In [None]:
%%time
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

def preprocessing(row: Row): Seq[((String, Option[String]), Int)] = {
  val category = row.getString(0)
  val reviewText = row.getString(1)

  val terms = reviewText
    .toLowerCase()
    .split(tokenizePattern)
    .filter(token => token.length > 1 && !stopWords.contains(token))
    .toSet

  val counts = Seq(((category, None), 1)) ++ terms.map(token => ((category, Some(token)), 1))
  counts.toSeq
}

def tokenToKey(row: ((String, Option[String]), Int)): (Option[String], (String, Int)) = {
  val ((category, token), count) = row
  (token, (category, count))
}

def tokenSum(row: (Option[String], Iterable[(String, Int)])): Iterable[(String, (Option[String], Int, Int))] = {
  val (token, values) = row
  val counts = values.groupBy(_._1).mapValues(_.map(_._2).sum)
  val n_t = counts.values.sum

  counts.map { case (category, count) => (category, (token, count, n_t)) }
}

def chiSquared(row: (String, Iterable[(Option[String], Int, Int)])): (String, Seq[(String, Double)]) = {
  val (category, values) = row
  val counts = values.map { case (token, count, n_t) => token -> (count, n_t) }.toMap
  val (n_c, n) = counts.getOrElse(None, (0, counts.values.map(_._2).sum))

  val results = counts
    .collect {
      case (Some(token), (a, n_t)) =>
        val b = n_t - a
        val c = n_c - a
        val d = n - a - b - c
        val chiSquaredValue = n.toDouble * math.pow(a * d - b * c, 2) / ((a + b).toDouble * (a + c) * (b + d) * (c + d))
      
        (token, chiSquaredValue)
    }
    .toSeq
    .sortBy(-_._2)
    .take(K)

  (category, results)
}

In [None]:
%%time
val topTermsPerCategory = rdd
  .flatMap(preprocessing)
  .reduceByKey(_ + _)
  .map(tokenToKey)
  .groupByKey()
  .flatMap(tokenSum)
  .groupByKey()
  .map(chiSquared)
  .sortByKey()

In [None]:
%%time
writeRDDToFile(topTermsPerCategory, "../output_rdd.txt")

## Datasets/DataFrames: Spark ML and Pipelines

First, create all the necessary transformers!

In [None]:
import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}

val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("category_index")

val tokenizer = new RegexTokenizer()
    .setInputCol("reviewText")
    .setOutputCol("raw_terms")
    .setMinTokenLength(2)
    .setPattern(tokenizePattern)
    .setToLowercase(true)

val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("terms")
    .setStopWords(stopWords)

val countVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("raw_features")
    .setMinDF(1)

val idf = new IDF()
    .setInputCol(countVectorizer.getOutputCol)
    .setOutputCol("features")

Afterward, we create the Chi^2-Selector

In [None]:
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}

val selector = new ChiSqSelector()
  .setNumTopFeatures(2000)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("category_index")
  .setOutputCol("selectedFeatures")

Lastly, we create the pipeline to execute all the transformers and select the top K features

In [None]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector))

After creation of the pipeline, we can now fit it to our data, we want to transform

In [None]:
%%time
val model = pipeline.fit(df)

Afterward, we can extract the vocabulary and selected features to map them

In [None]:
val vocabulary = model.stages(2).asInstanceOf[CountVectorizerModel].vocabulary
val selectedFeatures = model.stages.last.asInstanceOf[ChiSqSelectorModel].selectedFeatures

In [None]:
import scala.util.Sorting.quickSort

val top2000terms = selectedFeatures.map(i => vocabulary(i))
quickSort(top2000terms)

In [None]:
%%time
writeArrToFile(top2000terms, "../output_ds.txt")

## Text Classification

In [27]:
val seed = 12041500
val fraction = 0.5

seed: Int = 12041500
fraction: Double = 0.5


In [29]:
val sampledDF = df.sample(true, fraction, seed)

sampledDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [30]:
val Array(training, test) = sampledDF.randomSplit(Array(0.8, 0.2), seed = seed)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [31]:
import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normFeatures")
  .setP(2.0)

import org.apache.spark.ml.feature.Normalizer
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_a52da129e251, p=2.0


In [32]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}

val lsvc = new LinearSVC()

val classifier = new OneVsRest()
    .setClassifier(lsvc)
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_680b806ceee2
classifier: org.apache.spark.ml.classification.OneVsRest = oneVsRest_ddde733371fe


In [33]:
val pipeline_classifier = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector, normalizer, classifier))

pipeline_classifier: org.apache.spark.ml.Pipeline = pipeline_145c10c57836


In [34]:
import org.apache.spark.ml.tuning.ParamGridBuilder

val paramGrid = new ParamGridBuilder()
    .addGrid(selector.numTopFeatures, Array(20, 2000))
    .addGrid(lsvc.maxIter, Array(10, 50))
    .addGrid(lsvc.regParam, Array(0.001, 0.01, 0.1))
    .addGrid(lsvc.standardization, Array(false, true))
    .build()

import org.apache.spark.ml.tuning.ParamGridBuilder
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_680b806ceee2-maxIter: 10,
	chiSqSelector_154469e462f0-numTopFeatures: 20,
	linearsvc_680b806ceee2-regParam: 0.001,
	linearsvc_680b806ceee2-standardization: false
}, {
	linearsvc_680b806ceee2-maxIter: 50,
	chiSqSelector_154469e462f0-numTopFeatures: 20,
	linearsvc_680b806ceee2-regParam: 0.001,
	linearsvc_680b806ceee2-standardization: false
}, {
	linearsvc_680b806ceee2-maxIter: 10,
	chiSqSelector_154469e462f0-numTopFeatures: 20,
	linearsvc_680b806ceee2-regParam: 0.01,
	linearsvc_680b806ceee2-standardization: false
}, {
	linearsvc_680b806ceee2-maxIter: 50,
	chiSqSelector_154469e462f0-numTopFeatures: 20,
	linearsvc_680b806ceee2-regParam: 0.01,
	linearsvc_680b806ceee2-s...


In [35]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val evaluater = new MulticlassClassificationEvaluator()
    .setLabelCol(indexer.getOutputCol)
    .setMetricName("f1")

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
evaluater: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_18e861264f98, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15


In [36]:
import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline_classifier)
    .setEvaluator(evaluater)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setSeed(seed)
    .setParallelism(20)

import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_5af76543b683


In [None]:
%%time
val model = trainValidationSplit.fit(training)

In [None]:
val predictions = model.transform(test)

Lastly, we sample and therefore reduce the size of the data to improve performance

In [39]:
println(s"F1-Score = ${evaluater.evaluate(predictions)}")

F1-Score = 0.6685228303780189


In [40]:
import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}

val bestModel = model.bestModel.asInstanceOf[PipelineModel]
val bestClassifier = bestModel.stages.last.asInstanceOf[OneVsRestModel]
val bestBinaryClassifierModel = bestClassifier.models.head.asInstanceOf[LinearSVCModel]

println(s"Best binary classifier parameters:\n" +
  s"\tmaxIter: ${bestBinaryClassifierModel.getMaxIter}\n" +
  s"\tregParam: ${bestBinaryClassifierModel.getRegParam}\n" +
  s"\tstandardization: ${bestBinaryClassifierModel.getStandardization}")

Best binary classifier parameters:
	maxIter: 10
	regParam: 0.001
	standardization: true


import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}
bestModel: org.apache.spark.ml.PipelineModel = pipeline_482b12d36bfa
bestClassifier: org.apache.spark.ml.classification.OneVsRestModel = OneVsRestModel: uid=oneVsRest_bb12bdfb41f3, classifier=linearsvc_cad71bb5a8e8, numClasses=22, numFeatures=2000
bestBinaryClassifierModel: org.apache.spark.ml.classification.LinearSVCModel = LinearSVCModel: uid=linearsvc_cad71bb5a8e8, numClasses=2, numFeatures=2000


At this point, we store for each parameter combination of the grid together with the performance metrics into a csv file for further investigation

In [None]:
val paramMaps = model.getEstimatorParamMaps
val validationMetrics = model.validationMetrics
val paramsAndMetrics = paramMaps.zip(validationMetrics)

// evaluateGridSearch(paramMaps, validationMetrics, "../grid_search_evaluation.csv")
evaluateGridSearch(paramMaps, validationMetrics, "Exercise_2/data/grid_search_evaluation.csv")