# Amazon-reviews predictions with Spark ML

In [8]:
import org.apache.spark.sql.SparkSession

val sc = SparkSession.builder.getOrCreate()

import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1724680f


## Read data and transform to RDD

In [9]:
val K = 75
val file_path_stopwords = "./data/stopwords.txt"
val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
// val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json"

val tokenizePattern = "[^a-zA-Z<>^|]+"

K: Int = 75
file_path_stopwords: String = ./data/stopwords.txt
file_path_reviews: String = hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json
tokenizePattern: String = [^a-zA-Z<>^|]+


In [10]:
%%time
val df = sc.read.json(file_path_reviews).select("category", "reviewText")

Time: 8.568068742752075 seconds.



df: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [11]:
import scala.io.Source.fromFile

val stopWords = fromFile(file_path_stopwords).getLines.toArray

import scala.io.Source.fromFile
stopWords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, causes, ...


### Helpers

In [12]:
import java.io.PrintWriter
import org.apache.spark.rdd.RDD

def writeRDDToFile(rdd: RDD[String], mergedDict: String, filePath: String) = {
    val writer = new PrintWriter(filePath)
    rdd.collect().foreach(line => writer.println(line))
    writer.println(mergedDict)
    writer.close()
}

import java.io.PrintWriter
import org.apache.spark.rdd.RDD
writeRDDToFile: (rdd: org.apache.spark.rdd.RDD[String], mergedDict: String, filePath: String)Unit


## Calculate Chi-Square

In [49]:
val rdd = df.rdd

rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[8] at rdd at <console>:33


In [50]:
%%time
val counts = df.rdd.map(row => (row.getString(0), 1)).countByKey()
val N = df.count()

Time: 5.945173025131226 seconds.



counts: scala.collection.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)
N: Long = 78829


In [51]:
%%time
val countsAsMap = counts.toMap

Time: 0.3215475082397461 seconds.



countsAsMap: scala.collection.immutable.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)


In [52]:
%%time
val preprocessedRDD = rdd
    .map(row => (row.getString(0), row.getString(1).toLowerCase().split(tokenizePattern).distinct))
    .map(row => (row._1, row._2.filter(w => w.length > 1 && !stopWords.contains(w))))

val termCategoryCounts = preprocessedRDD
    .flatMapValues(terms => terms)
    .map({ case (category, term) => ((category, term), 1) })
    .reduceByKey(_ + _)
    .map({ case ((category, term), count) => (term, (category, count)) })

val chiSquaredValues = termCategoryCounts
    .groupByKey()
    .flatMapValues({ categoryCounts =>
        val n_t = categoryCounts.map(row => row._2).sum
        categoryCounts.map({ case (category, count) =>
            val A = count
            val B = n_t - A
            val C = countsAsMap(category) - A
            val D = N - A - B - C
            val chiSquared = (N * math.pow((A * D) - (B * C), 2)) / ((A + B) * (A + C) * (B + D) * (C + D))
            (category, chiSquared)
        })
      })
   
val topTermsPerCategory = chiSquaredValues
    .map({ case (term, (category, chiSquared)) => (category, (term, chiSquared)) })
    .groupByKey()
    .mapValues(_.toList.sortBy(-_._2).take(K))
    .sortByKey()

Time: 30.290690183639526 seconds.



preprocessedRDD: org.apache.spark.rdd.RDD[(String, Array[String])] = MapPartitionsRDD[20] at map at <console>:43
termCategoryCounts: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[24] at map at <console>:49
chiSquaredValues: org.apache.spark.rdd.RDD[(String, (String, Double))] = MapPartitionsRDD[26] at flatMapValues at <console>:53
topTermsPerCategory: org.apache.spark.rdd.RDD[(String, List[(String, Double)])] = ShuffledRDD[32] at sortByKey at <console>:69


In [53]:
%%time
val mergedDict = topTermsPerCategory.flatMap(row => row._2.map(term => term._1)).distinct

Time: 1.604891061782837 seconds.



mergedDict: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[36] at distinct at <console>:33


In [55]:
%%time
val result = topTermsPerCategory.map(row => {
    val key = row._1
    val values = row._2.map { case (str, num) => s"$str:$num" }.mkString(" ")
    s"<$key> $values"
})

Time: 1.2280044555664062 seconds.



result: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[38] at map at <console>:33


In [56]:
%%time
writeRDDToFile(result, mergedDict.sortBy(x => x).reduce(_ + " " + _), "./output_rdd.txt")

Time: 2.9966464042663574 seconds.



In [26]:
!hadoop fs -rm -r hdfs://captain01.os.hpc.tuwien.ac.at:9000/user/e12041500/output_rdd.txt

Deleted hdfs://captain01.os.hpc.tuwien.ac.at:9000/user/e12041500/output_rdd.txt




In [27]:
%%time
result.coalesce(1).saveAsTextFile("output_rdd.txt")

Time: 1.3839399814605713 seconds.



### Second approach

In [61]:
%%time
val filteredCategoryTerm = rdd
    .map(row => (row.getString(0), row.getString(1).toLowerCase().split(tokenizePattern).distinct))
    .map(row => (row._1, row._2.filter(w => w.length > 1 && !stopWords.contains(w))))
    .flatMap(row => row._2.map(term => ((row._1, term), 1)))

val countTerms = filteredCategoryTerm
    .map(row => (row._1._2, 1))
    .reduceByKey(_ + _)

val countCategoryTerm = filteredCategoryTerm
    .reduceByKey(_ + _)

val joinedCategoryTerm = countCategoryTerm
    .map(row => (row._1._2, (row._1._1, row._2)))
    .join(countTerms)

val chiSquaredTermCategory = joinedCategoryTerm
    .map(row => {    
        val A = row._2._1._2
        val B = row._2._2 - A
        val C = countsAsMap(row._2._1._1) - A
        val D = N - A - B - C
    
        val chiSquared = (N * math.pow((A * D) - (B * C), 2)) / ((A + B) * (A + C) * (B + D) * (C + D))
        (row._2._1._1, (row._1, chiSquared))
    })
    .groupByKey()
    .map(row => (row._1, row._2.toList.sortBy(x => -x._2).take(K)))
    .sortByKey()

Time: 39.51532602310181 seconds.



filteredCategoryTerm: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[83] at flatMap at <console>:45
countTerms: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[85] at reduceByKey at <console>:49
countCategoryTerm: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[86] at reduceByKey at <console>:52
joinedCategoryTerm: org.apache.spark.rdd.RDD[(String, ((String, Int), Int))] = MapPartitionsRDD[90] at join at <console>:56
chiSquaredTermCategory: org.apache.spark.rdd.RDD[(String, List[(String, Double)])] = ShuffledRDD[96] at sortByKey at <console>:70


In [62]:
%%time
val mergedDict = chiSquaredTermCategory.flatMap(row => row._2.map(term => term._1)).distinct

Time: 0.9137146472930908 seconds.



mergedDict: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[100] at distinct at <console>:33


In [63]:
%%time
val result = chiSquaredTermCategory.map(row => {
    val key = row._1
    val values = row._2.map { case (str, num) => s"$str:$num" }.mkString(" ")
    s"<$key> $values"
})

Time: 0.6810376644134521 seconds.



result: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[101] at map at <console>:33


In [15]:
%%time
writeRDDToFile(result, mergedDict.sortBy(x => x).reduce(_ + " " + _), "./output2_rdd.txt")

Time: 1.0665204524993896 seconds.



## Datasets/DataFrames: Spark ML and Pipelines

In [13]:
val K = 2000

K: Int = 2000


Should the dataset first be grouped into category and all reviewTexts concatenated, to really get the top 2000 terms, or should we aggregate it afterwards?

In [105]:
val combinedDF = df
    .groupBy("category")
    .agg(concat_ws(" ", collect_list("reviewText")).alias("reviewText"))

combinedDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [None]:
combinedDF.show(1)

First, create all the necessary transformers!

In [14]:
import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}

val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("category_index")

val tokenizer = new RegexTokenizer()
    .setInputCol("reviewText")
    .setOutputCol("words")
    .setGaps(false)
    .setPattern(tokenizePattern)

val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("filtered")
    .setStopWords(stopWords)

val cvModel = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("features")
    .setMinDF(1)

val hashingTF = new HashingTF()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("features")

// Decide which frequencyCounter you want hashingTF vs cvModel?
val idf = new IDF()
    .setInputCol(hashingTF.getOutputCol)
    .setOutputCol("weightedfeatures")

import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_675215802098
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_b7a4e035ad6d, minTokenLength=1, gaps=false, pattern=[^a-zA-Z<>^|]+, toLowercase=true
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_dc7274aa73af, numStopWords=596, locale=en_US, caseSensitive=false
cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_c75a7dbc9541
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_67d39d7f75e2, binary=false, numFeatur...


Afterward, we create the Chi^2-Selector

In [15]:
import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(K)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("category_index")
  .setOutputCol("selectedFeatures")

import org.apache.spark.ml.feature.ChiSqSelector
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_14fec18aabfa


Lastly, we create the pipeline to execute all the transformers and select the top K features

In [16]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, hashingTF, idf, indexer, selector))
// todo: should we save the pipeline?

After creation of the pipeline, we can now fit it to our data, we want to transform

In [90]:
val model = pipeline.fit(combinedDF)
// todo: should we save the model?

model: org.apache.spark.ml.PipelineModel = pipeline_ce35100f7e27


Last but not least, we need to transform our data and display the format, which could be used going forward to the text classification task!

In [91]:
val rescaledData = model.transform(combinedDF).select("category", "selectedFeatures")

rescaledData: org.apache.spark.sql.DataFrame = [category: string, selectedFeatures: vector]


In [92]:
rescaledData.show()

+--------------------+--------------------+
|            category|    selectedFeatures|
+--------------------+--------------------+
|Health_and_Person...|(2000,[4,6,10,12,...|
|        Kindle_Store|(2000,[0,1,4,6,11...|
|                Baby|(2000,[6,12,20,21...|
|       Movies_and_TV|(2000,[1,2,4,6,7,...|
|Clothing_Shoes_an...|(2000,[1,6,7,8,13...|
|                Book|(2000,[0,1,2,3,4,...|
|          Automotive|(2000,[6,7,16,19,...|
|  Sports_and_Outdoor|(2000,[3,4,6,7,8,...|
|    Apps_for_Android|(2000,[6,12,18,20...|
|Tools_and_Home_Im...|(2000,[4,6,7,16,2...|
|Cell_Phones_and_A...|(2000,[6,9,12,13,...|
|       CDs_and_Vinyl|(2000,[1,2,4,5,6,...|
|          Electronic|(2000,[0,1,5,6,7,...|
|         Pet_Supplie|(2000,[6,20,34,38...|
|Grocery_and_Gourm...|(2000,[4,6,12,20,...|
|     Home_and_Kitche|(2000,[6,7,11,12,...|
|       Toys_and_Game|(2000,[6,7,8,12,1...|
|       Digital_Music|(2000,[1,12,18,20...|
|  Musical_Instrument|(2000,[6,7,18,19,...|
|Patio_Lawn_and_Garde|(2000,[6,1

## Text Classification

In [22]:
val seed = 12041500

seed: Int = 12041500


The next line of code is optional and can be used to down sample the dataframe to make model training faster, in case of high load on the cluster

In [36]:
val sampledDF = df.sample(withReplacement = false, fraction = 0.1, seed = 12041500)

sampledDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


First, we split our data into training- and test-set.

In [38]:
val Array(training, test) = sampledDF.randomSplit(Array(0.8, 0.2), seed = seed)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


Additionally, we create a Normalizer for our selected features.

In [39]:
import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normFeatures")
  .setP(2.0)

import org.apache.spark.ml.feature.Normalizer
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_0eef80a7078a, p=2.0


Then, we create the classifier. In our case we use a Linear Vector Machine. However, because we deal with a multiclass problem, we wrap it in a OneVsRest-classifier to bypass the limitation of Linear Vector Machines, which can only work with binary problems.


In [40]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}

val lsvc = new LinearSVC()
    // .setStandardization(false) // we use a normalizer ? Normalizer vs Standardization ?
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

val classifier = new OneVsRest()
    .setClassifier(lsvc)
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_b3d835bac5ea
classifier: org.apache.spark.ml.classification.OneVsRest = oneVsRest_96ad20cd751c


After, setting all things up, we create the pipeline, which uses all the prior created transformers and fits the transformed data to our classifier

In [41]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, hashingTF, idf, indexer, selector, normalizer, classifier))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline: org.apache.spark.ml.Pipeline = pipeline_eebf351ef7e0


Now, we only need to create our MulticlassClassificationEvaluator and ParamGridBuilder, which we use for hyperparameter tuning and evaluation of our model. We evaluate our model based on the F1-Score and our hyperparameter tuning happens based on the following params:
- Compare chi square overall top 2000 filtered features with another, heavier filtering with much less dimensionality
- Compare different SVM settings by 
    - varying the regularization parameter (choose 3 different values), 
    - standardization of training features (2 values),
    - and maximum number of iterations (2 values).



In [43]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val evaluater = new MulticlassClassificationEvaluator()
    .setLabelCol(indexer.getOutputCol)
    .setMetricName("weightedFMeasure")

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
evaluater: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_5f9738fa46b4, metricName=weightedFMeasure, metricLabel=0.0, beta=1.0, eps=1.0E-15


In [None]:
import org.apache.spark.ml.tuning.ParamGridBuilder

val paramGrid = new ParamGridBuilder()
    .addGrid(lsvc.maxIter, Array(10, 100))
    .addGrid(lsvc.regParam, Array(0.01, 0.1, 0.5))
    .addGrid(lsvc.standardization, Array(false, true))
    .addGrid(selector.numTopFeatures, Array(200, 2000))
    // .addGrid(normalizer.p, Array(1.0, 2.0)) ? instead of standardization
    .build()

Now, we simply perform the grid-search on a train-validation split and evaluate the best hyperparams on our previously created evaluater.

In [45]:
import org.apache.spark.ml.tuning.TrainValidationSplit

val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(evaluater)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setSeed(seed)
    .setParallelism(2)

import org.apache.spark.ml.tuning.TrainValidationSplit
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_47a148e1a6c8


Lastly, we fit the model, with the best hyperparameters to the data and perform predictions with it. Afterward, we evaluate the model based on the F1-Score.

In [None]:
val model = trainValidationSplit.fit(training)

In [None]:
val predictions = model.transform(test)

In [None]:
val f1 = evaluater.evaluate(predictions)
println(s"F1-Score = ${f1}")

In [None]:
// todo: test, which one performs the best
// model.params
// model.explainParams()
// model.extractParamMap()