# Amazon-reviews predictions with Spark ML

In [1]:
import org.apache.spark.sql.SparkSession

// Obtain the active SparkSession, creating one if none exists.
// NOTE(review): the kernel banner binds `sc` to the SparkContext and `spark` to the
// SparkSession; this val shadows `sc` with a SparkSession for the rest of the notebook.
val sc = SparkSession.builder.getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_1103
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_1103)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@9d0f0fa


## Read data and transform to RDD

In [2]:
// Number of top-scoring terms kept per category.
val K = 75
// Stop-word list, read from the local file system.
val file_path_stopwords = "./data/stopwords.txt"
// Review data set on HDFS (development subset; the full data set is the commented-out path).
val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
// val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json"

// Token delimiter: any run of characters other than ASCII letters, '<', '>', '^' and '|'.
val tokenizePattern = "[^a-zA-Z<>^|]+"

K: Int = 75
file_path_stopwords: String = ./data/stopwords.txt
file_path_reviews: String = hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json
tokenizePattern: String = [^a-zA-Z<>^|]+


In [3]:
%%time
// Load the reviews as JSON and keep only the two columns used below.
val df = sc.read.json(file_path_reviews).select("category", "reviewText")

Time: 9.168583631515503 seconds.



df: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [4]:
import scala.io.Source.fromFile

// Read the stop-word file into an array with one element per line. The source is closed
// explicitly; `fromFile` otherwise leaves the underlying file handle open.
val stopWords = {
    val source = fromFile(file_path_stopwords)
    try source.getLines.toArray finally source.close()
}

import scala.io.Source.fromFile
stopWords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, causes, ...


### Helpers

In [313]:
import java.io.PrintWriter
import org.apache.spark.rdd.RDD
import scala.collection.immutable.TreeSet

/**
 * Writes the top terms of every category to a text file.
 *
 * One line is written per category, in ascending category order, formatted as
 * `<category> term1:score1 term2:score2 ...`. The last line, written without a trailing
 * newline, holds every term that occurs in any category, de-duplicated, sorted and separated
 * by single spaces.
 *
 * The RDD is sorted by key and collected to the driver.
 *
 * @param rdd      pairs of (category, list of (term, chi-squared score))
 * @param filePath path of the file to create or overwrite
 */
def writeRDDToFile(rdd: RDD[(String, List[(String, Double)])], filePath: String) = {
    // Collect before opening the file so a failing Spark job cannot leave a truncated file behind.
    val collectedData = rdd.sortByKey().collect()
    val mergedTerms = TreeSet(collectedData.flatMap({ case (_, topk) => topk.map(_._1) }): _*)

    val writer = new PrintWriter(filePath)
    try {
        for ((category, topk) <- collectedData) {
            val formattedTerms = topk
                .map({ case (term, chiSquared) => s"$term:$chiSquared" })
                .mkString(" ")
            // Write the formatted string, not the raw list's toString.
            writer.println(s"<$category> $formattedTerms")
        }
        writer.print(mergedTerms.mkString(" "))
    } finally {
        writer.close()
    }
}

import java.io.PrintWriter
import org.apache.spark.rdd.RDD
import scala.collection.immutable.TreeSet
writeRDDToFile: (rdd: org.apache.spark.rdd.RDD[(String, List[(String, Double)])], filePath: String)Unit


In [314]:
import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter

/**
 * Writes the top terms of every category, taken from a DataFrame, to a text file.
 *
 * Column 0 of each row is read as the category string and column `topk` as an array of
 * two-element `[token, score]` string arrays. One line is written per row, in the row order
 * of `df`, formatted as `<category> token1:score1 token2:score2 ...`. The last line, written
 * without a trailing newline, holds every token of every row, de-duplicated, sorted and
 * separated by single spaces.
 *
 * The DataFrame is collected to the driver.
 *
 * @param df       rows carrying a category in column 0 and a `topk` column
 * @param filePath path of the file to create or overwrite
 */
def writeDFToFile(df: Dataset[Row], filePath: String) = {
    // Collect and decode before opening the file so a failing job or a malformed row cannot
    // leave a truncated file behind.
    val rows = df.collect().map({ row =>
        val pairs = row.getAs[Seq[Seq[String]]]("topk").map({
            case Seq(token, chiSquared) => (token, chiSquared)
        })
        (row.getString(0), pairs)
    })
    val mergedTerms = TreeSet(rows.flatMap({ case (_, pairs) => pairs.map(_._1) }): _*)

    val writer = new PrintWriter(filePath)
    try {
        for ((category, pairs) <- rows) {
            val topk = pairs
                .map({ case (token, chiSquared) => s"$token:$chiSquared" })
                .mkString(" ")
            writer.println(s"<$category> $topk")
        }
        writer.print(mergedTerms.mkString(" "))
    } finally {
        writer.close()
    }
}

import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter
writeDFToFile: (df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], filePath: String)Unit


## Calculate Chi-Square

In [287]:
// RDD[Row] view of the DataFrame; column 0 is the category, column 1 the review text.
val rdd = df.rdd

rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[1045] at rdd at <console>:387


In [288]:
%%time
// Number of reviews per category, gathered on the driver.
val counts = df.rdd.map(row => (row.getString(0), 1)).countByKey()
// Total number of reviews. Every row contributes to exactly one key above, so summing the
// per-category counts gives the same value as df.count() without a second job over the data.
val N = counts.values.sum

Time: 3.3981635570526123 seconds.



counts: scala.collection.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)
N: Long = 78829


In [289]:
%%time
// Immutable copy of the per-category review counts; looked up inside the chi-squared jobs below.
val countsAsMap = counts.toMap

Time: 3.675842046737671 seconds.



countsAsMap: scala.collection.immutable.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)


In [291]:
%%time
// Hash-based lookup instead of a linear scan over the stop-word array for every token.
val stopWordSet = stopWords.toSet

// (category, terms) per review: lower-case the text, split on the tokenizer pattern,
// de-duplicate within the review, then drop one-character tokens and stop words.
val preprocessedRDD = rdd
    .map(row => (row.getString(0), row.getString(1).toLowerCase().split(tokenizePattern).distinct))
    .map(row => (row._1, row._2.filter(w => w.length > 1 && !stopWordSet.contains(w))))

// term -> (category, number of reviews of that category containing the term).
// Terms are distinct per review, so each review counts at most once per term.
val termCategoryCounts = preprocessedRDD
    .flatMapValues(terms => terms)
    .map({ case (category, term) => ((category, term), 1) })
    .reduceByKey(_ + _)
    .map({ case ((category, term), count) => (term, (category, count)) })

// term -> (category, chi-squared score) for every (term, category) pair that occurs.
val chiSquaredValues = termCategoryCounts
    .groupByKey()
    .flatMapValues({ categoryCounts =>
        // Number of reviews, over all categories, that contain the term.
        val n_t = categoryCounts.map(row => row._2).sum
        categoryCounts.map({ case (category, count) =>
            val A = count.toLong                  // in category, contains the term
            val B = n_t - A                       // not in category, contains the term
            val C = countsAsMap(category) - A     // in category, lacks the term
            val D = N - A - B - C                 // not in category, lacks the term
            // The denominator is a product of four counts and exceeds Long.MaxValue on large
            // corpora, so it is accumulated as Double. A*D - B*C stays within Long range and is
            // computed exactly before conversion.
            val difference = ((A * D) - (B * C)).toDouble
            val denominator = (A + B).toDouble * (A + C) * (B + D) * (C + D)
            val chiSquared = (N * math.pow(difference, 2)) / denominator
            (category, chiSquared)
        })
      })

// category -> its K highest-scoring (term, chi-squared) pairs, best first.
val topTermsPerCategory = chiSquaredValues
    .map({ case (term, (category, chiSquared)) => (category, (term, chiSquared)) })
    .groupByKey()
    .mapValues(_.toList.sortBy(-_._2).take(K))

Time: 13.75816035270691 seconds.



preprocessedRDD: org.apache.spark.rdd.RDD[(String, Array[String])] = MapPartitionsRDD[1115] at map at <console>:424
termCategoryCounts: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[1119] at map at <console>:430
chiSquaredValues: org.apache.spark.rdd.RDD[(String, (String, Double))] = MapPartitionsRDD[1121] at flatMapValues at <console>:434
topTermsPerCategory: org.apache.spark.rdd.RDD[(String, List[(String, Double)])] = MapPartitionsRDD[1124] at mapValues at <console>:449


In [292]:
%%time
// Triggers the lazy RDD pipeline above and writes the result to a local file.
writeRDDToFile(topTermsPerCategory, "./output_rdd.txt")

Time: 31.581246852874756 seconds.



### Second approach

In [61]:
%%time
// Hash-based lookup instead of a linear scan over the stop-word array for every token.
val stopWordLookup = stopWords.toSet

// ((category, term), 1) for every distinct, filtered term of every review.
val filteredCategoryTerm = rdd
    .map(row => (row.getString(0), row.getString(1).toLowerCase().split(tokenizePattern).distinct))
    .map(row => (row._1, row._2.filter(w => w.length > 1 && !stopWordLookup.contains(w))))
    .flatMap(row => row._2.map(term => ((row._1, term), 1)))

// term -> number of reviews, over all categories, containing the term.
val countTerms = filteredCategoryTerm
    .map(row => (row._1._2, 1))
    .reduceByKey(_ + _)

// (category, term) -> number of reviews of that category containing the term.
val countCategoryTerm = filteredCategoryTerm
    .reduceByKey(_ + _)

// term -> ((category, count within category), count over all categories).
val joinedCategoryTerm = countCategoryTerm
    .map(row => (row._1._2, (row._1._1, row._2)))
    .join(countTerms)

// category -> its K highest-scoring (term, chi-squared) pairs, sorted by category.
val chiSquaredTermCategory = joinedCategoryTerm
    .map({ case (term, ((category, countInCategory), countOverall)) =>
        val A = countInCategory.toLong            // in category, contains the term
        val B = countOverall - A                  // not in category, contains the term
        val C = countsAsMap(category) - A         // in category, lacks the term
        val D = N - A - B - C                     // not in category, lacks the term

        // The denominator is a product of four counts and exceeds Long.MaxValue on large
        // corpora, so it is accumulated as Double. A*D - B*C stays within Long range and is
        // computed exactly before conversion.
        val difference = ((A * D) - (B * C)).toDouble
        val denominator = (A + B).toDouble * (A + C) * (B + D) * (C + D)
        val chiSquared = (N * math.pow(difference, 2)) / denominator
        (category, (term, chiSquared))
    })
    .groupByKey()
    .map(row => (row._1, row._2.toList.sortBy(x => -x._2).take(K)))
    .sortByKey()

Time: 39.51532602310181 seconds.



filteredCategoryTerm: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[83] at flatMap at <console>:45
countTerms: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[85] at reduceByKey at <console>:49
countCategoryTerm: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[86] at reduceByKey at <console>:52
joinedCategoryTerm: org.apache.spark.rdd.RDD[(String, ((String, Int), Int))] = MapPartitionsRDD[90] at join at <console>:56
chiSquaredTermCategory: org.apache.spark.rdd.RDD[(String, List[(String, Double)])] = ShuffledRDD[96] at sortByKey at <console>:70


In [62]:
%%time
// Every term that made it into some category's top list, without duplicates.
val mergedDict = chiSquaredTermCategory
    .flatMap({ case (_, topTerms) => topTerms.map(_._1) })
    .distinct

Time: 0.9137146472930908 seconds.



mergedDict: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[100] at distinct at <console>:33


In [63]:
%%time
// One output line per category: "<category> term1:score1 term2:score2 ...".
val result = chiSquaredTermCategory.map({ case (category, topTerms) =>
    val formatted = topTerms.map(entry => s"${entry._1}:${entry._2}").mkString(" ")
    s"<$category> $formatted"
})

Time: 0.6810376644134521 seconds.



result: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[101] at map at <console>:33


In [15]:
%%time
// `writeRDDToFile` takes the (category, top terms) RDD and a target path. It writes one line
// per category followed by the sorted, de-duplicated term dictionary, so neither `result`
// nor `mergedDict` needs to be passed in.
writeRDDToFile(chiSquaredTermCategory, "./output2_rdd.txt")

Time: 1.0665204524993896 seconds.



## Datasets/DataFrames: Spark ML and Pipelines

First, create all the necessary transformers!

In [5]:
import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}

// Encodes the category string as a numeric label column.
val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("category_index")

// Lower-cases the review text and splits it on the shared delimiter pattern, dropping
// tokens shorter than two characters.
val tokenizer = new RegexTokenizer()
    .setInputCol("reviewText")
    .setOutputCol("raw_terms")
    .setMinTokenLength(2)
    .setPattern(tokenizePattern)
    .setToLowercase(true)

// Removes the project's stop words from the token list.
val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("terms")
    .setStopWords(stopWords)

// Term-frequency vectors over a learned vocabulary.
val countVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("raw_features")
    .setMinDF(1)

// Alternative term-frequency stage. It reads the stop-word-filtered terms, like
// countVectorizer, so the two stages are interchangeable; both write to "raw_features",
// so only one of them may be placed in a pipeline.
val hashingTF = new HashingTF()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("raw_features")

// Decide which frequencyCounter you want hashingTF vs cvModel?
// Rescales the term frequencies by inverse document frequency.
val idf = new IDF()
    .setInputCol(countVectorizer.getOutputCol)
    .setOutputCol("features")

import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_1e5b74a2e8f4
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_4987373b4125, minTokenLength=2, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_8a1a16d915c0, numStopWords=596, locale=en_US, caseSensitive=false
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_892c9f0924f7
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_73299ee31950, binary=false, nu...


Afterward, we create the Chi^2-Selector

In [6]:
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}

// Chi-squared feature selection: keeps the 2000 top features of the IDF-weighted vectors,
// scored against the indexed category label.
val selector = new ChiSqSelector()
  .setNumTopFeatures(2000)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("category_index")
  .setOutputCol("selectedFeatures")

import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_dde97a3c2365


Lastly, we create the pipeline to execute all the transformers and select the top K features

In [7]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

// Feature pipeline. The stage order matters: later cells look up fitted stages by position
// (index 2 is the CountVectorizer, the last stage is the selector).
val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline: org.apache.spark.ml.Pipeline = pipeline_5c607bf29a32


After creating the pipeline, we fit it to the data we want to transform.

In [8]:
%%time
// Fit every stage of the feature pipeline on the full review DataFrame.
val model = pipeline.fit(df)

Time: 50.41893196105957 seconds.



model: org.apache.spark.ml.PipelineModel = pipeline_5c607bf29a32


In [9]:
// Vocabulary learned by the CountVectorizer (stage index 2 of the pipeline).
val vocabulary = model.stages(2).asInstanceOf[CountVectorizerModel].vocabulary
// Feature indices kept by the chi-squared selector (last stage of the pipeline).
val selectedFeatures = model.stages.last.asInstanceOf[ChiSqSelectorModel].selectedFeatures

vocabulary: Array[String] = Array(great, good, love, time, work, recommend, back, easy, make, bought, made, find, buy, price, put, reading, quality, people, works, quot, years, nice, characters, long, series, lot, found, author, day, bit, feel, makes, thing, perfect, fit, end, set, loved, things, thought, music, small, hard, give, year, world, size, worth, pretty, times, sound, written, light, real, big, amazon, part, bad, highly, money, excellent, purchased, happy, high, enjoyed, problem, family, interesting, wanted, character, job, review, purchase, man, watch, days, enjoy, place, home, stars, short, writing, play, cover, top, fan, full, fine, color, side, order, wonderful, amazing, point, fact, reviews, ordered, stories, favorite, easily, needed, battery, screen, water, dvd, beautifu...


Last but not least, we transform our data and show the resulting format, which can be used later for the text classification task.

In [10]:
// Apply the fitted pipeline and keep the category next to its selected-feature vector.
val rescaledData = model.transform(df).select("category", "selectedFeatures")

rescaledData: org.apache.spark.sql.DataFrame = [category: string, selectedFeatures: vector]


In [11]:
%%time
import org.apache.spark.ml.linalg.{SparseVector}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types._
import org.apache.spark.sql.{functions => F}

// Define UDFs
// Turns a sparse vector into a map from vector index to stored value.
val sparseVectorToMap = udf((v: SparseVector) => v.indices.zip(v.values).toMap)
// Maps a position in the selected-feature vector back to its vocabulary term.
val indexToToken = udf((i: Int) => vocabulary(selectedFeatures(i)))

// Ranks the terms of one category: highest score first, ties broken alphabetically.
val rankWithinCategory = Window
    .partitionBy("category")
    .orderBy(desc("chi_squared"), asc("term"))

// Per category: the K terms with the highest mean feature value, as [term, score] string pairs.
val topK = rescaledData
    .select($"category", explode(sparseVectorToMap($"selectedFeatures")))
    .select($"category", $"key".as("term"), $"value".as("chi_squared"))
    .groupBy("category", "term")
    .agg(mean("chi_squared").as("chi_squared"))
    .withColumn("term", indexToToken(col("term")))
    .withColumn("rank", F.row_number().over(rankWithinCategory))
    .filter(col("rank") <= K)
    .groupBy("category")
    // collect_list gives no ordering guarantee after a shuffle, so the rank travels with each
    // entry and the collected array is sorted on it explicitly.
    .agg(F.sort_array(collect_list(F.struct(col("rank"), col("term"), col("chi_squared")))).as("ranked"))
    .select(
        col("category"),
        F.transform(col("ranked"), entry =>
            array(entry.getField("term"), entry.getField("chi_squared").cast("string"))).as("topk"))
    .sort("category")

Time: 1.2329561710357666 seconds.



import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types._
import org.apache.spark.sql.{functions=>F}
sparseVectorToMap: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4594/1143777704@2235422f,MapType(IntegerType,DoubleType,false),List(Some(class[value[0]: vector])),Some(class[value[0]: map<int,double>]),None,true,true)
indexToToken: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4599/148926824@5e747dce,StringType,List(Some(class[value[0]: int])),Some(class[value[0]: string]),None,true,true)
topK: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, topk: array<array<string>>]


In [None]:
%%time
// Executes the DataFrame query above and writes the per-category top terms to a local file.
writeDFToFile(topK, "./output_ds.txt")

## Text Classification

In [13]:
// Seed shared by sampling, the train/test split and the train-validation split.
val seed = 12041500
// Fraction of the reviews kept when down-sampling.
val percentage_of_dataset = 0.1

seed: Int = 12041500
percentage_of_dataset: Double = 0.1


The next line of code is optional and can be used to downsample the DataFrame to make model training faster in case of high load on the cluster.

In [14]:
// Seeded sample without replacement; `fraction` is the expected share of rows kept.
val sampledDF = df.sample(withReplacement = false, fraction = percentage_of_dataset, seed = seed)

sampledDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


First, we split our data into a training set and a test set.

In [15]:
// Seeded 80/20 split of the sampled reviews into training and test data.
val Array(training, test) = sampledDF.randomSplit(Array(0.8, 0.2), seed = seed)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


Additionally, we create a Normalizer for our selected features.

In [16]:
import org.apache.spark.ml.feature.Normalizer

// Scales every selected-feature vector to unit L2 norm (p = 2).
val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normFeatures")
  .setP(2.0)

import org.apache.spark.ml.feature.Normalizer
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_45b3a1be1511, p=2.0


Then, we create the classifier. In our case we use a linear Support Vector Machine (`LinearSVC`). Because we are dealing with a multiclass problem and a linear SVM can only handle binary problems, we wrap it in a `OneVsRest` classifier.


In [34]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}

// Binary linear SVM trained on the normalized features and the indexed category label.
val lsvc = new LinearSVC()
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

// One-vs-rest wrapper that turns the binary SVM into a multiclass classifier.
val classifier = new OneVsRest()
    .setClassifier(lsvc)
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_4c0a5d2d727b
classifier: org.apache.spark.ml.classification.OneVsRest = oneVsRest_b02e036fc060


After setting everything up, we create the pipeline, which chains all previously created transformers and fits our classifier on the transformed data.

In [35]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

// End-to-end pipeline: feature extraction, label indexing, selection, normalization and the
// classifier. Redefines the `pipeline` val from the feature-selection section.
val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector, normalizer, classifier))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline: org.apache.spark.ml.Pipeline = pipeline_ff8d8a911179


Now we only need to create the `MulticlassClassificationEvaluator` and the parameter grid (`ParamGridBuilder`), which we use for hyperparameter tuning and for evaluating our model. The model is evaluated by its weighted F1 score, and the grid search covers the following parameters:
- Compare the overall top 2000 chi-squared features with a much heavier filter of far lower dimensionality (top 20 features)
- Compare different SVM settings by 
    - varying the regularization parameter (choose 3 different values), 
    - standardization of training features (2 values),
    - and maximum number of iterations (2 values).



In [36]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

// Scores predictions against the indexed category label using the weighted F-measure.
val evaluater = new MulticlassClassificationEvaluator()
    .setLabelCol(indexer.getOutputCol)
    .setMetricName("weightedFMeasure")

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
evaluater: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_179d9d6a964f, metricName=weightedFMeasure, metricLabel=0.0, beta=1.0, eps=1.0E-15


In [37]:
import org.apache.spark.ml.tuning.ParamGridBuilder

// Hyperparameter grid: 2 x 3 x 2 x 2 = 24 combinations of SVM settings and selector size.
val paramGrid = new ParamGridBuilder()
    .addGrid(lsvc.maxIter, Array(10, 100))
    .addGrid(lsvc.regParam, Array(0.01, 0.1, 0.5))
    .addGrid(lsvc.standardization, Array(false, true))
    .addGrid(selector.numTopFeatures, Array(20, 2000))
    .build()

import org.apache.spark.ml.tuning.ParamGridBuilder
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_4c0a5d2d727b-maxIter: 10,
	linearsvc_4c0a5d2d727b-standardization: false
}, {
	linearsvc_4c0a5d2d727b-maxIter: 100,
	linearsvc_4c0a5d2d727b-standardization: false
}, {
	linearsvc_4c0a5d2d727b-maxIter: 10,
	linearsvc_4c0a5d2d727b-standardization: true
}, {
	linearsvc_4c0a5d2d727b-maxIter: 100,
	linearsvc_4c0a5d2d727b-standardization: true
})


Now we set up the grid search on a train-validation split; the candidate hyperparameters are scored with the previously created evaluator (`evaluater`).

In [38]:
import org.apache.spark.ml.tuning.TrainValidationSplit

// Grid search over `paramGrid` for the full pipeline: 80% of the input is used for fitting,
// the remainder for scoring with `evaluater`; up to 5 candidates are evaluated in parallel.
val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(evaluater)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setSeed(seed)
    .setParallelism(5)

import org.apache.spark.ml.tuning.TrainValidationSplit
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_786ab58c8808


Lastly, we run the grid search on the training data, use the resulting best model to predict the test data, and evaluate those predictions with the F1 score.

In [39]:
%%time
// Run the grid search on the training split. Redefines the `model` val of the
// feature-selection section.
val model = trainValidationSplit.fit(training)

Time: 439.81291604042053 seconds.



model: org.apache.spark.ml.tuning.TrainValidationSplitModel = TrainValidationSplitModel: uid=tvs_786ab58c8808, bestModel=pipeline_ff8d8a911179, trainRatio=0.8


In [66]:
// Predict the held-out test split with the model returned by the grid search.
val predictions = model.transform(test)

predictions: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 9 more fields]


In [68]:
// Weighted F-measure of the test-split predictions.
println(s"F1-Score = ${evaluater.evaluate(predictions)}")

F1-Score = 0.46804112569563594


In [65]:
import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}

// Unwrap the winning pipeline of the grid search and its final stage, the one-vs-rest model.
val bestModel = model.bestModel.asInstanceOf[PipelineModel]
val bestClassifier = bestModel.stages.last.asInstanceOf[OneVsRestModel]
// Only the first of the per-class binary models is inspected.
val bestBinaryClassifierModel = bestClassifier.models.head.asInstanceOf[LinearSVCModel]

// Print the SVM parameters of that binary model (the selector size is not reported here).
println(s"Best binary classifier parameters:\n" +
  s"\tmaxIter: ${bestBinaryClassifierModel.getMaxIter}\n" +
  s"\tregParam: ${bestBinaryClassifierModel.getRegParam}\n" +
  s"\tstandardization: ${bestBinaryClassifierModel.getStandardization}")

Best binary classifier parameters:
	maxIter: 10
	regParam: 0.0
	standardization: false


import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}
bestModel: org.apache.spark.ml.PipelineModel = pipeline_ff8d8a911179
bestClassifier: org.apache.spark.ml.classification.OneVsRestModel = OneVsRestModel: uid=oneVsRest_b02e036fc060, classifier=linearsvc_4c0a5d2d727b, numClasses=22, numFeatures=2000
bestBinaryClassifierModel: org.apache.spark.ml.classification.LinearSVCModel = LinearSVCModel: uid=linearsvc_4c0a5d2d727b, numClasses=2, numFeatures=2000


### Another approach
To be revisited later, when the server is not at full capacity.

In [None]:
// Same seeded 80/20 split as in the first approach.
val Array(training, test) = sampledDF.randomSplit(Array(0.8, 0.2), seed = seed)

In [90]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

// Feature extraction and label indexing, split off so it can be fitted once.
val pipeline_data = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer))
// Stages affected by the hyperparameter grid: selection, normalization and the classifier.
val pipeline_classifier = new Pipeline()
    .setStages(Array(selector, normalizer, classifier))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline_data: org.apache.spark.ml.Pipeline = pipeline_3ff51004a9e5
pipeline_classifier: org.apache.spark.ml.Pipeline = pipeline_d83ad444a85b


In [91]:
// Fit the feature-extraction stages on the training split only.
val model_data = pipeline_data.fit(training)

model_data: org.apache.spark.ml.PipelineModel = pipeline_3ff51004a9e5


In [110]:
// Training split with the feature and label columns added by the fitted stages.
val transformed_training = model_data.transform(training)

transformed_training: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 5 more fields]


In [101]:
// Mark the transformed training data for caching. NOTE(review): a later cell rebinds
// `training_cache` to a Parquet-backed DataFrame; which binding is live depends on the
// order in which the cells are executed.
val training_cache = transformed_training.cache() //vs persist

training_cache: transformed_training.type = [category: string, reviewText: string ... 5 more fields]


In [111]:
// Store the transformed training set as Parquet and read it back, presumably so the grid
// search starts from materialised features. "overwrite" keeps the cell re-runnable: with the
// default save mode the write fails once the path exists.
transformed_training.write.mode("overwrite").parquet("transformed_training.parquet")
val training_cache = spark.read.parquet("transformed_training.parquet")

training_cache: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 5 more fields]


In [113]:
import org.apache.spark.ml.tuning.TrainValidationSplit

// Grid search restricted to the selector/normalizer/classifier pipeline; the input is
// expected to carry the feature and label columns already.
val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline_classifier)
    .setEvaluator(evaluater)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setSeed(seed)
    .setParallelism(5)

import org.apache.spark.ml.tuning.TrainValidationSplit
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_132389348f6e


In [None]:
%%time
// Run the grid search on the pre-transformed training data.
val model_classifier = trainValidationSplit.fit(training_cache)

In [115]:
// Apply the feature stages fitted on the training split to the test split.
val df_test = model_data.transform(test)

df_test: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 5 more fields]


In [None]:
// Predict the transformed test split with the model returned by the grid search.
val predictions = model_classifier.transform(df_test)

In [None]:
// Weighted F-measure of the test-split predictions.
println(s"F1-Score = ${evaluater.evaluate(predictions)}")