# Amazon-reviews predictions with Spark ML

In [45]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf

val conf = new SparkConf()
      .setMaster("yarn")
      .set("spark.executor.memory", "4g")
      .set("spark.driver.memory", "4g")
      .set("spark.driver.maxResultSize", "2g")
      .set("spark.executor.instances", "5")
      .set("spark.executor.cores", "4")
      .set("spark.default.parallelism", "20")

// Initialize SparkSession
val sc = SparkSession.builder.config(conf).getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@3dc15ebc
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@53422aa9


## Read data and transform to RDD

In [46]:
val K = 75
val file_path_stopwords = "../data/stopwords.txt"
val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
// val file_path_reviews = "hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json"

val tokenizePattern = "[^a-zA-Z<>^|]+"

K: Int = 75
file_path_stopwords: String = ../data/stopwords.txt
file_path_reviews: String = hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json
tokenizePattern: String = [^a-zA-Z<>^|]+


In [47]:
%%time
val df = sc.read.json(file_path_reviews).select("category", "reviewText")

Time: 0.6782345771789551 seconds.



df: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [48]:
import scala.io.Source.fromFile

val stopWords = fromFile(file_path_stopwords).getLines.toArray

import scala.io.Source.fromFile
stopWords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, causes, ...


### Helpers

In [5]:
import java.io.PrintWriter
import org.apache.spark.rdd.RDD
import scala.collection.immutable.TreeSet

def writeRDDToFile(rdd: RDD[(String, Seq[(String, Double)])], filePath: String): Unit = {
    var mergedTerms = TreeSet[String]()
    val writer = new PrintWriter(filePath)
    
    val collectedData = rdd.sortByKey().collect()

    for ((category, topk) <- collectedData) {
        val topKStr = topk.map { case (term, chiSquared) => 
            mergedTerms += term
            s"$term:$chiSquared"
        }.mkString(" ")
        writer.println(s"<$category> $topKStr")
    }
    
    writer.print(mergedTerms.mkString(" "))
    
    writer.close()
}


import java.io.PrintWriter
import org.apache.spark.rdd.RDD
import scala.collection.immutable.TreeSet
writeRDDToFile: (rdd: org.apache.spark.rdd.RDD[(String, Seq[(String, Double)])], filePath: String)Unit


In [6]:
import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter

def writeDFToFile(df: Dataset[Row], filePath: String) = {
    var mergedTerms = TreeSet[String]()
    val writer = new PrintWriter(filePath)
    
    val collectedData = df.collect()

    for (row <- collectedData) {
      val category = row.getString(0)
      val topk = row.getAs [Seq[Seq[String]]]("topk").map({
        case Seq(token, chiSquared) => { 
            mergedTerms += token
            s"$token:$chiSquared" 
        }
      }).mkString(" ")

      writer.println(f"<$category> $topk")
    }
    writer.print(mergedTerms.mkString(" "))
    
    writer.close()
}

import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter
writeDFToFile: (df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], filePath: String)Unit


## Calculate Chi-Square
This approaches, first calculates the different number of documents per category and afterward calculates the chi-squared values per term per category.

In [7]:
val rdd = df.rdd

rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[8] at rdd at <console>:34


In [8]:
%%time
val counts = df.rdd.map(row => (row.getString(0), 1)).countByKey()
val N = df.count()

Time: 4.964831113815308 seconds.



counts: scala.collection.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)
N: Long = 78829


In [9]:
%%time
val countsAsMap = counts.toMap

Time: 0.35018062591552734 seconds.



countsAsMap: scala.collection.immutable.Map[String,Long] = Map(Patio_Lawn_and_Garde -> 994, Movies_and_TV -> 4607, Electronic -> 7825, Office_Product -> 1243, Tools_and_Home_Improvement -> 1926, Kindle_Store -> 3205, Home_and_Kitche -> 4254, Digital_Music -> 836, Automotive -> 1374, Grocery_and_Gourmet_Food -> 1297, Baby -> 916, Book -> 22507, Clothing_Shoes_and_Jewelry -> 5749, Toys_and_Game -> 2253, Health_and_Personal_Care -> 2982, Sports_and_Outdoor -> 3269, Beauty -> 2023, CDs_and_Vinyl -> 3749, Musical_Instrument -> 500, Cell_Phones_and_Accessorie -> 3447, Apps_for_Android -> 2638, Pet_Supplie -> 1235)


In [10]:
%%time
val preprocessedRDD = rdd
    .map(row => (row.getString(0), row.getString(1).toLowerCase().split(tokenizePattern).distinct))
    .map(row => (row._1, row._2.filter(w => w.length > 1 && !stopWords.contains(w))))

val termCategoryCounts = preprocessedRDD
    .flatMapValues(terms => terms)
    .map({ case (category, term) => ((category, term), 1) })
    .reduceByKey(_ + _)
    .map({ case ((category, term), count) => (term, (category, count)) })

val chiSquaredValues = termCategoryCounts
    .groupByKey()
    .flatMapValues({ categoryCounts =>
        val n_t = categoryCounts.map(row => row._2).sum
        categoryCounts.map({ case (category, count) =>
            val A = count
            val B = n_t - A
            val C = countsAsMap(category) - A
            val D = N - A - B - C
            val chiSquared = (N * math.pow((A * D) - (B * C), 2)) / ((A + B) * (A + C) * (B + D) * (C + D))
            (category, chiSquared)
        })
      })
   
val topTermsPerCategory = chiSquaredValues
    .map({ case (term, (category, chiSquared)) => (category, (term, chiSquared)) })
    .groupByKey()
    .mapValues(_.toSeq.sortBy(-_._2).take(K))

Time: 1.2302906513214111 seconds.



preprocessedRDD: org.apache.spark.rdd.RDD[(String, Array[String])] = MapPartitionsRDD[20] at map at <console>:41
termCategoryCounts: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[24] at map at <console>:47
chiSquaredValues: org.apache.spark.rdd.RDD[(String, (String, Double))] = MapPartitionsRDD[26] at flatMapValues at <console>:51
topTermsPerCategory: org.apache.spark.rdd.RDD[(String, Seq[(String, Double)])] = MapPartitionsRDD[29] at mapValues at <console>:66


In [11]:
%%time
writeRDDToFile(topTermsPerCategory, "../output_rdd.txt")

Time: 27.114131927490234 seconds.



### Second approach
In this approach we calculate the number of documents per category "on the fly".

In [12]:
%%time
import org.apache.spark.rdd.RDD

def preprocessing(row: Row): Seq[((String, Option[String]), Int)] = {
  val category = row.getString(0)
  val reviewText = row.getString(1)

  val terms = reviewText
    .toLowerCase()
    .split(tokenizePattern)
    .filter(token => token.length > 1 && !stopWords.contains(token))
    .toSet

  val counts = Seq(((category, None), 1)) ++ terms.map(token => ((category, Some(token)), 1))
  counts.toSeq
}

def tokenToKey(row: ((String, Option[String]), Int)): (Option[String], (String, Int)) = {
  val ((category, token), count) = row
  (token, (category, count))
}

def tokenSum(row: (Option[String], Iterable[(String, Int)])): Iterable[(String, (Option[String], Int, Int))] = {
  val (token, values) = row
  val counts = values.groupBy(_._1).mapValues(_.map(_._2).sum)
  val n_t = counts.values.sum

  counts.map { case (category, count) => (category, (token, count, n_t)) }
}

def chiSquared(row: (String, Iterable[(Option[String], Int, Int)])): (String, Seq[(String, Double)]) = {
  val (category, values) = row
  val counts = values.map { case (token, count, n_t) => token -> (count, n_t) }.toMap
  val (n_c, n) = counts.getOrElse(None, (0, counts.values.map(_._2).sum))

  val results = counts
    .collect {
      case (Some(token), (a, n_t)) =>
        val b = n_t - a
        val c = n_c - a
        val d = n - a - b - c
        val chiSquaredValue = n.toDouble * math.pow(a * d - b * c, 2) / ((a + b).toDouble * (a + c) * (b + d) * (c + d))
      
        (token, chiSquaredValue)
    }
    .toSeq
    .sortBy(-_._2)
    .take(K)

  (category, results)
}

Time: 0.7137079238891602 seconds.



import org.apache.spark.rdd.RDD
preprocessing: (row: org.apache.spark.sql.Row)Seq[((String, Option[String]), Int)]
tokenToKey: (row: ((String, Option[String]), Int))(Option[String], (String, Int))
tokenSum: (row: (Option[String], Iterable[(String, Int)]))Iterable[(String, (Option[String], Int, Int))]
chiSquared: (row: (String, Iterable[(Option[String], Int, Int)]))(String, Seq[(String, Double)])


In [13]:
%%time
val topTermsPerCategory = rdd
  .flatMap(preprocessing)
  .reduceByKey(_ + _)
  .map(tokenToKey)
  .groupByKey()
  .flatMap(tokenSum)
  .groupByKey()
  .map(chiSquared)
  .sortByKey()

Time: 27.783408880233765 seconds.



topTermsPerCategory: org.apache.spark.rdd.RDD[(String, Seq[(String, Double)])] = ShuffledRDD[42] at sortByKey at <console>:47


In [14]:
%%time
writeRDDToFile(topTermsPerCategory, "../output_rdd.txt")

Time: 1.1373136043548584 seconds.



## Datasets/DataFrames: Spark ML and Pipelines

First, create all the necessary transformers!

In [49]:
import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}

val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("category_index")

val tokenizer = new RegexTokenizer()
    .setInputCol("reviewText")
    .setOutputCol("raw_terms")
    .setMinTokenLength(2)
    .setPattern(tokenizePattern)
    .setToLowercase(true)

val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("terms")
    .setStopWords(stopWords)

val countVectorizer = new CountVectorizer()
    .setInputCol(remover.getOutputCol)
    .setOutputCol("raw_features")
    .setMinDF(1)

// Decide which frequencyCounter you want hashingTF vs cvModel?
// val hashingTF = new HashingTF()
//    .setInputCol(tokenizer.getOutputCol)
//    .setOutputCol("raw_features")

val idf = new IDF()
    .setInputCol(countVectorizer.getOutputCol)
    .setOutputCol("features")

import org.apache.spark.ml.feature.{StringIndexer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_083ba66f52ea
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_fe9487f58d14, minTokenLength=2, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_e02c4cdc50a1, numStopWords=596, locale=en_US, caseSensitive=false
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_c0a25880bfff
idf: org.apache.spark.ml.feature.IDF = idf_9dba73983742


Afterward, we create the Chi^2-Selector

In [50]:
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}

val selector = new ChiSqSelector()
  .setNumTopFeatures(2000)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("category_index")
  .setOutputCol("selectedFeatures")

import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_93618e654f32


Lastly, we create the pipeline to execute all the transformers and select the top K features

In [51]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val pipeline = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline: org.apache.spark.ml.Pipeline = pipeline_1040765eebac


After creation of the pipeline, we can now fit it to our data, we want to transform

In [18]:
%%time
val model = pipeline.fit(df)

Time: 46.24801969528198 seconds.



model: org.apache.spark.ml.PipelineModel = pipeline_bbd84c8d218d


Afterward, we can extract the vocabulary and selected features to map them

In [19]:
val vocabulary = model.stages(2).asInstanceOf[CountVectorizerModel].vocabulary
val selectedFeatures = model.stages.last.asInstanceOf[ChiSqSelectorModel].selectedFeatures

vocabulary: Array[String] = Array(great, good, love, time, work, recommend, back, easy, make, bought, made, find, buy, price, put, reading, quality, people, works, quot, years, nice, characters, long, series, lot, found, author, day, bit, feel, makes, thing, perfect, fit, end, set, loved, things, thought, music, small, hard, give, year, world, size, worth, pretty, times, sound, written, light, real, big, amazon, part, bad, highly, money, excellent, purchased, happy, high, enjoyed, problem, family, interesting, wanted, character, job, review, purchase, man, watch, days, enjoy, place, home, stars, short, writing, play, cover, top, fan, full, fine, color, side, order, wonderful, amazing, point, fact, reviews, ordered, stories, favorite, easily, needed, battery, screen, water, dvd, beautifu...


Last but not least, we need to transform our data and display the format, which could be used going forward to the text classification task!

In [20]:
val rescaledData = model.transform(df).select("category", "selectedFeatures")

rescaledData: org.apache.spark.sql.DataFrame = [category: string, selectedFeatures: vector]


We can now use this transformed data to extract the top 2000 features overall and group them by category and select the top 75 of each.

In [21]:
%%time
import org.apache.spark.ml.linalg.{SparseVector}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types._

// Define UDFs
val sparseVectorToMap = udf((v: SparseVector) => v.indices.zip(v.values).toMap)
val indexToToken = udf((i: Int) => vocabulary(selectedFeatures(i)))

val topK = rescaledData
    .select($"category", explode(sparseVectorToMap($"selectedFeatures")))
    .select($"category", $"key".as("term"), $"value".as("chi_squared"))
    .groupBy("category", "term")
    .agg(mean("chi_squared").as("chi_squared"))
    .withColumn("term", indexToToken(col("term")))
    .orderBy(desc("chi_squared"), asc("term"))
    .withColumn("token_chisquared", array(col("term"), col("chi_squared")))
    .groupBy("category")
    .agg(slice(collect_list("token_chisquared"), 1, K).as("topk"))
    .sort("category")

Time: 0.7742371559143066 seconds.



import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types._
sparseVectorToMap: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4915/1919411326@37b7350d,MapType(IntegerType,DoubleType,false),List(Some(class[value[0]: vector])),Some(class[value[0]: map<int,double>]),None,true,true)
indexToToken: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4920/1716803550@509cae9c,StringType,List(Some(class[value[0]: int])),Some(class[value[0]: string]),None,true,true)
topK: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, topk: array<array<string>>]


In [22]:
%%time
writeDFToFile(topK, "../output_ds.txt")

Time: 12.263301610946655 seconds.



## Text Classification

In [52]:
val seed = 12041500
val percentage_of_dataset = 1

seed: Int = 12041500
percentage_of_dataset: Int = 1


First, lets create two pipelines. One to perfom pre-processing and one to train the classifier. The reason for this is that we don't want to perform all pre-processing steps for each of the runs in the grid-search.

### Pre-processing
First, we pre process the data by performing the whole tokenization and tdidf-calculations.

In [24]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

val pipeline_preprocessing = new Pipeline()
    .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer))

import org.apache.spark.ml.{Pipeline, PipelineModel}
pipeline_preprocessing: org.apache.spark.ml.Pipeline = pipeline_fc877683f3e5


Lastly, we sample and therefore reduce the size of the data to improve performance

In [25]:
val sampled_df = df.sample(withReplacement = false, fraction = percentage_of_dataset, seed = seed)

sampled_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [26]:
val Array(training, test) = sampled_df.randomSplit(Array(0.8, 0.2), seed = seed)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [27]:
val preprocessing = pipeline_preprocessing.fit(training)

preprocessing: org.apache.spark.ml.PipelineModel = pipeline_fc877683f3e5


In [28]:
val preprocessed_trainining = preprocessing.transform(training).select(idf.getOutputCol, "category", indexer.getOutputCol)
val preprocessed_test = preprocessing.transform(test).select(idf.getOutputCol, "category", indexer.getOutputCol)

preprocessed_trainining: org.apache.spark.sql.DataFrame = [features: vector, category: string ... 1 more field]
preprocessed_test: org.apache.spark.sql.DataFrame = [features: vector, category: string ... 1 more field]


In [29]:
preprocessed_trainining.write.mode("overwrite").parquet("training_data.parquet")
preprocessed_test.write.mode("overwrite").parquet("test_data.parquet")

Reload them to have them easily accessible in memory

In [30]:
val trainDF = sc.read.parquet("training_data.parquet")
val testDF = sc.read.parquet("training_data.parquet")

trainDF: org.apache.spark.sql.DataFrame = [features: vector, category: string ... 1 more field]
testDF: org.apache.spark.sql.DataFrame = [features: vector, category: string ... 1 more field]


### Training classifier

Additionally, we create a Normalizer for our selected features.

In [53]:
import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normFeatures")
  .setP(2.0)

import org.apache.spark.ml.feature.Normalizer
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_f7479c2eda16, p=2.0


Then, we create the classifier. In our case we use a Linear Vector Machine. However, because we deal with a multiclass problem, we wrap it in a OneVsRest-classifier to bypass the limitation of Linear Vector Machines, which can only work with binary problems.

In [54]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}

val lsvc = new LinearSVC()

val classifier = new OneVsRest()
    .setClassifier(lsvc)
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_44ca9d66f970
classifier: org.apache.spark.ml.classification.OneVsRest = oneVsRest_ada2f6f879d9


In [55]:
val pipeline_classifier = new Pipeline()
    .setStages(Array(selector, normalizer, classifier))

pipeline_classifier: org.apache.spark.ml.Pipeline = pipeline_abd62de89b22


Now, we only need to create our MulticlassClassificationEvaluator and ParamGridBuilder, which we use for hyperparameter tuning and evaluation of our model. We evaluate our model based on the F1-Score and our hyperparameter tuning happens based on the following params:
- Compare chi square overall top 2000 filtered features with another, heavier filtering with much less dimensionality
- Compare different SVM settings by 
    - varying the regularization parameter (choose 3 different values), 
    - standardization of training features (2 values),
    - and maximum number of iterations (2 values).



In [56]:
import org.apache.spark.ml.tuning.ParamGridBuilder

val paramGrid = new ParamGridBuilder()
    .addGrid(lsvc.maxIter, Array(10, 50))
    .addGrid(lsvc.regParam, Array(0.001, 0.01, 0.1))
    .addGrid(lsvc.standardization, Array(false, true))
    .addGrid(selector.numTopFeatures, Array(20, 2000))
    .build()

import org.apache.spark.ml.tuning.ParamGridBuilder
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_44ca9d66f970-maxIter: 10,
	chiSqSelector_93618e654f32-numTopFeatures: 20,
	linearsvc_44ca9d66f970-regParam: 0.001,
	linearsvc_44ca9d66f970-standardization: false
}, {
	linearsvc_44ca9d66f970-maxIter: 10,
	chiSqSelector_93618e654f32-numTopFeatures: 20,
	linearsvc_44ca9d66f970-regParam: 0.001,
	linearsvc_44ca9d66f970-standardization: true
}, {
	linearsvc_44ca9d66f970-maxIter: 50,
	chiSqSelector_93618e654f32-numTopFeatures: 20,
	linearsvc_44ca9d66f970-regParam: 0.001,
	linearsvc_44ca9d66f970-standardization: false
}, {
	linearsvc_44ca9d66f970-maxIter: 50,
	chiSqSelector_93618e654f32-numTopFeatures: 20,
	linearsvc_44ca9d66f970-regParam: 0.001,
	linearsvc_44ca9d66f970-...


Now, we simply perform the grid-search on a train-validation split and evaluate the best hyperparams on our previously created evaluater.

In [57]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val evaluater = new MulticlassClassificationEvaluator()
    .setLabelCol(indexer.getOutputCol)
    .setMetricName("f1")

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
evaluater: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_b83a1950d040, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15


In [36]:
import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline_classifier)
    .setEvaluator(evaluater)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setSeed(seed)
    .setParallelism(20)

import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_6c0eb8532277


Lastly, we fit the model, with the best hyperparameters to the data and perform predictions with it. Afterward, we evaluate the model based on the F1-Score.

In [37]:
%%time
val model = trainValidationSplit.fit(trainDF)

Time: 1180.5153167247772 seconds.



model: org.apache.spark.ml.tuning.TrainValidationSplitModel = TrainValidationSplitModel: uid=tvs_6c0eb8532277, bestModel=pipeline_482b12d36bfa, trainRatio=0.8


In [38]:
val predictions = model.transform(testDF)

predictions: org.apache.spark.sql.DataFrame = [features: vector, category: string ... 5 more fields]


In [39]:
println(s"F1-Score = ${evaluater.evaluate(predictions)}")

F1-Score = 0.6685228303780189


In [40]:
import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}

val bestModel = model.bestModel.asInstanceOf[PipelineModel]
val bestClassifier = bestModel.stages.last.asInstanceOf[OneVsRestModel]
val bestBinaryClassifierModel = bestClassifier.models.head.asInstanceOf[LinearSVCModel]

println(s"Best binary classifier parameters:\n" +
  s"\tmaxIter: ${bestBinaryClassifierModel.getMaxIter}\n" +
  s"\tregParam: ${bestBinaryClassifierModel.getRegParam}\n" +
  s"\tstandardization: ${bestBinaryClassifierModel.getStandardization}")

Best binary classifier parameters:
	maxIter: 10
	regParam: 0.001
	standardization: true


import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}
bestModel: org.apache.spark.ml.PipelineModel = pipeline_482b12d36bfa
bestClassifier: org.apache.spark.ml.classification.OneVsRestModel = OneVsRestModel: uid=oneVsRest_bb12bdfb41f3, classifier=linearsvc_cad71bb5a8e8, numClasses=22, numFeatures=2000
bestBinaryClassifierModel: org.apache.spark.ml.classification.LinearSVCModel = LinearSVCModel: uid=linearsvc_cad71bb5a8e8, numClasses=2, numFeatures=2000


### Try-out

In [58]:
import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normFeatures")
  .setP(2.0)

import org.apache.spark.ml.feature.Normalizer
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_143d6f976863, p=2.0


In [59]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}

val lsvc = new LinearSVC()
val classifier = new OneVsRest()
    .setClassifier(lsvc)
    .setFeaturesCol(normalizer.getOutputCol)
    .setLabelCol(indexer.getOutputCol)

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_02ce1607396d
classifier: org.apache.spark.ml.classification.OneVsRest = oneVsRest_d857996ac75b


In [68]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}

def pre_processing_classifier(df: DataFrame, numFeatures: Integer) = {
    val selector = new ChiSqSelector()
      .setNumTopFeatures(numFeatures)
      .setFeaturesCol(idf.getOutputCol)
      .setLabelCol("category_index")
      .setOutputCol("selectedFeatures")
    
    val pipeline_preprocessing = new Pipeline()
        .setStages(Array(tokenizer, remover, countVectorizer, idf, indexer, selector, normalizer))

    val Array(training, test) = df.randomSplit(Array(0.8, 0.2), seed = seed)
    val preprocessing = pipeline_preprocessing.fit(training)

    val preprocessed_trainining = preprocessing.transform(training).select(normalizer.getOutputCol, "category", indexer.getOutputCol)
    val preprocessed_test = preprocessing.transform(test).select(normalizer.getOutputCol, "category", indexer.getOutputCol)

    preprocessed_trainining.write.mode("overwrite").parquet("training_data.parquet")
    preprocessed_test.write.mode("overwrite").parquet("test_data.parquet")
}


import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel}
pre_processing_classifier: (df: org.apache.spark.sql.DataFrame, numFeatures: Integer)Unit


In [69]:
import org.apache.spark.ml.tuning.ParamGridBuilder

val paramGrid = new ParamGridBuilder()
    .addGrid(lsvc.maxIter, Array(10, 50))
    .addGrid(lsvc.regParam, Array(0.001, 0.01, 0.1))
    .addGrid(lsvc.standardization, Array(false, true))
    .build()

import org.apache.spark.ml.tuning.ParamGridBuilder
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_02ce1607396d-maxIter: 10,
	linearsvc_02ce1607396d-regParam: 0.001,
	linearsvc_02ce1607396d-standardization: false
}, {
	linearsvc_02ce1607396d-maxIter: 10,
	linearsvc_02ce1607396d-regParam: 0.01,
	linearsvc_02ce1607396d-standardization: false
}, {
	linearsvc_02ce1607396d-maxIter: 10,
	linearsvc_02ce1607396d-regParam: 0.1,
	linearsvc_02ce1607396d-standardization: false
}, {
	linearsvc_02ce1607396d-maxIter: 10,
	linearsvc_02ce1607396d-regParam: 0.001,
	linearsvc_02ce1607396d-standardization: true
}, {
	linearsvc_02ce1607396d-maxIter: 10,
	linearsvc_02ce1607396d-regParam: 0.01,
	linearsvc_02ce1607396d-standardization: true
}, {
	linearsvc_02ce1607396d-maxIter: 10,
	l...


In [70]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 

val evaluater = new MulticlassClassificationEvaluator()
    .setLabelCol(indexer.getOutputCol)
    .setMetricName("f1")

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
evaluater: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_a5c41df99b6f, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15


In [72]:
%%time
import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 
import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}

val numTopFeatures = Array(20, 2000)
for (n <- numTopFeatures) {
    println(s"Starting grid-search for ${n}-topFeatures selected by ChiSquared")
    pre_processing_classifier(df, n)
    val trainDF = sc.read.parquet("training_data.parquet")
    val testDF = sc.read.parquet("training_data.parquet")
    
    val trainValidationSplit = new TrainValidationSplit()
        .setEstimator(classifier)
        .setEvaluator(evaluater)
        .setEstimatorParamMaps(paramGrid)
        .setTrainRatio(0.8)
        .setSeed(seed)
        .setParallelism(20)
    
    val model = trainValidationSplit.fit(trainDF)
    val predictions = model.transform(testDF)
    println(s"F1-Score = ${evaluater.evaluate(predictions)}")

    //val bestModel = model.bestModel.asInstanceOf[PipelineModel]
    //val bestClassifier = bestModel.stages.last.asInstanceOf[OneVsRestModel]
    //val bestBinaryClassifierModel = bestClassifier.models.head.asInstanceOf[LinearSVCModel]

    //println(s"Best binary classifier parameters:\n" +
    //  s"\tmaxIter: ${bestBinaryClassifierModel.getMaxIter}\n" +
    //  s"\tregParam: ${bestBinaryClassifierModel.getRegParam}\n" +
    //  s"\tstandardization: ${bestBinaryClassifierModel.getStandardization}")
}

Starting grid-search for 20-topFeatures selected by ChiSquared
F1-Score = 0.1663626942224923
Starting grid-search for 2000-topFeatures selected by ChiSquared
F1-Score = 0.6665776668536328
Time: 832.5181884765625 seconds.



import org.apache.spark.ml.tuning.TrainValidationSplit
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.classification.{OneVsRestModel, LinearSVCModel}
numTopFeatures: Array[Int] = Array(20, 2000)
