In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer,HashingTF,IDF,RegexTokenizer,StopWordsRemover,ChiSqSelector}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import scala.io.Source.fromFile

Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_0693
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_0693)
SparkSession available as 'spark'


import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, IDF, RegexTokenizer, StopWordsRemover, ChiSqSelector}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import scala.io.Source.fromFile


In [2]:
import org.apache.spark.sql.SparkSession
val sc = SparkSession.builder
.appName("SVM Text Classification")
.getOrCreate()

import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@39d509a3


## Set global variables

In [3]:
val path_to_stopwords = "../data/stopwords.txt"

val k = 2000
val seed = 42
val split_pattern = "[^a-zA-Z<>^|]+"

path_to_stopwords: String = ../data/stopwords.txt
k: Int = 2000
seed: Int = 42
split_pattern: String = [^a-zA-Z<>^|]+


## Load amazon reviews

In [4]:
val reviewsDF = sc
.read.json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
.select("category","reviewText")

val stopwords = fromFile(path_to_stopwords).getLines.toArray

reviewsDF.printSchema()

root
 |-- category: string (nullable = true)
 |-- reviewText: string (nullable = true)



reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, ca...


## Define preprocessing and feauture extraction pipeline

In [5]:
// tokenize
val tokenizer = new RegexTokenizer()
.setInputCol("reviewText")
.setOutputCol("words")
.setPattern(split_pattern)

// remove stopwords
val stopWordsFile = path_to_stopwords
val remover = new StopWordsRemover()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("filtered")
.setStopWords(stopwords)

// turn words into numerical features
val hashingTF = new HashingTF()
.setInputCol(remover.getOutputCol)
.setOutputCol("rawFeatures")

// scale feature
val idf = new IDF()
.setInputCol(hashingTF.getOutputCol)
.setOutputCol("features")

// encode category to numerical label
val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("label")

// ChiSqSelector
val selector = new ChiSqSelector()
.setNumTopFeatures(k)
.setFeaturesCol(idf.getOutputCol)
.setLabelCol(indexer.getOutputCol)
.setOutputCol("selected_features")

val pipeline_feature_extraction = new Pipeline().setStages(Array(tokenizer, remover, hashingTF, idf, indexer, selector))

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_82e30c3305c5, minTokenLength=1, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
stopWordsFile: String = ../data/stopwords.txt
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_8d3e98c2a84e, numStopWords=596, locale=en_US, caseSensitive=false
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_5669e8004f5d, binary=false, numFeatures=262144
idf: org.apache.spark.ml.feature.IDF = idf_5e45bbf6954c
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_c4f2e88e1de8
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_a16c1ca0d1a0
pipeline_feature_extraction: org.apache.spark.ml.Pipeline = pipeline_ab21c01c37de


## Train/Test Split

In [6]:
val Array(train, test) = reviewsDF
.sample(withReplacement = false, fraction = 0.5, seed = seed)
.randomSplit(Array(0.8,0.2), seed=seed)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [69]:
train.show(5)

+----------------+--------------------+
|        category|          reviewText|
+----------------+--------------------+
|Apps_for_Android|6 inches on a rea...|
|Apps_for_Android|A fun and very ad...|
|Apps_for_Android|Amazing very addi...|
|Apps_for_Android|An enjoyable puzz...|
|Apps_for_Android|App has links to ...|
+----------------+--------------------+
only showing top 5 rows



## Define pipeline for training process

In [7]:
val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normalized_features")
  .setP(2.0) // L2 normalization

val svm = new LinearSVC()
.setFeaturesCol(normalizer.getOutputCol)
.setLabelCol(indexer.getOutputCol)

val ovr_svm = new OneVsRest()
.setClassifier(svm)
.setFeaturesCol(normalizer.getOutputCol)
.setLabelCol(indexer.getOutputCol)

val pipeline_train_model = new Pipeline()
.setStages(Array(pipeline_feature_extraction, normalizer, ovr_svm))

normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_6054856c51ee, p=2.0
svm: org.apache.spark.ml.classification.LinearSVC = linearsvc_6f76aba611fb
ovr_svm: org.apache.spark.ml.classification.OneVsRest = oneVsRest_f2a545e40ec8
pipeline_train_model: org.apache.spark.ml.Pipeline = pipeline_21d6d0411531


# Grid Search

In [8]:
val evaluator_f1 = new MulticlassClassificationEvaluator()
.setMetricName("f1")

val paramGrid = new ParamGridBuilder()
.addGrid(svm.maxIter, Array(8,34))
.addGrid(svm.regParam, Array(0.05, 0.13, 0.55))
.addGrid(selector.numTopFeatures, Array(377, 2000))
.addGrid(svm.standardization, Array(false, true))
.build()

val trainValidationSplit = new TrainValidationSplit()
.setEstimator(pipeline_train_model)
.setEvaluator(evaluator_f1)
.setEstimatorParamMaps(paramGrid)
.setSeed(seed)

evaluator_f1: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_c53696f678e7, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_6f76aba611fb-maxIter: 8,
	chiSqSelector_a16c1ca0d1a0-numTopFeatures: 377,
	linearsvc_6f76aba611fb-regParam: 0.05,
	linearsvc_6f76aba611fb-standardization: false
}, {
	linearsvc_6f76aba611fb-maxIter: 8,
	chiSqSelector_a16c1ca0d1a0-numTopFeatures: 2000,
	linearsvc_6f76aba611fb-regParam: 0.05,
	linearsvc_6f76aba611fb-standardization: false
}, {
	linearsvc_6f76aba611fb-maxIter: 8,
	chiSqSelector_a16c1ca0d1a0-numTopFeatures: 377,
	linearsvc_6f76aba611fb-regParam: 0.13,
	linearsvc_6f76aba611fb-standardization: false
}, {
	linear...


In [9]:
%%time
val grid_search_model = trainValidationSplit.fit(train)

Time: 3477.2955679893494 seconds.



grid_search_model: org.apache.spark.ml.tuning.TrainValidationSplitModel = TrainValidationSplitModel: uid=tvs_6aa092eea10a, bestModel=pipeline_21d6d0411531, trainRatio=0.75


In [10]:
// predict
val grid_search_predicton = grid_search_model.transform(test)

// calculate f1 score 
val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("f1")
val f1_score_grid_search_model = evaluator.evaluate(grid_search_predicton)

grid_search_predicton: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 9 more fields]
evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_9ca726e2353e, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15
f1_score_grid_search_model: Double = 0.5782124644392186


In [32]:
// Extract the best model
// 21d6d0411531
grid_search_model.bestModel.params

res24: org.apache.spark.ml.Model[_] = pipeline_21d6d0411531


In [45]:
val bestModel = grid_search_model.bestModel.asInstanceOf[Pipeline]

// Iterate through each stage in the pipeline and extract parameters
val stages = bestModel.stages
stages.foreach { stage =>
  println(s"Stage: ${stage.uid}")
  println(stage.asInstanceOf[Pipeline].explainParams())
}

<console>: 44: error: value foreach is not a member of org.apache.spark.ml.param.Param[Array[org.apache.spark.ml.PipelineStage]]

### SVM with default parameters

In [None]:
// train the model 
val svm_model = pipeline_train_model.fit(train)

In [124]:
// predict on test data
val predictions = svm_model.transform(test)

predictions: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 9 more fields]


In [67]:
predictions.select("category","label","prediction").show()

+----------------+-----+----------+
|        category|label|prediction|
+----------------+-----+----------+
|Apps_for_Android|  8.0|       9.0|
|Apps_for_Android|  8.0|       3.0|
|Apps_for_Android|  8.0|       8.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|      12.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       6.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       8.0|
|Apps_for_Android|  8.0|       3.0|
|Apps_for_Android|  8.0|       3.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       8.0|
|Apps_for_Android|  8.0|      20.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|      20.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       0.0|
|Apps_for_Android|  8.0|       8.0|
+----------------+-----+----------+
only showing top 20 rows



In [126]:
val f1_score_default_model = evaluator.evaluate(predictions)

f1_score_default_model: Double = 0.35558985563198786


check best param map
check vector counter instead of hashing tf
sample size