In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer,HashingTF,IDF,RegexTokenizer,StopWordsRemover,ChiSqSelector}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import scala.io.Source.fromFile

Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_1260
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_1260)
SparkSession available as 'spark'


import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, IDF, RegexTokenizer, StopWordsRemover, ChiSqSelector}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import scala.io.Source.fromFile


In [2]:
import org.apache.spark.sql.SparkSession
val sc = SparkSession.builder
.appName("SVM Text Classification")
.getOrCreate()

import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@62d710fd


## Set global variables

In [3]:
// for execution using jupyter hub
// val path_to_stopwords = "../data/stopwords.txt"

// for execution us vs code
val path_to_stopwords = "Exercise_2/data/stopwords.txt"

val k = 2000
val seed = 42
val split_pattern = "[^a-zA-Z<>^|]+"

path_to_stopwords: String = Exercise_2/data/stopwords.txt
k: Int = 2000
seed: Int = 42
split_pattern: String = [^a-zA-Z<>^|]+


## Load amazon reviews

In [4]:
val reviewsDF = sc
.read.json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
.select("category","reviewText")

val stopwords = fromFile(path_to_stopwords).getLines.toArray

reviewsDF.printSchema()

root
 |-- category: string (nullable = true)
 |-- reviewText: string (nullable = true)



reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, ca...


## Define preprocessing and feauture extraction pipeline

In [5]:
// tokenize
val tokenizer = new RegexTokenizer()
.setInputCol("reviewText")
.setOutputCol("words")
.setPattern(split_pattern)

// remove stopwords
val stopWordsFile = path_to_stopwords
val remover = new StopWordsRemover()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("filtered")
.setStopWords(stopwords)

// turn words into numerical features
val hashingTF = new HashingTF()
.setInputCol(remover.getOutputCol)
.setOutputCol("rawFeatures")

// scale feature
val idf = new IDF()
.setInputCol(hashingTF.getOutputCol)
.setOutputCol("features")

// encode category to numerical label
val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("label")

// ChiSqSelector
val selector = new ChiSqSelector()
.setNumTopFeatures(k)
.setFeaturesCol(idf.getOutputCol)
.setLabelCol(indexer.getOutputCol)
.setOutputCol("selected_features")

// Normalize ChiSqSelector
val normalizer = new Normalizer()
  .setInputCol(selector.getOutputCol)
  .setOutputCol("normalized_features")
  .setP(2.0) // L2 normalization

val pipeline_feature_extraction = new Pipeline().setStages(Array(tokenizer, remover, hashingTF, idf, indexer, selector, normalizer))

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_707d0511f5e6, minTokenLength=1, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
stopWordsFile: String = Exercise_2/data/stopwords.txt
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_336d47a691b5, numStopWords=596, locale=en_US, caseSensitive=false
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_21815b2b1ebc, binary=false, numFeatures=262144
idf: org.apache.spark.ml.feature.IDF = idf_6d536948173b
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_691cadd487ba
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_0a9bb5fdd735
normalizer: org.apache.spark.ml.feature.Normalizer = Normalizer: uid=normalizer_d667e6b...


## Train/Test Split

In [6]:
val Array(train, test) = reviewsDF
.sample(withReplacement = false, fraction = 0.01, seed = seed)
.randomSplit(Array(0.8,0.2), seed=seed)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [7]:
train.count()

res1: Long = 705


## Apply pipeline to train and test data

In [8]:
%%time

val preprocessed_train = pipeline_feature_extraction
.fit(train)
.transform(train)
.select("label", "normalized_features")

Time: 43.52297329902649 seconds.



preprocessed_train: org.apache.spark.sql.DataFrame = [label: double, normalized_features: vector]


In [9]:
%%time

val preprocessed_test = pipeline_feature_extraction
.fit(test)
.transform(test)
.select("label", "normalized_features")

Time: 28.62164330482483 seconds.



preprocessed_test: org.apache.spark.sql.DataFrame = [label: double, normalized_features: vector]


In [10]:
preprocessed_train.show(3)

+-----+--------------------+
|label| normalized_features|
+-----+--------------------+
|  7.0|(2000,[68,1224,14...|
|  7.0|(2000,[9,597,915,...|
|  7.0|(2000,[301,317,58...|
+-----+--------------------+
only showing top 3 rows



In [11]:
val X_train = preprocessed_train.select("normalized_features")
val y_train = preprocessed_train.select("label")
val X_test = preprocessed_test.select("normalized_features")
val y_test = preprocessed_test.select("label")

X_train: org.apache.spark.sql.DataFrame = [normalized_features: vector]
y_train: org.apache.spark.sql.DataFrame = [label: double]
X_test: org.apache.spark.sql.DataFrame = [normalized_features: vector]
y_test: org.apache.spark.sql.DataFrame = [label: double]


## Persist preprocessed data

In [12]:
val path_to_store = "Exercise_2/data/"

preprocessed_train
.write
.mode("overwrite")
.parquet(path_to_store + "training_data.parquet")

preprocessed_test
.write
.mode("overwrite")
.parquet(path_to_store + "test_data.parquet")



path_to_store: String = Exercise_2/data/


## Load parquet files

In [13]:
val trainDF = sc.read.parquet(path_to_store + "training_data.parquet")
val testDF = sc.read.parquet(path_to_store + "training_data.parquet")


trainDF: org.apache.spark.sql.DataFrame = [label: double, normalized_features: vector]
testDF: org.apache.spark.sql.DataFrame = [label: double, normalized_features: vector]


In [14]:
trainDF.show(2)

+-----+--------------------+
|label| normalized_features|
+-----+--------------------+
|  7.0|(2000,[68,1224,14...|
|  7.0|(2000,[9,597,915,...|
+-----+--------------------+
only showing top 2 rows



## Define estimator and evaluator

In [15]:
val svm = new LinearSVC()
.setFeaturesCol("normalized_features") //normalizer.getOutputCol
.setLabelCol("label") // indexer.getOutputCol

val ovr_svm = new OneVsRest()
.setClassifier(svm)
.setFeaturesCol("normalized_features")
.setLabelCol("label")

//val pipeline_train_model = new Pipeline()
//.setStages(ovr_svm)

svm: org.apache.spark.ml.classification.LinearSVC = linearsvc_42c595a0e3d6
ovr_svm: org.apache.spark.ml.classification.OneVsRest = oneVsRest_0fc10cef1bd0


## Grid Search pipeline

In [16]:
val evaluator_f1 = new MulticlassClassificationEvaluator()
.setMetricName("f1")

val paramGrid = new ParamGridBuilder()
.addGrid(svm.maxIter, Array(8,34))
.addGrid(svm.regParam, Array(0.05, 0.13, 0.55))
.addGrid(selector.numTopFeatures, Array(377, 2000))
.addGrid(svm.standardization, Array(false, true))
.build()

val trainValidationSplit = new TrainValidationSplit()
.setEstimator(ovr_svm)
.setEvaluator(evaluator_f1)
.setEstimatorParamMaps(paramGrid)
.setSeed(seed)

evaluator_f1: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_6cb6f23fb8f3, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_42c595a0e3d6-maxIter: 8,
	chiSqSelector_0a9bb5fdd735-numTopFeatures: 377,
	linearsvc_42c595a0e3d6-regParam: 0.05,
	linearsvc_42c595a0e3d6-standardization: false
}, {
	linearsvc_42c595a0e3d6-maxIter: 8,
	chiSqSelector_0a9bb5fdd735-numTopFeatures: 377,
	linearsvc_42c595a0e3d6-regParam: 0.05,
	linearsvc_42c595a0e3d6-standardization: true
}, {
	linearsvc_42c595a0e3d6-maxIter: 8,
	chiSqSelector_0a9bb5fdd735-numTopFeatures: 377,
	linearsvc_42c595a0e3d6-regParam: 0.13,
	linearsvc_42c595a0e3d6-standardization: false
}, {
	linearsv...


## Perform Grid Search

In [None]:
%%time
val grid_search_model = trainValidationSplit.fit(trainDF)

## Evaluate

In [None]:
val predictons = grid_search_model.transform(testDF)
val f1_score = evaluator.evaluate(predictons)