## Imports

In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{StringIndexer,HashingTF,CountVectorizer,IDF,RegexTokenizer,StopWordsRemover,ChiSqSelector,Normalizer}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf
import scala.io.Source.fromFile
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector}
import org.apache.spark.sql.functions._


Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_1483
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_1483)
SparkSession available as 'spark'


import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, CountVectorizer, IDF, RegexTokenizer, StopWordsRemover, ChiSqSelector, Normalizer}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf
import org.apache.spark.ml.linalg.SparseVector
import scala.io.Source.fromFile
import org.apache.spark.ml.feature._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector}
import o...


In [2]:
import org.apache.spark.sql.SparkSession
val sc = SparkSession.builder
.appName("SVM Text Classification")
.getOrCreate()

import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@f41e81e


In [3]:
System.getProperty("user.dir")

// for execution using jupyter hub
//val path_to_stopwords = "../data/stopwords.txt"

// for execution us vs code
val path_to_stopwords = "Exercise_2/data/stopwords.txt"

val k = 75
val seed = 42
val split_pattern = "[^a-zA-Z<>^|]+"

path_to_stopwords: String = Exercise_2/data/stopwords.txt
k: Int = 75
seed: Int = 42
split_pattern: String = [^a-zA-Z<>^|]+


## Load Data

In [4]:
val reviewsDF = sc
.read.json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
.select("category","reviewText")
.groupBy("category")
.agg(concat_ws(" ", collect_list("reviewText")).alias("reviewText"))

reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [5]:
val stopwords = fromFile(path_to_stopwords).getLines.toArray

stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, causes, cd, certain, certainly, changes,...


## Pipeline

In [7]:
val tokenizer = new RegexTokenizer()
.setInputCol("reviewText")
.setOutputCol("words")
.setPattern(split_pattern)

val stopWordsFile = path_to_stopwords
val stopwords = scala.io.Source.fromFile(stopWordsFile).getLines().toArray
val remover = new StopWordsRemover()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("tokens")
.setStopWords(stopwords)

val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("label")

val countVectorizer = new CountVectorizer()
.setInputCol("tokens")
.setOutputCol("rawFeatures")

val idf = new IDF()
.setInputCol("rawFeatures")
.setOutputCol("features")

val selector = new ChiSqSelector()
.setNumTopFeatures(75)
.setFeaturesCol("features")
.setLabelCol("label")
.setOutputCol("selectedFeatures")

val preprocessing = new Pipeline().setStages(Array(tokenizer, remover, indexer, countVectorizer, idf, selector))
val preprocessing_model = preprocessing.fit(reviewsDF)

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_4660990c2e9b, minTokenLength=1, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
stopWordsFile: String = Exercise_2/data/stopwords.txt
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behi...


In [14]:
val df = preprocessing_model.transform(reviewsDF).select("category","selectedFeatures")

df: org.apache.spark.sql.DataFrame = [category: string, selectedFeatures: vector]


## Encode selected tokens 

In [16]:
// Extract model and get the vocabulary
val countVectorizerModel = preprocessing_model.stages(3).asInstanceOf[CountVectorizerModel]
val vocabulary = countVectorizerModel.vocabulary


countVectorizerModel: org.apache.spark.ml.feature.CountVectorizerModel = CountVectorizerModel: uid=cntVec_9974e5761bdb, vocabularySize=96129
vocabulary: Array[String] = Array(great, good, love, time, work, recommend, back, easy, make, bought, made, find, buy, price, put, reading, quality, people, works, quot, years, nice, characters, long, series, lot, found, author, day, bit, feel, makes, thing, perfect, fit, end, set, loved, things, thought, music, small, hard, give, year, world, size, worth, pretty, times, sound, written, light, real, big, amazon, part, bad, highly, money, excellent, purchased, happy, high, enjoyed, problem, family, interesting, wanted, character, job, review, purchase, man, watch, days, enjoy, place, home, stars, short, writing, play, cover, top, fan, full, fine, co...


In [60]:
// UDF to map output
val outputUDF = udf { (features: SparseVector) =>
  // features.indices, features.values
  
  val words = features.indices.map(vocabulary)
  val values = features.values
  val sortedIndicesValues = words.zip(values).sortBy(-_._2) // Sort by values in descending order
  val outputString = sortedIndicesValues.map { case (word, value) => f"$word:${"%.2f".format(value)}" }.mkString(", ")
  outputString
}

outputUDF: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4973/1996645485@19228148,StringType,List(Some(class[value[0]: vector])),Some(class[value[0]: string]),None,true,true)


In [66]:
val result = df
.withColumn("top_terms", outputUDF(col("selectedFeatures")))
.orderBy(asc("category"))
.select("category","top_terms")


result.show()

+--------------------+--------------------+
|            category|           top_terms|
+--------------------+--------------------+
|    Apps_for_Android|made:19.83, makes...|
|          Automotive|loved:6.85, autho...|
|                Baby|loved:29.35, easy...|
|              Beauty|written:13.84, bu...|
|                Book|great:119.40, bac...|
|       CDs_and_Vinyl|made:24.47, find:...|
|Cell_Phones_and_A...|love:32.09, easy:...|
|Clothing_Shoes_an...|good:63.97, time:...|
|       Digital_Music|find:3.16, made:2...|
|          Electronic|love:47.34, bad:3...|
|Grocery_and_Gourm...|written:11.74, wo...|
|Health_and_Person...|written:19.71, bu...|
|     Home_and_Kitche|easy:22.20, good:...|
|        Kindle_Store|great:10.58, love...|
|       Movies_and_TV|great:14.09, made...|
|  Musical_Instrument|made:4.82, world:...|
|      Office_Product|day:10.71, size:1...|
|Patio_Lawn_and_Garde|easy:6.00, writte...|
|         Pet_Supplie|good:5.42, easy:5...|
|  Sports_and_Outdoor|world:19.3

result: org.apache.spark.sql.DataFrame = [category: string, top_terms: string]


## Create Output File

In [74]:
import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter


def writeDFToFile(df: Dataset[Row], filePath: String) = {
    val writer = new PrintWriter(filePath)
    
    val collectedData = df.collect()

    for (row <- collectedData) {
        val category = row.getString(0)
        val top_terms = row.getString(1)  // Explicitly specify the type as Vector
        
        writer.println(f"<$category> $top_terms")
    }
    
    writer.close()
}

import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter
writeDFToFile: (df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], filePath: String)Unit


In [76]:
writeDFToFile(result, "Exercise_2/data/output_ds.txt") //Exercise_2/data/output_ds.txt