In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{StringIndexer,HashingTF,CountVectorizer,IDF,RegexTokenizer,StopWordsRemover,ChiSqSelector,Normalizer}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf
import scala.io.Source.fromFile
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector}
import org.apache.spark.sql.functions._


Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_1521
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_1521)
SparkSession available as 'spark'


import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, CountVectorizer, IDF, RegexTokenizer, StopWordsRemover, ChiSqSelector, Normalizer}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf
import scala.io.Source.fromFile
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector}
import org.apache.spark.sql.functions._


In [2]:
import org.apache.spark.sql.SparkSession
val sc = SparkSession.builder
.appName("SVM Text Classification")
.getOrCreate()

import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3832c56c


In [3]:
System.getProperty("user.dir")

// for execution using jupyter hub
//val path_to_stopwords = "../data/stopwords.txt"

// for execution us vs code
val path_to_stopwords = "Exercise_2/data/stopwords.txt"

val k = 75
val seed = 42
val split_pattern = "[^a-zA-Z<>^|]+"

path_to_stopwords: String = Exercise_2/data/stopwords.txt
k: Int = 75
seed: Int = 42
split_pattern: String = [^a-zA-Z<>^|]+


## Load Data

In [4]:
val reviewsDF = sc
.read.json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
.select("category","reviewText")


reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [5]:
val stopwords = fromFile(path_to_stopwords).getLines.toArray

stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, causes, cd, certain, certainly, changes,...


## Pipeline

In [6]:
val tokenizer = new RegexTokenizer()
.setInputCol("reviewText")
.setOutputCol("words")
.setPattern(split_pattern)

val stopWordsFile = path_to_stopwords
val stopwords = scala.io.Source.fromFile(stopWordsFile).getLines().toArray
val remover = new StopWordsRemover()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("tokens")
.setStopWords(stopwords)

val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("label")

val countVectorizer = new CountVectorizer()
.setInputCol("tokens")
.setOutputCol("rawFeatures")

val idf = new IDF()
.setInputCol("rawFeatures")
.setOutputCol("features")

val selector = new ChiSqSelector()
.setNumTopFeatures(75)
.setFeaturesCol("features")
.setLabelCol("label")
.setOutputCol("selectedFeatures")

val preprocessing = new Pipeline().setStages(Array(tokenizer, remover, indexer, countVectorizer, idf, selector))
val preprocessing_model = preprocessing.fit(reviewsDF)

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = RegexTokenizer: uid=regexTok_241035bd8a25, minTokenLength=1, gaps=true, pattern=[^a-zA-Z<>^|]+, toLowercase=true
stopWordsFile: String = Exercise_2/data/stopwords.txt
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behi...


In [7]:
val df = preprocessing_model.transform(reviewsDF).select("category","selectedFeatures")

df: org.apache.spark.sql.DataFrame = [category: string, selectedFeatures: vector]


In [8]:
df.show(3)

+--------------------+--------------------+
|            category|    selectedFeatures|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|(75,[2,3,7,8,35],...|
|Patio_Lawn_and_Garde|(75,[0,1,3,21,39]...|
|Patio_Lawn_and_Garde|(75,[4,10],[2.443...|
+--------------------+--------------------+
only showing top 3 rows



## Encode selected tokens 

In [9]:
// Extract model and get the vocabulary
val countVectorizerModel = preprocessing_model.stages(3).asInstanceOf[CountVectorizerModel]
val vocabulary = countVectorizerModel.vocabulary

// UDF to map output
val encodeSelectedFeaturesUDF = udf { (features: SparseVector) =>
  // features.indices, features.values
  
  val words = features.indices.map(vocabulary)
  words
}

countVectorizerModel: org.apache.spark.ml.feature.CountVectorizerModel = CountVectorizerModel: uid=cntVec_62a354bf03d6, vocabularySize=96129
vocabulary: Array[String] = Array(great, good, love, time, work, recommend, back, easy, make, bought, made, find, buy, price, put, reading, quality, people, works, quot, years, nice, characters, long, series, lot, found, author, day, bit, feel, makes, thing, perfect, fit, end, set, loved, things, thought, music, small, hard, give, year, world, size, worth, pretty, times, sound, written, light, real, big, amazon, part, bad, highly, money, excellent, purchased, happy, high, enjoyed, problem, family, interesting, wanted, character, job, review, purchase, man, watch, days, enjoy, place, home, stars, short, writing, play, cover, top, fan, full, fine, co...


## Prepare Chi Square Calculation

In [10]:
// Step 1: for token t number of occurrens within each category -> A
val tokenFreqByCategory = df
.withColumn("tokens", encodeSelectedFeaturesUDF(col("selectedFeatures")))
.withColumn("token", explode(col("tokens")))
.groupBy("category", "token")
.count()
.withColumnRenamed("count", "A").orderBy(desc("A"))

// Step 2: for token t total number of occurrencs across all categories -> B = this - A
val t_total_number_of_occurrences = tokenFreqByCategory
.groupBy("token")
.agg(sum("A").alias("total_number_of_occurrences")).orderBy(desc("total_number_of_occurrences"))

// Step 3: number of reviews by category
val n_docs_by_cat = df
.groupBy("category")
.agg(count("*")
.as("n_docs_by_cat")) // C = this - A

// Step 4: total number of reviews
val n_of_docs = n_docs_by_cat.agg(sum("n_docs_by_cat").alias("N"))

// join both dataframes
val crossjoin_n_info = n_docs_by_cat.crossJoin(n_of_docs)

tokenFreqByCategory: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 1 more field]
t_total_number_of_occurrences: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [token: string, total_number_of_occurrences: bigint]
n_docs_by_cat: org.apache.spark.sql.DataFrame = [category: string, n_docs_by_cat: bigint]
n_of_docs: org.apache.spark.sql.DataFrame = [N: bigint]
crossjoin_n_info: org.apache.spark.sql.DataFrame = [category: string, n_docs_by_cat: bigint ... 1 more field]


## Calucate Chi Squared Values

In [11]:
val chiSquaredValues = tokenFreqByCategory
.join(t_total_number_of_occurrences, ("token"))
.join(crossjoin_n_info, ("category"))
.withColumn("B", $"total_number_of_occurrences" - $"A")
.withColumn("C", $"n_docs_by_cat" - $"A")
.withColumn("D", $"N" - $"A" - $"B" - $"C")
.withColumn("D", $"N" - $"A" - $"B" - $"C")
.withColumn("chisquared",
  ($"N" * pow($"A" * $"D" - $"B" * $"C", 2)) /
    (($"A" + $"B") * ($"A" + $"C") * ($"B" + $"D") * ($"C" + $"D"))
)
.select("category", "token","chisquared").orderBy(desc("chisquared"))

chiSquaredValues: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 1 more field]


## Top 75

In [12]:
// Top 75
import org.apache.spark.sql.expressions.Window
val windowSpec = Window.partitionBy("category").orderBy(desc("chisquared"))

val top75ByCategory = chiSquaredValues
.withColumn("rank", row_number().over(windowSpec))
.filter(col("rank") <= 75)
//.withColumn("value_str", col("chisquared").cast("string"))


top75ByCategory.show()

+----------------+----------+------------------+----+
|        category|     token|        chisquared|rank|
+----------------+----------+------------------+----+
|Apps_for_Android|       man| 2158.369406820129|   1|
|Apps_for_Android|    bought| 174.4587211734982|   2|
|Apps_for_Android|     price| 149.5959088208227|   3|
|Apps_for_Android|   quality|144.19395657092497|   4|
|Apps_for_Android|       lot|135.34205354169052|   5|
|Apps_for_Android|     makes|126.10402357270085|   6|
|Apps_for_Android|     years|124.35784441658687|   7|
|Apps_for_Android|   reading|121.65120689569069|   8|
|Apps_for_Android|      made|120.91456375779855|   9|
|Apps_for_Android|characters|118.09788512184659|  10|
|Apps_for_Android|      long|115.05716592780018|  11|
|Apps_for_Android|      size|114.84060961362133|  12|
|Apps_for_Android|      back| 99.12491188526246|  13|
|Apps_for_Android|     thing| 95.24822201431748|  14|
|Apps_for_Android|      hard| 93.34366680110551|  15|
|Apps_for_Android|       put

import org.apache.spark.sql.expressions.Window
windowSpec: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@38217c4a
top75ByCategory: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 2 more fields]


In [13]:
val output = top75ByCategory
.groupBy("category")
.agg(collect_list(concat_ws( ":", $"token", $"chisquared")) as "top_tokens")

output.show()

+--------------------+--------------------+
|            category|          top_tokens|
+--------------------+--------------------+
|    Apps_for_Android|[man:2158.3694068...|
|          Automotive|[makes:345.042680...|
|                Baby|[easy:111.8953956...|
|              Beauty|[reading:117.5479...|
|                Book|[reading:6184.609...|
|       CDs_and_Vinyl|[loved:13083.7708...|
|Cell_Phones_and_A...|[watch:446.765539...|
|Clothing_Shoes_an...|[hard:3312.666743...|
|       Digital_Music|[loved:1420.97017...|
|          Electronic|[works:1800.79675...|
|Grocery_and_Gourm...|[sound:118.998168...|
|Health_and_Person...|[lot:153.96737881...|
|     Home_and_Kitche|[characters:246.5...|
|        Kindle_Store|[lot:1079.2803955...|
|       Movies_and_TV|[interesting:4310...|
|  Musical_Instrument|[world:491.957905...|
|      Office_Product|[price:111.146121...|
|Patio_Lawn_and_Garde|[works:66.7466270...|
|         Pet_Supplie|[reading:60.28768...|
|  Sports_and_Outdoor|[makes:340

output: org.apache.spark.sql.DataFrame = [category: string, top_tokens: array<string>]


In [14]:
import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter


def writeDFToFile(df: Dataset[Row], filePath: String) = {
    val writer = new PrintWriter(filePath)
    
    val collectedData = df.collect()

    for (row <- collectedData) {
        val category = row.getString(0)
        val top_terms = row.getSeq(1).mkString(", ")  // Explicitly specify the type as Vector
        
        writer.println(f"<$category> $top_terms")
    }
    
    writer.close()
}



import org.apache.spark.sql.{Row, Dataset}
import scala.collection.immutable.TreeSet
import java.io.PrintWriter
writeDFToFile: (df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], filePath: String)Unit


In [15]:
writeDFToFile(output, "Exercise_2/data/output_ds_group_last.txt")