Create spark session

In [1]:
import org.apache.spark.sql.SparkSession

// Obtain the active SparkSession (or create one) under the application name "ChiSquaredRDD".
// NOTE: the kernel pre-binds `sc` to the SparkContext; this rebinds it to a SparkSession.
val sc = SparkSession
  .builder()
  .appName("ChiSquaredRDD")
  .getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1715326141961_0590
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1715326141961_0590)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
sc: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2365b18e


Load Amazon reviews and stop words

In [4]:
// Read the review JSON from HDFS, keep only the two columns used downstream
// (index 0 = category, index 1 = reviewText) and drop to the RDD[Row] API.
val reviews = sc.read
  .json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
  .select("category", "reviewText")
  .rdd

import scala.io.Source.fromFile
// Load the stop-word list (one word per line) on the driver.
// The Source is closed explicitly: `getLines.toArray` alone leaves the file handle open.
val stopwords = {
  val source = fromFile("../data/stopwords.txt")
  try source.getLines.toArray finally source.close()
}

reviews: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[26] at rdd at <console>:28
import scala.io.Source.fromFile
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike...


Calculate chi-square values and output the top k (= 75) words by category

In [None]:
%%time

// avg. 44 seconds

// Number of reviews per category, brought to the driver as an immutable Map (category -> count).
val n_docs_by_cat = reviews
  .map(row => (row.getString(0), 1))
  .countByKey()
  .toMap

// Total number of reviews. Each review contributes exactly one entry to the per-category
// counts above, so their sum equals `reviews.count()` without a second pass over the data.
val N = n_docs_by_cat.values.sum


// Hash-based copy of the stop-word list so each token lookup is a set probe
// instead of a linear scan over the Array.
val stopwordSet: Set[String] = stopwords.toSet

/**
 * Tokenizes a review text into its distinct, lower-cased terms.
 *
 * The text is lower-cased and split on every run of characters that is not a
 * letter or one of `<`, `>`, `^`, `|`. Tokens of length 0 or 1 and tokens found
 * in the stop-word list are dropped; each remaining token is kept once.
 *
 * @param text raw review text; `null` is treated as an empty text
 * @return the distinct remaining tokens, in order of first occurrence
 */
def preprocess(text: String): Array[String] = {
  // Row.getString yields null for a missing field; avoid a NullPointerException on such rows.
  Option(text).fold(Array.empty[String]) { body =>
    body.toLowerCase
      .split("[^a-zA-Z<>^|]+")
      // Cheap length test first, then the stop-word lookup.
      .filter(word => word.length > 1 && !stopwordSet.contains(word))
      .distinct
  }
}


// Pair every review's category (column 0) with the preprocessed terms of its text (column 1).
val filteredRDD = reviews.map { row =>
  val category = row.getString(0)
  val terms = preprocess(row.getString(1))
  (category, terms)
}

// Count, for each (category, term) pair, how many reviews contain the term,
// then re-key the result by term: (term, (category, count)).
val a = filteredRDD
  .flatMapValues(termsOfReview => termsOfReview)
  .map { case (cat, word) =>
    val pairKey = (cat, word)
    (pairKey, 1)
  }
  .reduceByKey((left, right) => left + right)
  .map { case ((cat, word), docCount) =>
    (word, (cat, docCount))
  }


/**
 * Computes the chi-squared score of one token for every category it occurs in.
 *
 * For each (token, category) pair the 2x2 contingency table is:
 *   A = reviews in the category containing the token
 *   B = reviews outside the category containing the token
 *   C = reviews in the category not containing the token
 *   D = reviews outside the category not containing the token
 * and the score is N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D)).
 *
 * Reads the driver-side values `N` and `n_docs_by_cat`.
 *
 * @param token  the term being scored
 * @param values (category, review count) pairs for this token
 * @return one (category, (token, chiSquared)) entry per category in `values`
 */
def reducer_token_sum(token: String, values: Iterable[(String, Int)]): Traversable[(String, (String, Double))] = {
  val counts = values.toMap
  val n_t = counts.values.sum
  val total = N.toDouble
  counts.map { case (category, count) =>
    // All arithmetic is done in Double: the denominator multiplies four marginals that can
    // each approach N, and that product overflows Long silently on large corpora.
    val A = count.toDouble
    val B = n_t - A
    val C = n_docs_by_cat(category) - A
    val D = total - A - B - C
    val denominator = (A + B) * (A + C) * (B + D) * (C + D)
    // A zero marginal (token in every review, or a category holding every review) makes the
    // numerator zero as well; report "no association" (0.0) instead of 0/0 = NaN.
    val chisquared =
      if (denominator == 0.0) 0.0
      else (total * math.pow((A * D) - (B * C), 2)) / denominator
    (category, (token, chisquared))
  }
}


// Gather the per-category counts of each term and score them:
// yields (category, (term, chiSquared)) pairs.
// No collect() here: pulling every scored pair to the driver only to discard it costs an
// extra Spark job and driver memory; the pairs are consumed by the top-k step below.
val b = a.groupByKey.flatMap { case (term, categoryCounts) => reducer_token_sum(term, categoryCounts) }


// For every category keep the 75 highest-scoring terms (descending score),
// then order the categories by name.
val topK = b
  .groupByKey()
  .mapValues { scoredTerms =>
    scoredTerms.toList
      .sortBy { case (_, score) => -score }
      .take(75)
  }
  .sortByKey()


// Render each category as one line: "<category> term1:score1 term2:score2 ...".
val output = topK.map { case (category, rankedTerms) =>
  val rendered = rankedTerms
    .map { case (term, score) => s"$term:$score" }
    .mkString(" ")
  s"<$category> $rendered"
}


import java.io.PrintWriter
/**
 * Collects an RDD of text lines to the driver and writes them to a local file, one per line.
 *
 * @param rdd      lines to write; the entire RDD is collected to the driver
 * @param filePath path of the local file to create or overwrite
 */
def writeRDDToFile(rdd: org.apache.spark.rdd.RDD[String], filePath: String): Unit = {
    // Run the Spark job before opening the file, so a failed job does not leave
    // an empty or truncated output file behind.
    val lines = rdd.collect()
    val writer = new PrintWriter(filePath)
    // Close the writer even if a write fails, so the file handle is not leaked.
    try {
        lines.foreach(line => writer.println(line))
    } finally {
        writer.close()
    }
}

// Persist the formatted per-category lines to the local data directory.
writeRDDToFile(rdd = output, filePath = "../data/output_rdd.txt")

In [None]:
// Bring the formatted lines to the driver so the notebook displays them as the cell result.
output.collect()