In [57]:
// Load the review dev set from HDFS as JSON and keep only the two columns used downstream.
// The former "header" option was dropped: it is a CSV reader option and the JSON source
// does not define it, so it only suggested a behaviour that never applied.
val reviewsDF = spark.read
  .json("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
  .select("category", "reviewText")
reviewsDF.printSchema()

root
 |-- category: string (nullable = true)
 |-- reviewText: string (nullable = true)



reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [71]:
// Tokenization and case folding
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf

/**
 * Tokenizer that lower-cases the input and splits it on every run of characters
 * that is not a letter or one of `<`, `>`, `^`, `|`.
 *
 * Empty tokens are never emitted, and a null input produces an empty token list.
 */
class CustomTokenizer extends Tokenizer with DefaultParamsWritable {

  // use splitting pattern from exercise 1
  override protected def createTransformFunc: String => Seq[String] = { input =>
    // The input column is nullable; return no tokens rather than failing on toLowerCase.
    Option(input).fold(Seq.empty[String]) { text =>
      // String.split keeps a leading "" when the text starts with a delimiter and
      // returns Array("") for empty text; drop those so "" is never treated as a token.
      text.toLowerCase.split("[^a-zA-Z<>^|]+").filter(_.nonEmpty).toSeq
    }
  }
}

// Configure the tokenizer: read "reviewText", write the token array to "words".
val tokenizer = new CustomTokenizer().setInputCol("reviewText").setOutputCol("words")

// Tokenize every review and keep only the category label together with its tokens.
val tokenized = tokenizer
  .transform(reviewsDF)
  .select("category", "words")

// Preview the first 20 rows (the same default that show() uses).
tokenized.show(20)

+--------------------+--------------------+
|            category|               words|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|[this, was, a, gi...|
|Patio_Lawn_and_Garde|[this, is, a, ver...|
|Patio_Lawn_and_Garde|[the, metal, base...|
|Patio_Lawn_and_Garde|[for, the, most, ...|
|Patio_Lawn_and_Garde|[this, hose, is, ...|
|Patio_Lawn_and_Garde|[this, tool, work...|
|Patio_Lawn_and_Garde|[this, product, i...|
|Patio_Lawn_and_Garde|[i, was, excited,...|
|Patio_Lawn_and_Garde|[i, purchased, th...|
|Patio_Lawn_and_Garde|[never, used, a, ...|
|Patio_Lawn_and_Garde|[good, price, goo...|
|Patio_Lawn_and_Garde|[i, have, owned, ...|
|Patio_Lawn_and_Garde|[i, had, won, a, ...|
|Patio_Lawn_and_Garde|[the, birds, ate,...|
|Patio_Lawn_and_Garde|[bought, last, su...|
|Patio_Lawn_and_Garde|[i, knew, i, had,...|
|Patio_Lawn_and_Garde|[i, was, a, littl...|
|Patio_Lawn_and_Garde|[i, have, used, t...|
|Patio_Lawn_and_Garde|[i, actually, do,...|
|Patio_Lawn_and_Garde|[just, wha

import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.functions.udf
defined class CustomTokenizer
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_e7128560c2b3
tokenized: org.apache.spark.sql.DataFrame = [category: string, words: array<string>]


In [73]:
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

/**
 * StopWordsRemover whose stop-word list is read from a text file, one entry per line.
 *
 * The file is opened with scala.io.Source.fromFile and closed again as soon as
 * its lines have been read.
 *
 * @param stopWordsFile path of the stop-word file, as accepted by scala.io.Source.fromFile
 */
class CustomStopWordsRemover(stopWordsFile: String) extends StopWordsRemover {
    // load and set custom stop words
    val customStopWords: Array[String] = {
        val source = scala.io.Source.fromFile(stopWordsFile)
        // getLines() is lazy, so materialise the array before the source is closed.
        try source.getLines().toArray finally source.close()
    }
    setStopWords(customStopWords)
}


// File holding the custom stop-word list.
val stopWordsFile = "stopwords.txt"

// Remover that reads the "words" column and writes the surviving tokens to "filtered".
val remover = new CustomStopWordsRemover(stopWordsFile).setInputCol("words").setOutputCol("filtered")

// Apply the remover and keep only the category label plus the filtered tokens.
val filtered = remover
  .transform(tokenized)
  .select("category", "filtered")

// Preview the first 20 rows (the same default that show() uses).
filtered.show(20)


// val query3 = filtered.filter(array_contains($"filtered", "ever"))
// query3.show()

+--------------------+--------------------+
|            category|            filtered|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|[gift, husband, m...|
|Patio_Lawn_and_Garde|[nice, spreader, ...|
|Patio_Lawn_and_Garde|[metal, base, hos...|
|Patio_Lawn_and_Garde|[part, works, pre...|
|Patio_Lawn_and_Garde|[hose, supposed, ...|
|Patio_Lawn_and_Garde|[tool, works, cut...|
|Patio_Lawn_and_Garde|[typical, usable,...|
|Patio_Lawn_and_Garde|[excited, ditch, ...|
|Patio_Lawn_and_Garde|[purchased, leaf,...|
|Patio_Lawn_and_Garde|[manual, lawnmowe...|
|Patio_Lawn_and_Garde|[good, price, goo...|
|Patio_Lawn_and_Garde|[owned, flowtron,...|
|Patio_Lawn_and_Garde|[similar, family,...|
|Patio_Lawn_and_Garde|[birds, ate, blue...|
|Patio_Lawn_and_Garde|[bought, summer, ...|
|Patio_Lawn_and_Garde|[knew, mouse, bas...|
|Patio_Lawn_and_Garde|[worried, reading...|
|Patio_Lawn_and_Garde|[brand, long, tim...|
|Patio_Lawn_and_Garde|[current, model, ...|
|Patio_Lawn_and_Garde|[expected,

import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
defined class CustomStopWordsRemover
stopWordsFile: String = stopwords.txt
remover: CustomStopWordsRemover = StopWordsRemover: uid=stopWords_af1d7f26a8fb, numStopWords=596, locale=en_US, caseSensitive=false
filtered: org.apache.spark.sql.DataFrame = [category: string, filtered: array<string>]


In [102]:
// A = number of reviews per (category, token) pair that contain the token.
// array_distinct collapses repeats inside a single review before the rows are exploded,
// so each review contributes at most one row per token.
val tokenFreqByCategory = filtered
  .withColumn("token", explode(array_distinct($"filtered")))
  .groupBy($"category", $"token")
  .agg(count(lit(1)).as("A"))
  .orderBy($"A".desc)

// Preview the 20 most frequent (category, token) pairs.
tokenFreqByCategory.show(20)

+----------+----------+----+
|  category|     token|   A|
+----------+----------+----+
|      Book|      good|4886|
|      Book|     great|4834|
|      Book|   reading|3893|
|      Book|      love|3812|
|      Book|      time|3755|
|      Book|    author|3210|
|      Book|characters|3142|
|      Book|   written|2576|
|      Book| recommend|2459|
|      Book|    series|2409|
|      Book|    people|2343|
|      Book|      find|2256|
|Electronic|     great|2223|
|      Book|      make|2194|
|      Book|     found|2104|
|      Book|       put|1944|
|      Book|      work|1941|
|      Book|     loved|1925|
|      Book|       end|1925|
|      Book|     world|1923|
+----------+----------+----+
only showing top 20 rows



tokenFreqByCategory: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 1 more field]


In [136]:
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.functions._

// Step 1: A = number of reviews in a category that contain token t.
// array_distinct makes each review count at most once per token, so A is a
// document count rather than a raw occurrence count.
// No orderBy here: nothing in this cell depends on row order, so a global sort
// would only add work. Append .orderBy(desc("A")) at the call site when inspecting.
val tokenFreqByCategory = filtered
  .withColumn("token", explode(array_distinct($"filtered")))
  .groupBy("category", "token")
  .count()
  .withColumnRenamed("count", "A")

//tokenFreqByCategory.show()

// Step 2: number of reviews containing token t across all categories -> B = this - A
// (left unsorted for the same reason as step 1)
val t_total_number_of_occurrences = tokenFreqByCategory
  .groupBy("token")
  .agg(sum("A").alias("total_number_of_occurrences"))
//t_total_number_of_occurrences.show()

// Step 3: number of reviews by category
val n_docs_by_cat = filtered.groupBy("category").agg(count("*").as("n_docs_by_cat")) // C = this - A

// Step 4: total number of reviews
val n_of_docs = n_docs_by_cat.agg(sum("n_docs_by_cat").alias("N")) // N
// n_of_docs has a single row, so the cross join attaches N to every per-category row
val crossjoin_n_info = n_docs_by_cat.crossJoin(n_of_docs)

import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.functions._
tokenFreqByCategory: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 1 more field]
t_total_number_of_occurrences: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [token: string, total_number_of_occurrences: bigint]
n_docs_by_cat: org.apache.spark.sql.DataFrame = [category: string, n_docs_by_cat: bigint]
n_of_docs: org.apache.spark.sql.DataFrame = [N: bigint]
crossjoin_n_info: org.apache.spark.sql.DataFrame = [category: string, n_docs_by_cat: bigint ... 1 more field]


In [164]:
// Chi-squared score for every (category, token) pair, built from these counts:
//   A = reviews in the category that contain the token
//   B = reviews containing the token minus A
//   C = reviews in the category minus A
//   D = N - A - B - C, the remaining reviews
//   chisquared = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D))
// The result is ranked from the highest score down.
val join = tokenFreqByCategory
  .join(t_total_number_of_occurrences, "token")
  .join(crossjoin_n_info, "category")
  .withColumn("B", $"total_number_of_occurrences" - $"A")
  .withColumn("C", $"n_docs_by_cat" - $"A")
  // D was previously derived twice with an identical expression; once is enough.
  .withColumn("D", $"N" - $"A" - $"B" - $"C")
  .withColumn("chisquared",
    ($"N" * pow($"A" * $"D" - $"B" * $"C", 2)) /
      (($"A" + $"B") * ($"A" + $"C") * ($"B" + $"D") * ($"C" + $"D"))
  )
  .select("category", "token", "chisquared")
  .orderBy(desc("chisquared"))

join.show()

+--------------------+-----------+------------------+
|            category|      token|        chisquared|
+--------------------+-----------+------------------+
|       CDs_and_Vinyl|      music|13083.770814228063|
|       Movies_and_TV|        dvd| 8746.742991979037|
|       CDs_and_Vinyl|     albums| 6705.824908271496|
|       CDs_and_Vinyl|     tracks|6516.9526511813565|
|                Book|    reading| 6184.609280406393|
|                Book|     author| 6181.072186631432|
|       CDs_and_Vinyl|     lyrics|   5856.6404116392|
|       CDs_and_Vinyl|     listen| 5637.042824641422|
|       Movies_and_TV|     acting|  5158.79879531417|
|      Office_Product| cartridges| 5038.569370197361|
|       Movies_and_TV|     movies| 5030.696393981536|
|       CDs_and_Vinyl|       band| 4883.454486644896|
|       CDs_and_Vinyl|       rock|  4836.65107719276|
|                Book| characters| 4818.141989312432|
|       CDs_and_Vinyl|     vocals| 4785.881671742037|
|Clothing_Shoes_an...|comfor

join: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, token: string ... 1 more field]
