In [69]:
// Load the review dev set and build one row per category whose `reviewText`
// holds all of that category's review texts joined by single spaces.
// No `header` read option is set: that option only applies to CSV sources and
// has no effect on the JSON reader.
// NOTE(review): with one row per category, every downstream stage (IDF,
// ChiSqSelector) sees exactly one document per label — confirm this is intended.
val reviewsDF = spark.read
  .format("json")
  .load("hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json")
  .select("category", "reviewText")
  .groupBy("category")
  .agg(concat_ws(" ", collect_list("reviewText")).alias("reviewText"))

// Print the resulting column names and types.
reviewsDF.printSchema()

root
 |-- category: string (nullable = true)
 |-- reviewText: string (nullable = false)



reviewsDF: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [68]:
// Path of the stop-word list that is read when the stop-word remover is built.
val path_to_stopwords: String = "../data/stopwords.txt"
// Number of top features handed to the chi-square selector.
val k: Int = 2000
// Token delimiter regex: any run of characters that are neither ASCII letters
// nor one of `<`, `>`, `^`, `|`.
val split_pattern: String = "[^a-zA-Z<>^|]+"

path_to_stopwords: String = ./data/stopwords.txt
k: Int = 2000
split_pattern: String = [^a-zA-Z<>^|]+


In [46]:
class CustomTokenizer extends Tokenizer with DefaultParamsWritable {
  // Tokenization rule: lower-case the text, split it on `split_pattern`
  // (the splitting pattern from exercise 1) and keep only tokens longer than one character.
  override protected def createTransformFunc: String => Seq[String] = { text =>
    val pieces = text.toLowerCase.split(split_pattern).toSeq
    pieces.filter(token => token.length > 1)
  }
}

// Tokenize the aggregated review text into a "words" column.
val tokenizer = new CustomTokenizer()
  .setInputCol("reviewText")
  .setOutputCol("words")
val tokenized = {
  val withWords = tokenizer.transform(reviewsDF)
  withWords.select("category", "words")
}
tokenized.show()

+--------------------+--------------------+--------------------+
|            category|          reviewText|               words|
+--------------------+--------------------+--------------------+
|Health_and_Person...|The viewing area ...|[the, viewing, ar...|
|        Kindle_Store|This book has two...|[this, book, has,...|
|                Baby|I love it. I'm no...|[love, it, not, t...|
|       Movies_and_TV|Blu-ray disc is d...|[blu, ray, disc, ...|
|Clothing_Shoes_an...|Absolutely beauti...|[absolutely, beau...|
|                Book|Kahlil Gibran wri...|[kahlil, gibran, ...|
|          Automotive|Bought this cable...|[bought, this, ca...|
|  Sports_and_Outdoor|my husband reques...|[my, husband, req...|
|    Apps_for_Android|I spend too much ...|[spend, too, much...|
|Tools_and_Home_Im...|Two things are go...|[two, things, are...|
|Cell_Phones_and_A...|Bought this produ...|[bought, this, pr...|
|       CDs_and_Vinyl|I'm not at all a ...|[not, at, all, he...|
|          Electronic|The

defined class CustomTokenizer
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_2bfc9c743f2c
tokenized: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 1 more field]


In [49]:
/**
 * StopWordsRemover whose stop-word list is read from a text file, one entry per line.
 *
 * NOTE(review): Spark's default copy/load logic instantiates stages through a
 * single-String (uid) constructor; here that argument is treated as a file path —
 * verify before relying on `copy()` or on loading a saved pipeline.
 *
 * @param stopWordsFile path handed to `scala.io.Source.fromFile`
 */
class CustomStopWordsRemover(stopWordsFile: String) extends StopWordsRemover {
    // load and set custom stop words
    // `getLines()` is lazy, so the lines are materialized with `toArray` before
    // the file handle is closed in `finally` (the handle is released even on error).
    val customStopWords: Array[String] = {
        val source = scala.io.Source.fromFile(stopWordsFile)
        try source.getLines().toArray
        finally source.close()
    }
    setStopWords(customStopWords)
}

// Drop the custom stop words from the token lists, producing a "filtered" column.
val stopWordsFile = path_to_stopwords
val remover = new CustomStopWordsRemover(stopWordsFile)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered")
val filtered = {
  val withFiltered = remover.transform(tokenized)
  withFiltered.select("category", "filtered")
}
filtered.show()

+--------------------+--------------------+
|            category|            filtered|
+--------------------+--------------------+
|Health_and_Person...|[viewing, area, s...|
|        Kindle_Store|[separate, lines,...|
|                Baby|[love, journaling...|
|       Movies_and_TV|[blu, ray, disc, ...|
|Clothing_Shoes_an...|[absolutely, beau...|
|                Book|[kahlil, gibran, ...|
|          Automotive|[bought, cable, r...|
|  Sports_and_Outdoor|[husband, request...|
|    Apps_for_Android|[spend, time, spe...|
|Tools_and_Home_Im...|[things, good, li...|
|Cell_Phones_and_A...|[bought, arrival,...|
|       CDs_and_Vinyl|[helen, forrest, ...|
|          Electronic|[power, cable, wo...|
|         Pet_Supplie|[quit, lot, plaqu...|
|Grocery_and_Gourm...|[sooohard, find, ...|
|     Home_and_Kitche|[helpful, dvd, gr...|
|       Toys_and_Game|[learning, bittin...|
|       Digital_Music|[reviewer, cares,...|
|  Musical_Instrument|[love, version, w...|
|Patio_Lawn_and_Garde|[gift, hus

defined class CustomStopWordsRemover
stopWordsFile: String = stopwords.txt
remover: CustomStopWordsRemover = StopWordsRemover: uid=stopWords_b62340107760, numStopWords=596, locale=en_US, caseSensitive=false
filtered: org.apache.spark.sql.DataFrame = [category: string, filtered: array<string>]


In [52]:
// Hash each filtered token list into a numerical "rawFeatures" vector.
val hashingTF = new HashingTF()
  .setInputCol(remover.getOutputCol)
  .setOutputCol("rawFeatures")
val featurizedData = {
  val hashed = hashingTF.transform(filtered)
  hashed.select("category", "rawFeatures")
}
featurizedData.show()

+--------------------+--------------------+
|            category|         rawFeatures|
+--------------------+--------------------+
|Health_and_Person...|(262144,[7,55,71,...|
|        Kindle_Store|(262144,[150,169,...|
|                Baby|(262144,[329,343,...|
|       Movies_and_TV|(262144,[2,6,19,4...|
|Clothing_Shoes_an...|(262144,[149,150,...|
|                Book|(262144,[2,5,6,13...|
|          Automotive|(262144,[329,343,...|
|  Sports_and_Outdoor|(262144,[42,55,13...|
|    Apps_for_Android|(262144,[300,329,...|
|Tools_and_Home_Im...|(262144,[7,55,116...|
|Cell_Phones_and_A...|(262144,[107,150,...|
|       CDs_and_Vinyl|(262144,[2,5,6,19...|
|          Electronic|(262144,[7,13,20,...|
|         Pet_Supplie|(262144,[7,83,329...|
|Grocery_and_Gourm...|(262144,[7,329,34...|
|     Home_and_Kitche|(262144,[7,55,181...|
|       Toys_and_Game|(262144,[7,150,19...|
|       Digital_Music|(262144,[19,50,15...|
|  Musical_Instrument|(262144,[5,150,21...|
|Patio_Lawn_and_Garde|(262144,[6

hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_d82fe258cea1, binary=false, numFeatures=262144
featurizedData: org.apache.spark.sql.DataFrame = [category: string, rawFeatures: vector]


In [53]:
// Fit an IDF model on the raw features and use it to rescale them into "features".
val idf = new IDF()
  .setInputCol(hashingTF.getOutputCol)
  .setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = {
  val weighted = idfModel.transform(featurizedData)
  weighted.select("category", "features")
}
rescaledData.show()


+--------------------+--------------------+
|            category|            features|
+--------------------+--------------------+
|Health_and_Person...|(262144,[7,55,71,...|
|        Kindle_Store|(262144,[150,169,...|
|                Baby|(262144,[329,343,...|
|       Movies_and_TV|(262144,[2,6,19,4...|
|Clothing_Shoes_an...|(262144,[149,150,...|
|                Book|(262144,[2,5,6,13...|
|          Automotive|(262144,[329,343,...|
|  Sports_and_Outdoor|(262144,[42,55,13...|
|    Apps_for_Android|(262144,[300,329,...|
|Tools_and_Home_Im...|(262144,[7,55,116...|
|Cell_Phones_and_A...|(262144,[107,150,...|
|       CDs_and_Vinyl|(262144,[2,5,6,19...|
|          Electronic|(262144,[7,13,20,...|
|         Pet_Supplie|(262144,[7,83,329...|
|Grocery_and_Gourm...|(262144,[7,329,34...|
|     Home_and_Kitche|(262144,[7,55,181...|
|       Toys_and_Game|(262144,[7,150,19...|
|       Digital_Music|(262144,[19,50,15...|
|  Musical_Instrument|(262144,[5,150,21...|
|Patio_Lawn_and_Garde|(262144,[6

idf: org.apache.spark.ml.feature.IDF = idf_8f4fddaa2939
idfModel: org.apache.spark.ml.feature.IDFModel = IDFModel: uid=idf_8f4fddaa2939, numDocs=22, numFeatures=262144
rescaledData: org.apache.spark.sql.DataFrame = [category: string, features: vector]


In [54]:
// Map each category string to a numerical "label" column.
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("label")

val indexed = {
  val indexerModel = indexer.fit(rescaledData)
  indexerModel.transform(rescaledData)
}
indexed.show()

+--------------------+--------------------+-----+
|            category|            features|label|
+--------------------+--------------------+-----+
|Health_and_Person...|(262144,[7,55,71,...| 11.0|
|        Kindle_Store|(262144,[150,169,...| 13.0|
|                Baby|(262144,[329,343,...|  2.0|
|       Movies_and_TV|(262144,[2,6,19,4...| 14.0|
|Clothing_Shoes_an...|(262144,[149,150,...|  7.0|
|Tools_and_Home_Im...|(262144,[7,55,116...| 20.0|
|Cell_Phones_and_A...|(262144,[107,150,...|  6.0|
|                Book|(262144,[2,5,6,13...|  4.0|
|          Automotive|(262144,[329,343,...|  1.0|
|  Sports_and_Outdoor|(262144,[42,55,13...| 19.0|
|    Apps_for_Android|(262144,[300,329,...|  0.0|
|       CDs_and_Vinyl|(262144,[2,5,6,19...|  5.0|
|       Toys_and_Game|(262144,[7,150,19...| 21.0|
|       Digital_Music|(262144,[19,50,15...|  8.0|
|  Musical_Instrument|(262144,[5,150,21...| 15.0|
|Patio_Lawn_and_Garde|(262144,[6,55,298...| 17.0|
|              Beauty|(262144,[7,128,36...|  3.0|


indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_a4d08d0ef358
indexed: org.apache.spark.sql.DataFrame = [category: string, features: vector ... 1 more field]


In [55]:
// Select the top `k` features with a chi-square selector fitted against the label column.
// The label column is taken from the indexer (as in the pipeline definition)
// rather than repeating the "label" literal, so the two cannot drift apart.
val selector = new ChiSqSelector()
  .setNumTopFeatures(k)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol(indexer.getOutputCol)
  .setOutputCol("selected_features")

val result = selector.fit(indexed).transform(indexed)
result.show()

+--------------------+--------------------+-----+--------------------+
|            category|            features|label|   selected_features|
+--------------------+--------------------+-----+--------------------+
|Health_and_Person...|(262144,[7,55,71,...| 11.0|(2000,[0,1,2,3,4,...|
|        Kindle_Store|(262144,[150,169,...| 13.0|(2000,[0,1,2,3,4,...|
|                Baby|(262144,[329,343,...|  2.0|(2000,[0,1,3,4,5,...|
|       Movies_and_TV|(262144,[2,6,19,4...| 14.0|(2000,[0,2,3,4,5,...|
|Clothing_Shoes_an...|(262144,[149,150,...|  7.0|(2000,[0,1,2,3,4,...|
|                Book|(262144,[2,5,6,13...|  4.0|(2000,[0,1,2,3,4,...|
|          Automotive|(262144,[329,343,...|  1.0|(2000,[0,1,2,3,4,...|
|  Sports_and_Outdoor|(262144,[42,55,13...| 19.0|(2000,[0,1,2,3,4,...|
|    Apps_for_Android|(262144,[300,329,...|  0.0|(2000,[0,5,6,7,9,...|
|Tools_and_Home_Im...|(262144,[7,55,116...| 20.0|(2000,[0,1,2,3,4,...|
|Cell_Phones_and_A...|(262144,[107,150,...|  6.0|(2000,[0,1,2,3,4,...|
|     

selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_d7674da3b460
result: org.apache.spark.sql.DataFrame = [category: string, features: vector ... 2 more fields]


In [56]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, IDF, Tokenizer, StopWordsRemover,ChiSqSelector}


/**
 * StopWordsRemover whose stop-word list is read from a text file, one entry per line.
 *
 * NOTE(review): Spark's default copy/load logic instantiates stages through a
 * single-String (uid) constructor; here that argument is treated as a file path —
 * verify before relying on `copy()` or on loading a saved pipeline.
 *
 * @param stopWordsFile path handed to `scala.io.Source.fromFile`
 */
class CustomStopWordsRemover(stopWordsFile: String) extends StopWordsRemover {
    // load and set custom stop words
    // `getLines()` is lazy, so the lines are materialized with `toArray` before
    // the file handle is closed in `finally` (the handle is released even on error).
    val customStopWords: Array[String] = {
        val source = scala.io.Source.fromFile(stopWordsFile)
        try source.getLines().toArray
        finally source.close()
    }
    setStopWords(customStopWords)
}

class CustomTokenizer extends Tokenizer with DefaultParamsWritable {
  // Tokenization rule: lower-case the text, split it on `split_pattern`
  // (the splitting pattern from exercise 1) and keep only tokens longer than one character.
  override protected def createTransformFunc: String => Seq[String] = { text =>
    val pieces = text.toLowerCase.split(split_pattern).toSeq
    pieces.filter(token => token.length > 1)
  }
}


// Stage 1 — tokenize "reviewText" into "words".
val tokenizer = new CustomTokenizer()
  .setInputCol("reviewText")
  .setOutputCol("words")

// Stage 2 — remove the custom stop words.
val stopWordsFile = path_to_stopwords
val remover = new CustomStopWordsRemover(stopWordsFile)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered")

// Stage 3 — turn the remaining words into numerical features.
val hashingTF = new HashingTF()
  .setInputCol(remover.getOutputCol)
  .setOutputCol("rawFeatures")

// Stage 4 — rescale the raw features with IDF.
val idf = new IDF()
  .setInputCol(hashingTF.getOutputCol)
  .setOutputCol("features")

// Stage 5 — encode the category as a numerical label.
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("label")

// Stage 6 — chi-square selection of the top `k` features.
val selector = new ChiSqSelector()
  .setNumTopFeatures(k)
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol(indexer.getOutputCol)
  .setOutputCol("selected_features")

// Chain all stages, in the order above, into a single pipeline.
val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, hashingTF, idf, indexer, selector)
)

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{StringIndexer, HashingTF, IDF, Tokenizer, StopWordsRemover, ChiSqSelector}
defined class CustomStopWordsRemover
defined class CustomTokenizer
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_4dea812d6995
stopWordsFile: String = stopwords.txt
remover: CustomStopWordsRemover = StopWordsRemover: uid=stopWords_8074a1f7ea90, numStopWords=596, locale=en_US, caseSensitive=false
hashingTF: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_cf179d3ff0cf, binary=false, numFeatures=262144
idf: org.apache.spark.ml.feature.IDF = idf_ad649cfaba48
indexer: org.apache.spark.ml.featu...


In [57]:
// Fit all pipeline stages on the aggregated reviews, then apply the fitted
// pipeline to the same data and keep only the category, label and selected features.
val pipeLineModel = pipeline.fit(reviewsDF)
val df = {
  val transformed = pipeLineModel.transform(reviewsDF)
  transformed.select("category", "label", "selected_features")
}
df.show()

+--------------------+-----+--------------------+
|            category|label|   selected_features|
+--------------------+-----+--------------------+
|Health_and_Person...| 11.0|(2000,[0,1,2,3,4,...|
|        Kindle_Store| 13.0|(2000,[0,1,2,3,4,...|
|                Baby|  2.0|(2000,[0,1,3,4,5,...|
|       Movies_and_TV| 14.0|(2000,[0,2,3,4,5,...|
|Clothing_Shoes_an...|  7.0|(2000,[0,1,2,3,4,...|
|                Book|  4.0|(2000,[0,1,2,3,4,...|
|          Automotive|  1.0|(2000,[0,1,2,3,4,...|
|  Sports_and_Outdoor| 19.0|(2000,[0,1,2,3,4,...|
|    Apps_for_Android|  0.0|(2000,[0,5,6,7,9,...|
|Tools_and_Home_Im...| 20.0|(2000,[0,1,2,3,4,...|
|Cell_Phones_and_A...|  6.0|(2000,[0,1,2,3,4,...|
|       CDs_and_Vinyl|  5.0|(2000,[0,2,3,4,5,...|
|          Electronic|  9.0|(2000,[0,1,2,3,4,...|
|         Pet_Supplie| 18.0|(2000,[0,1,2,3,4,...|
|Grocery_and_Gourm...| 10.0|(2000,[0,1,3,5,6,...|
|     Home_and_Kitche| 12.0|(2000,[0,1,2,3,4,...|
|       Toys_and_Game| 21.0|(2000,[0,1,2,3,4,...|


pipeLineModel: org.apache.spark.ml.PipelineModel = pipeline_1020592d71d3
df: org.apache.spark.sql.DataFrame = [category: string, label: double ... 1 more field]


In [65]:
// Persist the unfit pipeline under "pipeline", overwriting any existing save.
pipeline.write
  .overwrite()
  .save("pipeline")