# DIC2020 - A2.2 SparkML Pipeline to vectorize reviews and find top 4000 terms over all categories

## Imports

In [1]:
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, ChiSqSelector, StringIndexer, ChiSqSelectorModel}
import java.io.PrintWriter


Intitializing Scala interpreter ...

Spark Web UI available at http://c100.local:8088/proxy/application_1587827373944_4796
SparkContext available as 'sc' (version = 2.4.0-cdh6.3.2, master = yarn, app id = application_1587827373944_4796)
SparkSession available as 'spark'


import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, ChiSqSelector, StringIndexer, ChiSqSelectorModel}
import java.io.PrintWriter


## Constants

In [2]:
// Location of the review dataset that is loaded as JSON.
val INPUT: String = "hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json"
// Regex character class used as the token separator: punctuation, brackets,
// quotes, a handful of symbols, whitespace (\s) and digits (\d).
val DELIMS: String = "[.!?,;:()\\[\\]{}\\-_\"\\`~#&*%$\\/\\s\\d]"
// Location of the stop-word list that is loaded as plain text.
val STOPWORDS: String = "hdfs:///user/e11944050/stopwords.txt"
// Number of features the chi-squared selector is asked to keep.
val TOP_N_FEATURES: Int = 4000

INPUT: String = hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json
DELIMS: String = [.!?,;:()\[\]{}\-_"\`~#&*%$\/\s\d]
STOPWORDS: String = hdfs:///user/e11944050/stopwords.txt
TOP_N_FEATURES: Int = 4000


## Load JSON data from file and print schema

In [3]:
// Load the review dataset at INPUT as JSON into a DataFrame.
val amazonReviewDfFull = spark.read.format("json").load(INPUT)

// Print the column names and types of the loaded DataFrame.
amazonReviewDfFull.printSchema()

root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



amazonReviewDfFull: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 8 more fields]


## Only select reviewText and category columns

In [4]:
// The only two columns the pipeline reads: the review body and its category label.
val columnSelection = Seq("reviewText", "category")
// Resolve each name against the full DataFrame and keep just those columns.
val reviewCategoryDf = amazonReviewDfFull.select(columnSelection.map(name => amazonReviewDfFull(name)): _*)

columnSelection: Seq[String] = List(reviewText, category)
reviewCategoryDf: org.apache.spark.sql.DataFrame = [reviewText: string, category: string]


## Load stopwords

In [5]:
// Collect the stop-word file onto the driver as a local array;
// textFile already yields a Dataset[String], one element per line.
val stopWords: Array[String] = spark.read.textFile(STOPWORDS).collect()

stopWords: Array[String] = Array(a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, conta...

## Define pipeline

In [6]:
// Tokenizer: lower-cases "reviewText" and splits it using the DELIMS pattern.
val regexTokenizer = new RegexTokenizer()
  .setPattern(DELIMS)
  .setToLowercase(true)
  .setInputCol("reviewText")
  .setOutputCol("terms")

// Stop-word filter: removes every entry of stopWords from the token list.
val remover = new StopWordsRemover()
  .setStopWords(stopWords)
  .setInputCol("terms")
  .setOutputCol("termsFiltered")

// Term counts per review. The minimum document frequency of 100 keeps the
// vocabulary small; without it the chi-squared selection takes too long.
val countVectorizer = new CountVectorizer()
  .setMinDF(100)
  .setInputCol("termsFiltered")
  .setOutputCol("rawFeatures")

// Re-weights the raw term counts by inverse document frequency.
val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("features")

// Encodes the string category as a numeric label column for the selector.
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

// Chi-squared feature selection: keeps TOP_N_FEATURES features with respect to the category label.
val selector = new ChiSqSelector()
  .setFeaturesCol("features")
  .setLabelCol("categoryIndex")
  .setOutputCol("selectedFeatures")
  .setNumTopFeatures(TOP_N_FEATURES)

regexTokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_a31611662901
remover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_8fd8b138f748
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_633dfe088131
idf: org.apache.spark.ml.feature.IDF = idf_5da6e57c0879
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_c0e63a48b944
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_ac1e766b31c7


## Fit pipeline on review data

In [7]:
// Assemble the six stages. Order matters: each stage consumes a column
// produced by an earlier stage (or present in the input DataFrame).
val pipeline: Pipeline = new Pipeline()
  .setStages(Array(regexTokenizer, remover, countVectorizer, idf, indexer, selector))

// Fit all stages on the (reviewText, category) DataFrame.
val model: PipelineModel = pipeline.fit(reviewCategoryDf)

pipeline: org.apache.spark.ml.Pipeline = pipeline_07b499317477
model: org.apache.spark.ml.PipelineModel = pipeline_07b499317477


## Get the top 4000 terms selected by the pipeline and build a space-separated string

In [8]:
// Look the fitted stages up by type instead of by hard-coded position, so that
// reordering or inserting pipeline stages cannot select (or mis-cast) the wrong stage.

// Indices of the features kept by the chi-squared selector.
val selectedFeatures = model.stages
  .collectFirst { case chiSq: ChiSqSelectorModel => chiSq.selectedFeatures }
  .getOrElse(throw new IllegalStateException("Fitted pipeline contains no ChiSqSelectorModel stage"))

// Vocabulary learned by the count vectorizer; a feature index is a position in this array.
val vocabulary = model.stages
  .collectFirst { case cv: CountVectorizerModel => cv.vocabulary }
  .getOrElse(throw new IllegalStateException("Fitted pipeline contains no CountVectorizerModel stage"))

// Translate the selected indices into terms, sort them alphabetically
// and join them into one space-separated string.
val selectedFeaturesString = selectedFeatures
  .map(index => vocabulary(index))
  .sorted
  .mkString(" ")

// Echo the result as the cell's value.
selectedFeaturesString

selectedFeatures: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 179, 180, 181, 182, 183, 184, 185, 186, 187, 1...

## Save top 4000 terms to file

In [10]:
// Write the space-separated term list to the local file output_ds.txt.
// The charset is fixed to UTF-8 so the output does not depend on the JVM's
// platform default encoding.
val termWriter = new PrintWriter("output_ds.txt", "UTF-8")
try {
  termWriter.write(selectedFeaturesString)
  // PrintWriter never throws on I/O failure; checkError() flushes and reports it.
  if (termWriter.checkError()) {
    throw new java.io.IOException("Failed to write selected terms to output_ds.txt")
  }
} finally {
  // Release the file handle even if the write fails.
  termWriter.close()
}

res2: java.io.PrintWriter = $anon$1@79bc16e6
