# DIC2020 - A2.3 SparkML Pipeline to train category prediction model

## Imports

In [1]:
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, ChiSqSelector, StringIndexer, ChiSqSelectorModel, Normalizer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.classification.{OneVsRest, LinearSVC}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

Intitializing Scala interpreter ...

Spark Web UI available at http://c100.local:8088/proxy/application_1587827373944_5391
SparkContext available as 'sc' (version = 2.4.0-cdh6.3.2, master = yarn, app id = application_1587827373944_5391)
SparkSession available as 'spark'


import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF, ChiSqSelector, StringIndexer, ChiSqSelectorModel, Normalizer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.classification.{OneVsRest, LinearSVC}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


## Constants

In [2]:
// HDFS location of the Amazon review dataset (JSON) that the notebook loads.
val INPUT: String = "hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json"
// Regex character class used as the tokenizer pattern: punctuation, brackets,
// assorted symbols, whitespace (\s) and digits (\d).
val DELIMS: String = "[.!?,;:()\\[\\]{}\\-_\"\\`~#&*%$\\/\\s\\d]"
// HDFS location of the stop word list (plain text file).
val STOPWORDS: String = "hdfs:///user/e11944050/stopwords.txt"
// Upper bound on the number of features kept by the chi-square selector.
val TOP_N_FEATURES: Int = 4000
// Fixed seed so that the random train/test split is reproducible.
val RANDOM_SEED: Int = 42

INPUT: String = hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json
DELIMS: String = [.!?,;:()\[\]{}\-_"\`~#&*%$\/\s\d]
STOPWORDS: String = hdfs:///user/e11944050/stopwords.txt
TOP_N_FEATURES: Int = 4000
RANDOM_SEED: Int = 42


## Load JSON data from file and print schema

In [3]:
// Read the review file at INPUT through the JSON data source; the schema is inferred by Spark.
val amazonReviewDfFull = spark.read.format("json").load(INPUT)
// Print the inferred schema as a tree for a quick sanity check of the columns.
amazonReviewDfFull.printSchema()

root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



amazonReviewDfFull: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 8 more fields]


## Select only the reviewText and category columns

In [4]:
// Names of the only two columns the model needs: the raw review text and its category.
val columnSelection = Seq("reviewText", "category")
// Project the full DataFrame down to those columns using the name-based select overload.
val reviewCategoryDf = amazonReviewDfFull.select(columnSelection.head, columnSelection.tail: _*)

columnSelection: Seq[String] = List(reviewText, category)
reviewCategoryDf: org.apache.spark.sql.DataFrame = [reviewText: string, category: string]


## Load stopwords

In [5]:
// Read the stop word file line by line (textFile already yields a Dataset[String])
// and pull all lines to the driver as a local array.
val stopWords: Array[String] = spark.read.textFile(STOPWORDS).collect()

stopWords: Array[String] = Array(a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, conta...

## Define preprocessing pipeline

In [6]:
// Tokenizer: lower-cases "reviewText" and tokenizes it with DELIMS as the pattern
// (all other tokenizer parameters stay at their defaults); writes "terms".
val regexTokenizer = new RegexTokenizer()
  .setPattern(DELIMS)
  .setToLowercase(true)
  .setInputCol("reviewText")
  .setOutputCol("terms")

// Stop word filter: removes every entry of the loaded stopWords array from "terms".
val remover = new StopWordsRemover()
  .setStopWords(stopWords)
  .setInputCol("terms")
  .setOutputCol("termsFiltered")

// Term counts per review. The minDF threshold of 100 shrinks the vocabulary,
// because chi-square selection is otherwise too slow.
val countVectorizer = new CountVectorizer()
  .setMinDF(100)
  .setInputCol("termsFiltered")
  .setOutputCol("rawFeatures")

// Re-weights the raw term counts by inverse document frequency.
val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("idfFeatures")

// Encodes the string "category" column as a numeric "label" column.
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("label")

// Chi-square feature selection against "label", keeping at most TOP_N_FEATURES features.
val selector = new ChiSqSelector()
  .setFeaturesCol("idfFeatures")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
  .setNumTopFeatures(TOP_N_FEATURES)

// Scales each selected feature vector to unit length (p = 2.0, i.e. the L2 norm).
val normalizer = new Normalizer()
  .setP(2.0)
  .setInputCol("selectedFeatures")
  .setOutputCol("features")

regexTokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_3401ac26f392
remover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_b515e2736645
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_2785c6a41ed0
idf: org.apache.spark.ml.feature.IDF = idf_2c7d0ae1851a
indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_e2c02fcc571d
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_a72fde60fa69
normalizer: org.apache.spark.ml.feature.Normalizer = normalizer_bfa5c3b1614f


## Fit preprocessing pipeline and transform data

In [7]:
// Assemble the preprocessing stages in execution order:
// tokenize -> remove stop words -> count terms -> IDF -> index labels -> chi-square select -> normalize.
val prpPipeline = new Pipeline()
  .setStages(Array(regexTokenizer, remover, countVectorizer, idf, indexer, selector, normalizer))

// Fit every estimator stage (vocabulary, IDF weights, label index, chi-square selection).
// NOTE(review): the fit uses the complete reviewCategoryDf, while the train/test split is
// only taken later on the transformed data. The fitted stages have therefore seen the test
// rows (including their labels, via the selector), so the reported test score is likely
// optimistic. Consider splitting first and fitting on the training part only.
val prpModel = prpPipeline.fit(reviewCategoryDf)

prpPipeline: org.apache.spark.ml.Pipeline = pipeline_1d117ce2781c
prpModel: org.apache.spark.ml.PipelineModel = pipeline_1d117ce2781c


In [8]:
// Run the fitted preprocessing pipeline over the same DataFrame it was fitted on;
// the result keeps the input columns and adds one output column per stage.
val prpData = prpModel.transform(reviewCategoryDf)

prpData: org.apache.spark.sql.DataFrame = [reviewText: string, category: string ... 7 more fields]


## Split dataset into train and test sets

In [9]:
// Randomly split the preprocessed rows with weights 0.8 (train) and 0.2 (test),
// seeded with RANDOM_SEED so the split is reproducible.
// NOTE(review): this split is taken after the preprocessing pipeline was fitted on all rows,
// so the test part is not fully unseen by the feature extraction stages.
val Array(train, test) = prpData.randomSplit(Array(0.8, 0.2), seed=RANDOM_SEED)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [reviewText: string, category: string ... 7 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [reviewText: string, category: string ... 7 more fields]


## Define classifier pipeline

In [10]:
// Classifier setup: a linear SVM wrapped in One-vs-Rest, tuned by a grid search that is
// scored with the F1 metric on a single train/validation split.

// Instantiate the base classifier.
val classifier = new LinearSVC()
                
// Instantiate the One-vs-Rest classifier around the base classifier.
val ovr = new OneVsRest().setClassifier(classifier)

// We use a ParamGridBuilder to construct a grid of parameters to search over.
// TrainValidationSplit will try all combinations of values (3 x 2 x 2 = 12) and determine
// the best model using the evaluator.
val paramGrid = new ParamGridBuilder()
                .addGrid(classifier.regParam, Array(0.01, 0.1, 0.2)) // try 3 settings for regularization
                .addGrid(classifier.maxIter, Array(5, 10)) // try 2 settings for max iterations
                .addGrid(classifier.standardization, Array(true, false)) // try with and without standardization
                .build()

val trainValidationSplit = new TrainValidationSplit()
                            .setEstimator(ovr)
                            .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("f1"))
                            .setEstimatorParamMaps(paramGrid) // the parameter combinations to evaluate
                            .setTrainRatio(0.8) // 80% of the data will be used for training and the remaining 20% for validation.
                            .setParallelism(2) // Evaluate up to 2 parameter settings in parallel

classifier: org.apache.spark.ml.classification.LinearSVC = linearsvc_2db3ee075a5b
ovr: org.apache.spark.ml.classification.OneVsRest = oneVsRest_750e29cf94ce
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_2db3ee075a5b-maxIter: 5,
	linearsvc_2db3ee075a5b-regParam: 0.01,
	linearsvc_2db3ee075a5b-standardization: true
}, {
	linearsvc_2db3ee075a5b-maxIter: 5,
	linearsvc_2db3ee075a5b-regParam: 0.1,
	linearsvc_2db3ee075a5b-standardization: true
}, {
	linearsvc_2db3ee075a5b-maxIter: 5,
	linearsvc_2db3ee075a5b-regParam: 0.2,
	linearsvc_2db3ee075a5b-standardization: true
}, {
	linearsvc_2db3ee075a5b-maxIter: 5,
	linearsvc_2db3ee075a5b-regParam: 0.01,
	linearsvc_2db3ee075a5b-standardization: false
}, {
	linearsvc_2db3ee075a5b-maxIter: 5,
	linearsvc_2db3ee075a5b-regParam: ...

## Fit model pipeline on train data

In [11]:
// Run the grid search on the training split: each parameter combination in paramGrid is
// trained and scored, and the resulting TrainValidationSplitModel holds the selected model.
val predictionModel = trainValidationSplit.fit(train)

predictionModel: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_e73d8808c5aa


## Transform test data and show some predictions

In [12]:
// Score the test split with the tuned model; this adds the prediction column(s).
val predictions = predictionModel.transform(test)

// Show the first rows side by side: feature vector, true label index, predicted label index.
predictions.select("features", "label", "prediction").show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(3608,[0,1,3,4,8,...|  0.0|       0.0|
|(3608,[0,1,2,5,10...|  0.0|       0.0|
|(3608,[0,1,2,5,16...|  0.0|       0.0|
|(3608,[0,1,2,5,16...|  0.0|       0.0|
|(3608,[0,1,2,4,8,...|  0.0|       0.0|
|(3608,[0,1,2,4,5,...|  0.0|       0.0|
|(3608,[0,1,2,5,6,...|  0.0|       0.0|
|(3608,[1,3,5,14,3...|  0.0|       0.0|
|(3608,[0,1,2,4,23...|  0.0|       0.0|
|(3608,[0,1,2,4,5,...|  0.0|       0.0|
|(3608,[0,2,22,47,...|  0.0|       0.0|
|(3608,[1,2,12,14,...|  0.0|       0.0|
|(3608,[0,2,4,8,54...|  0.0|       0.0|
|(3608,[0,1,2,6,8,...|  0.0|       0.0|
|(3608,[0,1,2,5,8,...|  0.0|       0.0|
|(3608,[0,1,2,3,6,...|  0.0|       0.0|
|(3608,[0,1,2,5,7,...|  0.0|       0.0|
|(3608,[0,1,2,5,7,...|  0.0|       0.0|
|(3608,[0,1,4,189,...| 18.0|      12.0|
|(3608,[1,67,358,5...| 10.0|      10.0|
+--------------------+-----+----------+
only showing top 20 rows



predictions: org.apache.spark.sql.DataFrame = [reviewText: string, category: string ... 9 more fields]


## Evaluate predictions over test data with F1 score

In [13]:
// Evaluator configured for the "f1" metric; label and prediction column names are left
// at the evaluator's defaults.
val evaluator: MulticlassClassificationEvaluator =
  new MulticlassClassificationEvaluator().setMetricName("f1")
// F1 score of the tuned model on the test split predictions.
val f1: Double = evaluator.evaluate(predictions)

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_6b5479353acb
f1: Double = 0.6562831535787289
