In [1]:
import org.apache.spark.sql.SparkSession;

//import statistics.functions._;

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.types._;
import org.apache.spark.sql.functions._;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.Model;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
import org.apache.spark.ml.feature.Binarizer;
import org.apache.spark.ml.feature.{RegexTokenizer, NGram};
import org.apache.spark.ml.feature.{HashingTF, IDF};
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder};

Initializing Scala interpreter ...

Spark Web UI available at http://pranav-pc:4040
SparkContext available as 'sc' (version = 3.0.0-preview2, master = local[*], app id = local-1580763318127)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.Model
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.ml.feature.{RegexTokenizer, NGram}
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}


In [2]:
// Location of the tab-separated input file on HDFS.
val path: String = "hdfs://localhost:9000/TextMining/tokens/part-00000"

// Explicit schema for the four columns of the input file.
// "rate" is declared as Double so that Spark can binarize it later on.
val original_schema = StructType(Seq(
  StructField("product", StringType,  nullable = true),
  StructField("votes",   IntegerType, nullable = true),
  StructField("rate",    DoubleType,  nullable = true),
  StructField("text",    StringType,  nullable = true)
))

// Read the file with the schema above and discard every row that contains a null.
val original_data: DataFrame = spark.read
  .option("delimiter", "\t")
  .schema(original_schema)
  .csv(path)
  .na
  .drop()

path: String = hdfs://localhost:9000/TextMining/tokens/part-00000
original_schema: org.apache.spark.sql.types.StructType = StructType(StructField(product,StringType,true), StructField(votes,IntegerType,true), StructField(rate,DoubleType,true), StructField(text,StringType,true))
original_data: org.apache.spark.sql.DataFrame = [product: string, votes: int ... 2 more fields]


In [3]:
// Quick visual check of the loaded data: print the first five rows.
original_data.show(numRows = 5)

+----------+-----+----+--------------------+
|   product|votes|rate|                text|
+----------+-----+----+--------------------+
|B01CT5KLIY|    1| 4.0|love idea two wom...|
|B01CT5KQMA|    1| 5.0|must veri well gr...|
|B01CT5KQMA|    1| 5.0|charact likabl pl...|
|B01CT6LRK4|    1| 4.0|abl final sit sur...|
|B01CT6LRK4|    1| 2.0|few basic tip say...|
+----------+-----+----+--------------------+
only showing top 5 rows



In [4]:
// Derive the binary "label" column from the numeric "rate" column (threshold 3.5).
val binarizer: Binarizer = new Binarizer()
  .setThreshold(3.5)
  .setInputCol("rate")
  .setOutputCol("label")

// Split the review text into tokens using the "\\W" pattern ...
val tokenizer: RegexTokenizer = new RegexTokenizer()
  .setPattern("\\W")
  .setInputCol("text")
  .setOutputCol("tokens")

// ... and combine those tokens into n-grams (n is tuned by the grid below).
val ngrams: NGram = new NGram()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("n-grams")

// TF-IDF weighting: hashed term frequencies first, then the IDF rescaling.
val tf: HashingTF = new HashingTF()
  .setInputCol(ngrams.getOutputCol)
  .setOutputCol("tf")

val idf: IDF = new IDF()
  .setMinDocFreq(3)
  .setInputCol(tf.getOutputCol)
  .setOutputCol("tf-idf")

// Classifier: logistic regression on the TF-IDF features against the binarized label.
val classifierMod: LogisticRegression = new LogisticRegression()
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol(binarizer.getOutputCol)
  .setMaxIter(10)

// Full chain every row goes through, from the raw columns to the prediction.
val pipeline: Pipeline = new Pipeline()
  .setStages(Array(binarizer, tokenizer, ngrams, tf, idf, classifierMod))

// Hyperparameter search space: 3 regularization strengths x 3 n-gram sizes = 9 candidates.
val paramGrid = new ParamGridBuilder()
  .addGrid(classifierMod.regParam, Array(0.01, 0.05, 0.1))
  .addGrid(ngrams.n, Array(1, 2, 3))
  .build()

// Score every candidate with a 3-fold cross validation, using the evaluator's default settings.
val cv: CrossValidator = new CrossValidator()
  .setNumFolds(3)
  .setEstimator(pipeline)
  .setEstimatorParamMaps(paramGrid)
  .setEvaluator(new BinaryClassificationEvaluator())

binarizer: org.apache.spark.ml.feature.Binarizer = Binarizer: uid=binarizer_1f2ecfd8bb5d
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_4eca11e23721
ngrams: org.apache.spark.ml.feature.NGram = NGram: uid=ngram_08dd240d9a73, n=2
tf: org.apache.spark.ml.feature.HashingTF = HashingTF: uid=hashingTF_1ed4df629fa6, binary=false, numFeatures=262144
idf: org.apache.spark.ml.feature.IDF = idf_6db681b5f222
classifierMod: org.apache.spark.ml.classification.LogisticRegression = logreg_9cdc54499216
pipeline: org.apache.spark.ml.Pipeline = pipeline_0234d6e5c92e
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	ngram_08dd240d9a73-n: 1,
	logreg_9cdc54499216-regParam: 0.01
}, {
	ngram_08dd240d9a73-n: 1,
	logreg_9cdc54499216-regParam: 0.05
}, {
	ngram_08dd240d9a73-n: 1,
	l...


In [5]:
// Run the cross-validated hyperparameter search on the loaded data.
println("Training... ")
val model = cv.fit(original_data)
println("done!")

// Print each hyperparameter combination followed by its averaged evaluation metric.
model.getEstimatorParamMaps.zip(model.avgMetrics).foreach { case (params, avgMetric) =>
  println("\n\n")
  println(params)
  println(avgMetric)
}


Training... 
done!



{
	ngram_08dd240d9a73-n: 1,
	logreg_9cdc54499216-regParam: 0.01
}
0.894217505803825



{
	ngram_08dd240d9a73-n: 1,
	logreg_9cdc54499216-regParam: 0.05
}
0.9034031006521369



{
	ngram_08dd240d9a73-n: 1,
	logreg_9cdc54499216-regParam: 0.1
}
0.905312185591742



{
	ngram_08dd240d9a73-n: 2,
	logreg_9cdc54499216-regParam: 0.01
}
0.8443155195102902



{
	ngram_08dd240d9a73-n: 2,
	logreg_9cdc54499216-regParam: 0.05
}
0.8644238243016412



{
	ngram_08dd240d9a73-n: 2,
	logreg_9cdc54499216-regParam: 0.1
}
0.872029019745533



{
	ngram_08dd240d9a73-n: 3,
	logreg_9cdc54499216-regParam: 0.01
}
0.6673848921888594



{
	ngram_08dd240d9a73-n: 3,
	logreg_9cdc54499216-regParam: 0.05
}
0.6837430757007262



{
	ngram_08dd240d9a73-n: 3,
	logreg_9cdc54499216-regParam: 0.1
}
0.6908928013842282


model: org.apache.spark.ml.tuning.CrossValidatorModel = CrossValidatorModel: uid=cv_067743676650, bestModel=pipeline_0234d6e5c92e, numFolds=3
