In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
      

### Calculate a basic cleaned stream

In [None]:
var rawdf = (
    spark.read.option("inferSchema", "true")
    .json("../data/winemag-data-130k-v2.json")
    .select(trim(lower($"variety")).alias("variety"), trim(lower($"description")).alias("description"))
    .dropDuplicates(Seq("description"))
    .filter($"variety".isNotNull)
    .filter(not($"variety".contains("blend")))
    .filter(not($"variety".contains("red")))
    .filter(not($"variety".contains("white")))
    .select($"variety", regexp_replace($"description", $"variety", lit("")).alias("description"))
    .select($"variety", regexp_replace($"description", "[^\\p{L}\\p{Nd}[0-9]+]+", " ").alias("description"))
    .select($"variety", trim(lower($"description")).alias("description"))
    .cache
)

rawdf.show(3)

### Select the varieties with > 3000 reviews

In [25]:
val df = (rawdf
 .groupBy($"variety")
 .agg(count("variety").alias("count"))
 .where("count > 3000")
 .join(rawdf, Seq("variety"))
 .orderBy("variety")
 .select($"variety", $"description")).cache

df.show(3)


+------------------+--------------------+
|           variety|         description|
+------------------+--------------------+
|cabernet sauvignon|jammy aromas figg...|
|cabernet sauvignon|fiery and clipped...|
|cabernet sauvignon|here s an importa...|
|cabernet sauvignon|aged in a combina...|
|cabernet sauvignon|encased in a ligh...|
|cabernet sauvignon|from estate grown...|
|cabernet sauvignon|hard to appreciat...|
|cabernet sauvignon|a little rugged a...|
|cabernet sauvignon|a touch of overri...|
|cabernet sauvignon|aromas of butters...|
|cabernet sauvignon|comes down on the...|
|cabernet sauvignon|for those looking...|
|cabernet sauvignon|four plus years h...|
|cabernet sauvignon|from chilean wine...|
|cabernet sauvignon|hailing from diff...|
|cabernet sauvignon|varietal this win...|
|cabernet sauvignon|a good everyday c...|
|cabernet sauvignon|a portion of the ...|
|cabernet sauvignon|a stunning effort...|
|cabernet sauvignon|arcane s cab is s...|
+------------------+--------------

In [26]:
import org.apache.spark.ml.feature.StopWordsRemover
import scala.collection.mutable.WrappedArray


val varieties = (df
    .select("variety")
    .distinct())

val tokenizer2 = (
    new Tokenizer()             
    .setInputCol("variety")
    .setOutputCol("variety_splits")
)

val varietySplits = tokenizer2.transform(varieties).cache

val splitVarieties = (varietySplits
             .select("variety_splits")
             .collect()
             .map(_.toSeq.asInstanceOf[WrappedArray[WrappedArray[String]]])
             .flatMap(_.toSeq)
             .flatMap(_.toSeq))

val wordsDf = tokenizer.transform(onlyTop).select($"variety", $"words")

val remover = new StopWordsRemover().setInputCol("words").setOutputCol("filteredWords")

remover.setStopWords((splitVarieties.toList:::remover.getStopWords.toList).toSet.toArray)

val noStopWordsDf = remover.transform(wordsDf).select($"variety", $"filteredWords".alias("words"))

In [19]:
import org.apache.spark.ml.feature.CountVectorizer

val countVectorizer = new CountVectorizer().setInputCol("words").setOutputCol("features")
val countVectorizerModel = countVectorizer.fit(noStopWordsDf)
val countVectorizerDF = countVectorizerModel.transform(noStopWordsDf)

countVectorizerDF.show(5,true)

+------------------+--------------------+--------------------+
|           variety|               words|            features|
+------------------+--------------------+--------------------+
|cabernet sauvignon|[stunning, effort...|(20065,[0,3,6,8,1...|
|cabernet sauvignon|[little, rugged, ...|(20065,[1,23,31,8...|
|cabernet sauvignon|[portion, sourced...|(20065,[1,2,14,58...|
|cabernet sauvignon|[varietal, wine, ...|(20065,[0,1,4,6,7...|
|cabernet sauvignon|[good, everyday, ...|(20065,[1,7,10,12...|
+------------------+--------------------+--------------------+
only showing top 5 rows



In [7]:
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Normalizer

val indexer = (
    new StringIndexer()
    .setInputCol("variety")
    .setOutputCol("varietyIndex"))

val indexed = (
    indexer           
    .fit(countVectorizerDF).transform(countVectorizerDF)
    .select($"variety", $"varietyIndex".cast("double").alias("label"), $"features"))

indexed.show()


+------------------+-----+--------------------+
|           variety|label|            features|
+------------------+-----+--------------------+
|cabernet sauvignon|  2.0|(20065,[0,1,2,3,4...|
|cabernet sauvignon|  2.0|(20065,[0,5,7,11,...|
|cabernet sauvignon|  2.0|(20065,[0,2,12,15...|
|cabernet sauvignon|  2.0|(20065,[0,1,6,10,...|
|cabernet sauvignon|  2.0|(20065,[1,5,7,12,...|
|cabernet sauvignon|  2.0|(20065,[4,9,46,49...|
|cabernet sauvignon|  2.0|(20065,[0,5,39,52...|
|cabernet sauvignon|  2.0|(20065,[0,1,9,10,...|
|cabernet sauvignon|  2.0|(20065,[1,2,3,12,...|
|cabernet sauvignon|  2.0|(20065,[1,6,11,12...|
|cabernet sauvignon|  2.0|(20065,[1,4,5,6,8...|
|cabernet sauvignon|  2.0|(20065,[1,7,10,12...|
|cabernet sauvignon|  2.0|(20065,[1,4,6,10,...|
|cabernet sauvignon|  2.0|(20065,[0,1,2,7,2...|
|cabernet sauvignon|  2.0|(20065,[12,39,44,...|
|cabernet sauvignon|  2.0|(20065,[0,1,3,14,...|
|cabernet sauvignon|  2.0|(20065,[0,4,6,12,...|
|cabernet sauvignon|  2.0|(20065,[1,2,4,

In [8]:
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val Array(trainingData, testData) = indexed.randomSplit(Array(0.7, 0.3))

// Train a NaiveBayes model.
val model = new NaiveBayes().fit(trainingData)



In [9]:
// Select example rows to display.
val predictions = model.transform(testData)

(predictions
 .filter($"label".equalTo($"prediction"))
 .select($"variety")
 .groupBy($"variety")
 .count
 .orderBy($"count".desc)
 .show(10))

(predictions
 .filter($"label".notEqual($"prediction"))
 .select($"variety")
 .groupBy($"variety")
 .count
 .orderBy($"count".desc)
 .show(10))

// Select (prediction, true label) and compute test error
val evaluator = (new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy"))

val accuracy = evaluator.evaluate(predictions)
println("Test set accuracy = " + accuracy)


+------------------+-----+
|           variety|count|
+------------------+-----+
|        pinot noir| 3080|
|        chardonnay| 3028|
|cabernet sauvignon| 2209|
|          riesling| 1073|
|   sauvignon blanc|  830|
|              rosé|  681|
|             syrah|  430|
+------------------+-----+

+------------------+-----+
|           variety|count|
+------------------+-----+
|             syrah|  717|
|        pinot noir|  619|
|   sauvignon blanc|  587|
|cabernet sauvignon|  469|
|        chardonnay|  380|
|          riesling|  358|
|              rosé|  265|
+------------------+-----+

Test set accuracy = 0.7694553850332745


In [10]:
import org.apache.spark.ml.regression.LinearRegression

val lr = (new LinearRegression()
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8))

// Fit the model
val lrModel = lr.fit(trainingData)

// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

Coefficients: (20065,[7,85,251,542],[-0.19138322261637372,-0.2639452304074721,0.2270774658680632,0.442845035787779]) Intercept: 2.1084465589513752
numIterations: 11
objectiveHistory: [0.4999851402757777,0.4992495421023175,0.4982776417781064,0.4979646870723006,0.4978644417511339,0.49783228070130064,0.4978218039393463,0.4978184250248167,0.49781729988562734,0.49781692859973264,0.49781679760014796]
+--------------------+
|           residuals|
+--------------------+
|-0.10844655895137523|
|-0.10844655895137523|
|-0.10844655895137523|
|-0.10844655895137523|
| 0.08293666366499841|
|-0.10844655895137523|
|-0.10844655895137523|
| 0.08293666366499841|
|-0.10844655895137523|
|-0.10844655895137523|
|-0.10844655895137523|
| 0.08293666366499841|
| 0.08293666366499841|
| 0.08293666366499841|
| 0.08293666366499841|
|-0.10844655895137523|
|-0.10844655895137523|
| 0.15549867145609686|
|  0.3468818940724705|
| 0.08293666366499841|
+--------------------+
only showing top 20 rows

RMSE: 1.8416498217237585

In [11]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.regression.GBTRegressor

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
val featureIndexer = (new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(15)
  .fit(indexed))

// Train a RandomForest model.
val rf = (new RandomForestRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures"))

// Train a DecisionTree model.
val dt = (new DecisionTreeRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures"))

// Train a GBT model.
val gbt = (new GBTRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")
  .setMaxIter(10))

// Chain indexer and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(featureIndexer, gbt))

// Train model. This also runs the indexer.
val model = pipeline.fit(trainingData)

// Make predictions.
val predictions = model.transform(testData)

// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

// Select (prediction, true label) and compute test error.
val evaluator = (new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse"))

val rmse = evaluator.evaluate(predictions)

println("Root Mean Squared Error (RMSE) on test data = " + rmse)


+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|1.6422612044711733|  2.0|(20065,[0,1,2,3,4...|
|2.4567537765944505|  2.0|(20065,[0,1,2,3,4...|
|2.2263054386220698|  2.0|(20065,[0,1,2,3,4...|
| 1.249610884739557|  2.0|(20065,[0,1,2,3,6...|
| 1.249610884739557|  2.0|(20065,[0,1,2,3,6...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 1.639412940255989


In [12]:
// Select example rows to display.
predictions.select("prediction", "label", "features").orderBy($"label".desc).show(500)


+-------------------+-----+--------------------+
|         prediction|label|            features|
+-------------------+-----+--------------------+
| 2.4239747155184452|  6.0|(20065,[0,9,15,16...|
| 1.4802475663512495|  6.0|(20065,[0,2,4,10,...|
|  2.412392446948956|  6.0|(20065,[0,9,13,24...|
| 2.3030923257739793|  6.0|(20065,[0,1,5,6,1...|
|  1.571010765591114|  6.0|(20065,[0,2,4,7,1...|
|  3.735038313963659|  6.0|(20065,[0,3,9,15,...|
|  2.490045026039038|  6.0|(20065,[0,9,13,16...|
| 1.1073401519269206|  6.0|(20065,[0,1,3,7,1...|
|  6.775034993786712|  6.0|(20065,[0,1,5,6,1...|
| 4.1648453377064705|  6.0|(20065,[0,2,3,4,1...|
| 1.2867266803731099|  6.0|(20065,[0,2,4,7,8...|
|  3.041885177661924|  6.0|(20065,[0,3,8,13,...|
|  4.007465170381376|  6.0|(20065,[0,3,9,15,...|
| 4.2570099217845705|  6.0|(20065,[0,5,6,16,...|
|  3.461200564908273|  6.0|(20065,[0,9,12,18...|
|  4.687709682771015|  6.0|(20065,[0,1,2,6,7...|
|  4.538738470077681|  6.0|(20065,[0,1,3,7,9...|
|  3.672247944367639

In [13]:
println(model)

println("Learned regression tree model:\n" + model.toDebugString)

pipeline_fb4fa445aa4a


Name: Compile Error
Message: <console>:83: error: value toDebugString is not a member of org.apache.spark.ml.PipelineModel
       println("Learned regression tree model:\n" + model.toDebugString)
                                                          ^
StackTrace: 