*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# VW MMLSpark Model for AI on Accumulo


In this notebook, we test a Vowpal Wabbit model in MMLSpark using the manual test set provided in [sentiment140](http://help.sentiment140.com/for-students/?source=post_page---------------------------) twitter data. [Microsoft Accumulo Spark Connector (MASC)](https://github.com/microsoft/masc) is used for handling data IO between Accumulo and Spark.   

Before running this notebook, please
* make sure you have Accumulo 2.0.0 and Spark 2.4.3 installed
* create and activate a conda environment with Apache Toree installed
* download accumulo-spark-datasource jar and accumulo-spark-iterator jar
* run commands like the following to install a Jupyter toree kernel
```
# Replace the jar file path based on your situation
JAR="file:///home/rba1/twitter-sentiment/lib/accumulo-spark-datasource-1.0.0-SNAPSHOT-shaded.jar"
jupyter toree install \
    --replace \
    --user \
    --kernel_name=accumulo \
    --spark_home=${SPARK_HOME} \
    --spark_opts="--master yarn --jars $JAR \
        --packages org.apache.spark:spark-avro_2.11:2.4.3,com.microsoft.ml.spark:mmlspark_2.11:0.18.1 \
        --driver-memory 8g \
        --executor-memory 6g \
        --driver-cores 2 \
        --executor-cores 2 \
        --num-executors 16"
```

In [1]:
import org.apache.spark.{SparkConf, SparkContext}

// Stop existing spark context and create new one
sc.stop()

val conf = new SparkConf()
conf.setAppName("TwitterSentimentClassification")

new SparkContext(conf)

println("Spark version %s".format(sc.version))
println("Scala %s".format(util.Properties.versionString))
println
sc.getConf.getAll.foreach(println)

Waiting for a Spark session to start...

Spark version 2.4.3
Scala version 2.11.12



Waiting for a Spark session to start...

(spark.eventLog.enabled,true)
(spark.repl.local.jars,file:///home/rba1/twitter-sentiment/lib/accumulo-spark-datasource-1.0.0-SNAPSHOT-shaded.jar,file:///home/rba1/.ivy2/jars/org.apache.spark_spark-avro_2.11-2.4.3.jar,file:///home/rba1/.ivy2/jars/com.microsoft.ml.spark_mmlspark_2.11-0.18.1.jar,file:///home/rba1/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar,file:///home/rba1/.ivy2/jars/org.scalactic_scalactic_2.11-3.0.5.jar,file:///home/rba1/.ivy2/jars/org.scalatest_scalatest_2.11-3.0.5.jar,file:///home/rba1/.ivy2/jars/io.spray_spray-json_2.11-1.3.2.jar,file:///home/rba1/.ivy2/jars/com.microsoft.cntk_cntk-2.4.jar,file:///home/rba1/.ivy2/jars/org.openpnp_opencv-3.2.0-1.jar,file:///home/rba1/.ivy2/jars/com.jcraft_jsch-0.1.54.jar,file:///home/rba1/.ivy2/jars/org.apache.httpcomponents_httpclient-4.5.6.jar,file:///home/rba1/.ivy2/jars/com.microsoft.ml.lightgbm_lightgbmlib-2.2.350.jar,file:///home/rba1/.ivy2/jars/com.github.vowpalwabbit_vw-jni-8.7.0.2.jar,file:///home/rba1/.ivy2/jars/org

conf = org.apache.spark.SparkConf@4d03b080


org.apache.spark.SparkConf@4d03b080

In [2]:
import org.apache.spark.sql.types.{LongType, DoubleType, StringType, StructField, StructType}
import org.apache.accumulo.core.client.Accumulo
import scala.collection.JavaConverters._

// client property file path
val PROPS_PATH = "/home/rba1/install/accumulo-2.0.0/conf/accumulo-client.properties"
val TEST_TABLE_NAME = "twitter_test_data"
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val schema = StructType(Array(
    StructField("sentiment", DoubleType),
    StructField("id", StringType),
    StructField("date", StringType),
    StructField("query_string", StringType),
    StructField("user", StringType),
    StructField("text", StringType)
))

PROPS_PATH = /home/rba1/install/accumulo-2.0.0/conf/accumulo-client.properties
TEST_TABLE_NAME = twitter_test_data
sqlContext = org.apache.spark.sql.SQLContext@2eed267d
schema = StructType(StructField(sentiment,DoubleType,true), StructField(id,StringType,true), StructField(date,StringType,true), StructField(query_string,StringType,true), StructField(user,StringType,true), StructField(text,StringType,true))




StructType(StructField(sentiment,DoubleType,true), StructField(id,StringType,true), StructField(date,StringType,true), StructField(query_string,StringType,true), StructField(user,StringType,true), StructField(text,StringType,true))

## Ingest Twitter Data to Accumulo

In [3]:
// need to upload data to hdfs first via 
// hdfs dfs -put /home/rba1/twitter-sentiment/testdata_manual.csv testdata_manual.csv
val file_path = "testdata_manual.csv"
val df = spark.read.format("csv").schema(schema).load(file_path)

var t0 = System.nanoTime()
val props = Accumulo.newClientProperties().from(PROPS_PATH).build()
props.put("table", TEST_TABLE_NAME)
props.put("rowKey", "id")
df.write.format("org.apache.accumulo").options(props.asScala).save()
var t1 = System.nanoTime()
println("Time to ingest twitter data to Accumulo: " + (t1 - t0)*1e-9 + "s")

Time to ingest twitter data to Accumulo: 4.65183017s


file_path = testdata_manual.csv
df = [sentiment: double, id: string ... 4 more fields]
t0 = 19161648859768
props = {auth.type=password, auth.principal=root, table=twitter_test_data, instance.zookeepers=rbaaccucluster2-0:2181,rbaaccucluster2-1:2181,rbaaccucluster2-2:2181, instance.name=muchos, rowKey=id, auth.token=secret}
t1 = 19166300689938


19166300689938

## Load Test Data from Accumulo

In [4]:
println("Reading test data from Accumulo...")
var t0 = System.nanoTime()
var test_df = spark.read
                   .format("org.apache.accumulo")
                   .options(props.asScala)
                   .schema(schema)
                   .load()
test_df.cache().count()
var t1 = System.nanoTime()
val read_time = (t1 - t0)*1e-9
println("Time to load test data: " + read_time + "s")

Reading test data from Accumulo...
Time to load test data: 3.6648699220000003s


t0 = 19168230563355
test_df = [sentiment: double, id: string ... 5 more fields]
t1 = 19171895433277
read_time = 3.6648699220000003


3.6648699220000003

## Data Preparation

In [5]:
import org.apache.spark.sql.functions.{rand, when}

test_df = test_df.withColumn("label", 'sentiment.cast("Int"))
                  .select('label as 'label, 'text as 'text)
                  .withColumn("label", when('label > 0, 1.0D).otherwise(0.0D))

test_df = [label: double, text: string]


[label: double, text: string]

## Load Model

In [6]:
import org.apache.spark.ml.PipelineModel
import com.microsoft.ml.spark.vw.VowpalWabbitClassificationModel

val vwModel = PipelineModel.load("./model/vwModel_twitter_sentiment")

vwModel = pipeline_3d0899e64c3e


pipeline_3d0899e64c3e

In [7]:
var t0 = System.nanoTime()
val vwPred = vwModel.transform(test_df)
vwPred.select("text", "label", "prediction").cache().count()
var t1 = System.nanoTime()
val infer_time = (t1 - t0)*1e-9
println("Time to make prediction: " + infer_time + "s")

Time to make prediction: 2.606229307s


t0 = 19180899641455
vwPred = [label: double, text: string ... 4 more fields]
t1 = 19183505870762
infer_time = 2.606229307


2.606229307

## Compute AUC

In [8]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val sparkEvaluator = new BinaryClassificationEvaluator()
    .setRawPredictionCol("prediction")
    .setLabelCol("label")
val test_AUC = sparkEvaluator.evaluate(vwPred)
println("Test AUC = %f".format(test_AUC))

Test AUC = 0.792175


sparkEvaluator = binEval_5c9a350032fc
test_AUC = 0.7921748772374465


0.7921748772374465