*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# VW MMLSpark Model for AI on Accumulo


In this notebook, we train a Vowpal Wabbit model in MMLSpark on the [sentiment140](http://help.sentiment140.com/for-students/?source=post_page---------------------------) Twitter data set. The [Microsoft Accumulo Spark Connector (MASC)](https://github.com/microsoft/masc) handles data I/O between Accumulo and Spark.

Before running this notebook, please
* make sure you have Accumulo 2.0.0 and Spark 2.4.3 installed
* create and activate a conda environment with Apache Toree installed
* download the accumulo-spark-datasource jar and the accumulo-spark-iterator jar
* run commands like the following to install a Jupyter Toree kernel:
```
# Replace the jar file path based on your situation
JAR="file:///home/rba1/twitter-sentiment/lib/accumulo-spark-datasource-1.0.0-SNAPSHOT-shaded.jar"
jupyter toree install \
    --replace \
    --user \
    --kernel_name=accumulo \
    --spark_home=${SPARK_HOME} \
    --spark_opts="--master yarn --jars $JAR \
        --packages org.apache.spark:spark-avro_2.11:2.4.3,com.microsoft.ml.spark:mmlspark_2.11:0.18.1 \
        --driver-memory 8g \
        --executor-memory 6g \
        --driver-cores 2 \
        --executor-cores 2 \
        --num-executors 16"
```

In [1]:
import org.apache.spark.{SparkConf, SparkContext}

// Replace the kernel-provided Spark context with one that carries our application name.
sc.stop()

val conf = new SparkConf().setAppName("TwitterSentimentClassification")

// The new context is not bound to a name here; the `sc` references below are
// presumably resolved by the kernel to this freshly created context (TODO confirm).
new SparkContext(conf)

// Report the runtime versions, then list every effective Spark setting.
println(s"Spark version ${sc.version}")
println(s"Scala ${util.Properties.versionString}")
println()
for (setting <- sc.getConf.getAll) println(setting)

Waiting for a Spark session to start...

Spark version 2.4.3
Scala version 2.11.12



Waiting for a Spark session to start...

(spark.eventLog.enabled,true)
(spark.repl.local.jars,file:///home/rba1/twitter-sentiment/lib/accumulo-spark-datasource-1.0.0-SNAPSHOT-shaded.jar,file:///home/rba1/.ivy2/jars/org.apache.spark_spark-avro_2.11-2.4.3.jar,file:///home/rba1/.ivy2/jars/com.microsoft.ml.spark_mmlspark_2.11-0.18.1.jar,file:///home/rba1/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar,file:///home/rba1/.ivy2/jars/org.scalactic_scalactic_2.11-3.0.5.jar,file:///home/rba1/.ivy2/jars/org.scalatest_scalatest_2.11-3.0.5.jar,file:///home/rba1/.ivy2/jars/io.spray_spray-json_2.11-1.3.2.jar,file:///home/rba1/.ivy2/jars/com.microsoft.cntk_cntk-2.4.jar,file:///home/rba1/.ivy2/jars/org.openpnp_opencv-3.2.0-1.jar,file:///home/rba1/.ivy2/jars/com.jcraft_jsch-0.1.54.jar,file:///home/rba1/.ivy2/jars/org.apache.httpcomponents_httpclient-4.5.6.jar,file:///home/rba1/.ivy2/jars/com.microsoft.ml.lightgbm_lightgbmlib-2.2.350.jar,file:///home/rba1/.ivy2/jars/com.github.vowpalwabbit_vw-jni-8.7.0.2.jar,file:///home/rba1/.ivy2/jars/org

conf = org.apache.spark.SparkConf@3440396b


org.apache.spark.SparkConf@3440396b

In [2]:
import org.apache.spark.sql.types.{LongType, DoubleType, StringType, StructField, StructType}
import org.apache.accumulo.core.client.Accumulo
import scala.collection.JavaConverters._

// Path of the Accumulo client properties file
val PROPS_PATH = "/home/rba1/install/accumulo-2.0.0/conf/accumulo-client.properties"
// Name of the Accumulo table used for the training data
val TRAIN_TABLE_NAME = "twitter_train_data"
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Column layout of the input data: a numeric sentiment column followed by
// five string columns, all nullable (the StructField default).
val schema = StructType(
    StructField("sentiment", DoubleType) +:
    Seq("id", "date", "query_string", "user", "text")
        .map(name => StructField(name, StringType))
)

PROPS_PATH = /home/rba1/install/accumulo-2.0.0/conf/accumulo-client.properties
TRAIN_TABLE_NAME = twitter_train_data
sqlContext = org.apache.spark.sql.SQLContext@74dbb18b
schema = StructType(StructField(sentiment,DoubleType,true), StructField(id,StringType,true), StructField(date,StringType,true), StructField(query_string,StringType,true), StructField(user,StringType,true), StructField(text,StringType,true))




StructType(StructField(sentiment,DoubleType,true), StructField(id,StringType,true), StructField(date,StringType,true), StructField(query_string,StringType,true), StructField(user,StringType,true), StructField(text,StringType,true))

## Ingest Twitter Data to Accumulo

In [3]:
// The CSV has to be on HDFS before this cell runs, e.g. uploaded with
//   hdfs dfs -put /home/rba1/twitter-sentiment/sentiment140_prefix.csv sentiment140_prefix.csv
val file_path = "sentiment140_prefix.csv"
val df = spark.read
    .format("csv")
    .schema(schema)
    .load(file_path)

// The timed region covers building the client properties and the write itself.
var t0 = System.nanoTime()
val props = Accumulo.newClientProperties().from(PROPS_PATH).build()
// Connector options: target table, and presumably the column to use as the
// Accumulo row key (verify against the connector documentation).
props.put("table", TRAIN_TABLE_NAME)
props.put("rowKey", "id")
df.write
    .format("org.apache.accumulo")
    .options(props.asScala)
    .save()
var t1 = System.nanoTime()
println(s"Time to ingest twitter data to Accumulo: ${(t1 - t0)*1e-9}s")

Time to ingest twitter data to Accumulo: 22.002847971s


file_path = sentiment140_prefix.csv
df = [sentiment: double, id: string ... 4 more fields]
t0 = 18420090028815
props = {auth.type=password, auth.principal=root, table=twitter_train_data, instance.zookeepers=rbaaccucluster2-0:2181,rbaaccucluster2-1:2181,rbaaccucluster2-2:2181, instance.name=muchos, rowKey=id, auth.token=secret}
t1 = 18442092876786


18442092876786

## Load Training Data from Accumulo

In [4]:
println("Reading training data from Accumulo...")
var t0 = System.nanoTime()
// Same connector options as used for the ingest, read back with the explicit schema.
var train_df = spark.read.format("org.apache.accumulo").options(props.asScala).schema(schema).load()
// count() is an action, so the (otherwise lazy) read and the caching happen
// here, inside the timed region.
train_df.cache().count()
var t1 = System.nanoTime()
// Elapsed nanoseconds converted to seconds
val read_time = (t1 - t0)*1e-9
println(s"Time to load training data: ${read_time}s")

Reading training data from Accumulo...
Time to load training data: 34.577519024000004s


t0 = 18443853066868
train_df = [sentiment: double, id: string ... 5 more fields]
t1 = 18478430585892
read_time = 34.577519024000004


34.577519024000004

## Data Preparation

In [5]:
import org.apache.spark.sql.functions.{rand, when}

// Randomly permute the rows for online training, then keep only a binary label and
// the tweet text: an integer-cast sentiment above zero becomes 1.0, anything else -1.0.
train_df = train_df
    .orderBy(rand())
    .select(
        when('sentiment.cast("Int") > 0, 1.0D).otherwise(-1.0D).as("label"),
        'text
    )

train_df = [label: double, text: string]


[label: double, text: string]

## Feature Engineering and Model Training

In [6]:
import org.apache.spark.ml.Pipeline
import com.microsoft.ml.spark.vw.{VowpalWabbitFeaturizer, VowpalWabbitClassifier}

// Featurizer stage: reads the "text" column and writes its result to "features".
val vwFeaturizer = new VowpalWabbitFeaturizer()
    .setOutputCol("features")
    .setStringSplitInputCols(Array("text"))

// Command-line style arguments handed to Vowpal Wabbit, joined with single spaces.
val vwParams = Seq("--loss_function=logistic", "--quiet", "--holdout_off").mkString(" ")

// Classifier stage: one pass over the data, trained against the "label" column.
// No features column is set here, so it presumably falls back to the default
// that matches the featurizer output above (TODO confirm).
val vw = new VowpalWabbitClassifier()
    .setNumPasses(1)
    .setArgs(vwParams)
    .setLabelCol("label")

vwFeaturizer = VowpalWabbitFeaturizer_f1627b144bf8
vwParams = --loss_function=logistic --quiet --holdout_off
vw = VowpalWabbitClassifier_62338413e2e0


VowpalWabbitClassifier_62338413e2e0

In [7]:
// Chain the featurizer and the classifier into a single training pipeline.
val vw_pipeline = new Pipeline()
    .setStages(Array(vwFeaturizer, vw))

// Time only the fit call.
var t0 = System.nanoTime()
val vwModel = vw_pipeline.fit(train_df)
var t1 = System.nanoTime()
// Elapsed nanoseconds converted to seconds
val train_time = (t1 - t0)*1e-9
println(s"Time to train Vowpal Wabbit model: ${train_time}s")

Time to train Vowpal Wabbit model: 19.832663679s


vw_pipeline = pipeline_3d0899e64c3e
t0 = 18482474609412
vwModel = pipeline_3d0899e64c3e
t1 = 18502307273091
train_time = 19.832663679


19.832663679

In [8]:
// Persist the fitted pipeline model, replacing anything already stored at this path.
// The path is relative, so it is resolved against the default file system
// (presumably HDFS on this cluster).
vwModel.write
    .overwrite()
    .save("./model/vwModel_twitter_sentiment")