# Example Microsoft MASC Demo Notebook

## Tweet Based Sentiment Analysis
This example uses the Sentiment140 dataset to demonstrate simple tweet-based sentiment analysis

### Requirements
- Python version should be >= 3
- Jupyter Notebook should run using Java SDK version == 1.8
- Accumulo version should be >= 2.0.0

### Prerequisites
- Download and extract the dataset (available here: http://help.sentiment140.com/for-students)
  - In this example we have already extracted and pushed the training data file to HDFS
- Build datasource and iterator JARs or pull from Maven Central Repository
  - Datasource package on Maven Central Repository is specified below
  - The iterator JAR must be deployed to all Accumulo nodes, after which the Accumulo cluster must be restarted


### Setup

In [None]:
import base64
from configparser import ConfigParser
import os

from mleap import pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import CountVectorizer, RegexTokenizer
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F

In [5]:
# Define dependencies and configuration variables
# Maven coordinates handed to Spark through `spark.jars.packages`
DATASOURCE_PKG = 'com.microsoft.masc:microsoft-accumulo-spark-datasource:1.0.3'
SPARK_AVRO_PKG = 'org.apache.spark:spark-avro_2.11:2.4.3'
SPARK_MLEAP_PKG = 'ml.combust.mleap:mleap-spark_2.11:0.13.0'

# Accumulo client properties file (read with Python's open(), so a local path)
ACCUMULO_PROPS = '/opt/muchos/install/accumulo-2.0.0/conf/accumulo-client.properties'
# Training data file that was already pushed to HDFS
DATA_FILE = 'hdfs:///data/training.1600000.processed.noemoticon.csv'
# Must be a local filesystem path: the bundle is written through a `jar:file:` URI
# and read back with os.path / open(), none of which understand hdfs:// URIs
MLEAP_BUNDLE = '/tmp/sentiment140.lr.zip'

### Define helper functions

In [5]:
def convert_label(df):
    """Convert label to binary integer

    Rows whose label equals '2' are dropped; the remaining labels are cast to
    integer, divided by 4 and cast back to integer.
    """
    label = df['label']
    # Integer -> divide by 4 -> integer again (the division yields a non-integer type)
    binary_label = (label.cast(IntegerType()) / F.lit(4)).cast(IntegerType())
    kept_rows = df.filter(label != '2')
    return kept_rows.withColumn('label', binary_label)

def get_data(spark, data_file):
    """Read Sentiment140 data from file

    The CSV is read without a header, so Spark names the columns `_c0`..`_c5`.
    `_c0` becomes an integer `label` column (appended last) and the remaining
    columns are given descriptive names.
    """
    # Positional CSV column -> descriptive name
    renames = (
        ('_c1', 'id'),
        ('_c2', 'timestamp'),
        ('_c3', 'query'),
        ('_c4', 'user'),
        ('_c5', 'text'),
    )

    frame = spark.read.csv(data_file)
    frame = frame.withColumn('label', F.col('_c0').cast(IntegerType()))
    for positional_name, readable_name in renames:
        frame = frame.withColumnRenamed(positional_name, readable_name)

    # The raw label column is no longer needed once `label` exists
    return frame.drop('_c0')

def get_properties(properties_file):
    """Read Accumulo client properties file

    The file is a flat list of key/value properties without a section header, so a
    dummy ``[top]`` section is prepended to make it parseable by ConfigParser.

    :param properties_file: path of the properties file to read
    :return: dict mapping property names (lower-cased by ConfigParser) to string values
    """
    # Interpolation is disabled so that values containing '%' (for example a
    # password) are returned verbatim instead of raising InterpolationSyntaxError
    config = ConfigParser(interpolation=None)
    with open(properties_file) as stream:
        config.read_string("[top]\n" + stream.read())
    return dict(config['top'])

def get_model_string(bundle, model, df):
    """Get base64 encoded string representation of MLeap PipelineModel

    Any file already present at `bundle` is removed, the model is serialized to
    that path, and the resulting file is returned as a base64 UTF-8 string.
    """
    # Drop a bundle left over at this path before serializing a new one
    if os.path.exists(bundle):
        os.remove(bundle)

    transformed = model.transform(df)
    model.serializeToBundle('jar:file:{}'.format(bundle), transformed)

    with open(bundle, mode='rb') as handle:
        raw_bytes = handle.read()

    return base64.b64encode(raw_bytes).decode('utf-8')

### Read Data
Read a subset of the Twitter data used in the sentiment analysis benchmark.

In [6]:
# Start Spark session with the datasource, Avro and MLeap packages on the classpath
spark = (
    SparkSession.builder
    .config('spark.jars.packages', ','.join((DATASOURCE_PKG, SPARK_AVRO_PKG, SPARK_MLEAP_PKG)))
    .appName('DataSourceTest')
    .getOrCreate()
)

In [7]:
# Load the tweets and apply convert_label so that `label` is a binary 0/1 integer
# (rows labelled 2 are dropped, remaining labels are divided by 4) before training
df = convert_label(get_data(spark=spark, data_file=DATA_FILE))
df.limit(5).toPandas()

Unnamed: 0,id,timestamp,query,user,text,label
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,1
2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,1
4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


### Train Logistic Regression Spark ML Model

In [8]:
# Feature engineering stages: split text into runs of letters, then count tokens
tokenizer = RegexTokenizer(inputCol='text', outputCol='words', pattern='\\p{L}+', gaps=False)
vectorizer = CountVectorizer(outputCol='features', inputCol='words')
# Classifier trained on the vectorized tokens
lr = LogisticRegression(elasticNetParam=0, regParam=0.2, maxIter=1)

pipeline = Pipeline().setStages([tokenizer, vectorizer, lr])

# Fit the whole pipeline on the loaded data
model = pipeline.fit(df)

# Export the fitted pipeline as a base64 string
model_string = get_model_string(model=model, df=df, bundle=MLEAP_BUNDLE)

### Save Data to Accumulo

In [11]:
# Start from the Accumulo client properties, then add the connector options:
#   table  - Accumulo table where data will be written
#   rowkey - column to use as the key for Accumulo rows
properties = get_properties(ACCUMULO_PROPS)
properties.update(table='demo_table', rowkey='id')

df.write.format("com.microsoft.accumulo").options(**properties).save()

### Execute server-side inference and filtering

In [12]:
# Add the model to use and the filter expression applied to the model output
properties.update({
    'mleap': model_string,
    'mleapfilter': '${prediction > .9}',
})

# The id column is populated from the Accumulo key, so leave it out of the read schema
schema = df.drop('id').schema

pred = spark.read.format("com.microsoft.accumulo").options(**properties).schema(schema).load()

# Keep only the columns of interest and preview the first rows
pred.select(['user', 'text', 'prediction']).limit(15).toPandas()

Unnamed: 0,user,text,prediction
0,scotthamilton,is upset that he can't update his Facebook by ...,1.0
1,ElleCTF,my whole body feels itchy and like its on fire,1.0
2,joy_wolf,@Kwesidei not the whole crew,1.0
3,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...,1.0
4,mimismo,@twittera que me muera ?,1.0
5,pardonlauren,I just re-pierced my ears,1.0
6,robrobbierobert,"@octolinz16 It it counts, idk why I did either...",1.0
7,HairByJess,@iamjazzyfizzle I wish I got to watch it with ...,1.0
8,armotley,about to file taxes,1.0
9,gi_gi_bee,@FakerPattyPattz Oh dear. Were you drinking ou...,1.0


In [18]:
# Stop the Spark session now that the demo is complete
spark.stop()