In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
from spark_libs import spark_submit
packages = ["com.databricks:spark-csv_2.11:1.5.0", 
            "JohnSnowLabs:spark-nlp:2.3.4"]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,JohnSnowLabs:spark-nlp:2.3.4 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

from pyspark.ml.feature import (
    StringIndexer,
    HashingTF, 
    IDF
)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes

In [4]:
# get or create Spark session

app_name = "yelp-john-snow"
spark = SparkSession.builder.appName(app_name).getOrCreate()

In [5]:
import sparknlp
sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline

explain_document_pipeline = PretrainedPipeline("explain_document_ml")
annotations = explain_document_pipeline.annotate("We are very happy about SparkNLP")
print(annotations)


explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]
{'stem': ['we', 'ar', 'veri', 'happi', 'about', 'sparknlp'], 'checked': ['We', 'are', 'very', 'happy', 'about', 'SparkNLP'], 'lemma': ['We', 'be', 'very', 'happy', 'about', 'SparkNLP'], 'document': ['We are very happy about SparkNLP'], 'pos': ['PRP', 'VBP', 'RB', 'JJ', 'IN', 'NNP'], 'token': ['We', 'are', 'very', 'happy', 'about', 'SparkNLP'], 'sentence': ['We are very happy about SparkNLP']}


In [6]:
print(type(annotations))

<class 'dict'>


In [7]:
url ="https://s3.amazonaws.com/dataviz-curriculum/day_2/yelp_reviews.csv"
spark.sparkContext.addFile(url)

df = spark.read \
    .format("com.databricks.spark.csv") \
    .options(header='true', inferSchema="true") \
    .load(SparkFiles.get("yelp_reviews.csv"))
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)



In [8]:
# show first row
df.head()

Row(class='positive', text='Wow... Loved this place.')

In [9]:
annotations_df = explain_document_pipeline.transform(df)
annotations_df.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   class|                text|            document|            sentence|               token|             checked|               lemma|                stem|                 pos|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|positive|Wow... Loved this...|[[document, 0, 23...|[[document, 0, 5,...|[[token, 0, 2, Wo...|[[token, 0, 2, Wo...|[[token, 0, 2, Wo...|[[token, 0, 2, wo...|[[pos, 0, 2, UH, ...|
|negative|  Crust is not good.|[[document, 0, 17...|[[document, 0, 17...|[[token, 0, 4, Cr...|[[token, 0, 4, Cr...|[[token, 0, 4, Cr...|[[token, 0, 4, cr...|[[pos, 0, 4, NNP,...|
|negative|Not tasty and the...|[[document, 0, 40...|[[document, 0, 40...|[[token, 0, 2, No...|[[token, 0,

In [10]:
annotations_df.select(['class', 'stem']).show()

+--------+--------------------+
|   class|                stem|
+--------+--------------------+
|positive|[[token, 0, 2, wo...|
|negative|[[token, 0, 4, cr...|
|negative|[[token, 0, 2, no...|
|positive|[[token, 0, 6, st...|
|positive|[[token, 0, 2, th...|
|negative|[[token, 0, 2, no...|
|negative|[[token, 0, 7, ho...|
|negative|[[token, 0, 2, th...|
|positive|[[token, 0, 2, th...|
|positive|[[token, 0, 0, a,...|
|positive|[[token, 0, 6, se...|
|negative|[[token, 0, 4, wo...|
|negative|[[token, 0, 2, th...|
|positive|[[token, 0, 0, i,...|
|negative|[[token, 0, 0, i,...|
|negative|[[token, 0, 0, i,...|
|positive|[[token, 0, 5, hi...|
|negative|[[token, 0, 7, wa...|
|negative|[[token, 0, 3, th...|
|negative|[[token, 0, 2, di...|
+--------+--------------------+
only showing top 20 rows



## DocumentAssembler: Getting data in
In order to get through the NLP process, we need to get raw data annotated. There is a special transformer that does this for us: the DocumentAssembler, it creates the first annotation of type Document which may be used by annotators down the road



In [11]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Building our own pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

## Sentence detection and tokenization
In this quick example, we now proceed to identify the sentences in each of our document lines. SentenceDetector requires a Document annotation, which is provided by the DocumentAssembler output, and it’s itself a Document type token. The Tokenizer requires a Document annotation type, meaning it works both with DocumentAssembler or SentenceDetector output, in here, we use the sentence output.



In [12]:
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

## Finisher: Getting data out

At the end of each pipeline or any stage that was done by Spark NLP, you may want to get results out whether onto another pipeline or simply write them on disk. The Finisher annotator helps you to clean the metadata (if it’s set to true) and output the results into an array:



In [13]:
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setIncludeMetadata(True) \
    .setCleanAnnotations(True)

## Using Spark ML Pipeline
Now we want to put all this together and retrieve the results, we use a Pipeline for this. We use the same data in fit() that we will use in transform since none of the pipeline stages have a training stage.



In [14]:
pipeline = Pipeline(
    stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    finisher
  ])


In [15]:
model = pipeline.fit(df)

In [16]:
extracted = model.transform(df)
extracted.show()

+--------+--------------------+--------------------+-----------------------+
|   class|                text|      finished_token|finished_token_metadata|
+--------+--------------------+--------------------+-----------------------+
|positive|Wow... Loved this...|[Wow, ..., Loved,...|   [[sentence, 0], [...|
|negative|  Crust is not good.|[Crust, is, not, ...|   [[sentence, 0], [...|
|negative|Not tasty and the...|[Not, tasty, and,...|   [[sentence, 0], [...|
|positive|Stopped by during...|[Stopped, by, dur...|   [[sentence, 0], [...|
|positive|The selection on ...|[The, selection, ...|   [[sentence, 0], [...|
|negative|Now I am getting ...|[Now, I, am, gett...|   [[sentence, 0], [...|
|negative|Honeslty it didn'...|[Honeslty, it, di...|   [[sentence, 0], [...|
|negative|The potatoes were...|[The, potatoes, w...|   [[sentence, 0], [...|
|positive|The fries were gr...|[The, fries, were...|   [[sentence, 0], [...|
|positive|      A great touch.|[A, great, touch, .]|   [[sentence, 0], [...|

In [17]:
pos_neg_to_num = StringIndexer(inputCol='class',outputCol='label')

hashingTF = HashingTF(inputCol="finished_token", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')
# Create feature vectors
features = VectorAssembler(inputCols=['idf_token'], outputCol='features')

In [18]:
pipeline = Pipeline(
    stages = [
    pos_neg_to_num,
    documentAssembler,
    sentenceDetector,
    tokenizer,
    finisher,
    hashingTF,
    idf,
    features
  ])


In [19]:
model = pipeline.fit(df)
extracted = model.transform(df)
extracted.show()

+--------+--------------------+-----+--------------------+-----------------------+--------------------+--------------------+--------------------+
|   class|                text|label|      finished_token|finished_token_metadata|          hash_token|           idf_token|            features|
+--------+--------------------+-----+--------------------+-----------------------+--------------------+--------------------+--------------------+
|positive|Wow... Loved this...|  0.0|[Wow, ..., Loved,...|   [[sentence, 0], [...|(262144,[1536,131...|(262144,[1536,131...|(262144,[1536,131...|
|negative|  Crust is not good.|  1.0|[Crust, is, not, ...|   [[sentence, 0], [...|(262144,[1536,158...|(262144,[1536,158...|(262144,[1536,158...|
|negative|Not tasty and the...|  1.0|[Not, tasty, and,...|   [[sentence, 0], [...|(262144,[1536,255...|(262144,[1536,255...|(262144,[1536,255...|
|positive|Stopped by during...|  0.0|[Stopped, by, dur...|   [[sentence, 0], [...|(262144,[1536,339...|(262144,[1536,339...|

In [20]:
# Break data down into a training set and a testing set
training, testing = extracted.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [21]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------+--------------------+-----+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   class|                text|label|      finished_token|finished_token_metadata|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------+--------------------+-----+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|negative|"I don't know wha...|  1.0|[", I, don't, kno...|   [[sentence, 0], [...|(262144,[15889,29...|(262144,[15889,29...|(262144,[15889,29...|[-723.82082134981...|[7.14288495719912...|       1.0|
|negative|"The burger... I ...|  1.0|[", The, burger, ...|   [[sentence, 0], [...|(262144,[1536,963...|(262144,[1536,963...|(262144,[1536,963...|[-763.76920546547...|[6.37156015969899...|       1.0|
|nega

In [22]:
# Use the Class Evaluator for a cleaner description
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.736583
