In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Make the spark-csv package available to PySpark.
# `spark_submit` is a project-local helper; per its printed output it sets the
# PYSPARK_SUBMIT_ARGS environment variable to `--packages <coords> pyspark-shell`.
# NOTE(review): presumably this must run before the Spark session is created,
# since submit args are read when the JVM is launched -- confirm.
from spark_libs import spark_submit
packages = ["com.databricks:spark-csv_2.11:1.5.0"]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark import SparkFiles


In [5]:
# Attach to the active Spark session if there is one; otherwise start a new
# session registered under this application name.
app_name = "nlp-stopwords"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [6]:
# Fetch the reviews CSV from its S3 URL and register it with the Spark context,
# so that SparkFiles.get() can resolve the downloaded copy by file name.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_2/food_reviews.csv"
spark.sparkContext.addFile(url)

# Load the CSV, treating the first line as a header and inferring column types.
df = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(SparkFiles.get("food_reviews.csv"))
)
df.printSchema()

root
 |-- Reviews: string (nullable = true)



In [7]:
# Preview the loaded reviews; show() prints a text table and truncates long
# strings (hence the trailing "..." in the output).
df.show()

+--------------------+
|             Reviews|
+--------------------+
|The pasta was a d...|
|We ate the fish i...|
|My family did not...|
|The girl even tri...|
|this is his job a...|
|I'm always greete...|
+--------------------+



In [8]:
# Configure a tokenizer that reads the "Reviews" column and writes the token
# arrays to a new "Words" column. Nothing is computed here; the tokenizer is
# only applied when .transform() is called on a DataFrame.
review_data = Tokenizer(inputCol="Reviews", outputCol="Words")

In [9]:
# Apply the tokenizer to the reviews: the result keeps "Reviews" and adds a
# "Words" column holding each review's tokens.
reviewed = review_data.transform(df)
reviewed.show()

+--------------------+--------------------+
|             Reviews|               Words|
+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|
|We ate the fish i...|[we, ate, the, fi...|
|My family did not...|[my, family, did,...|
|The girl even tri...|[the, girl, even,...|
|this is his job a...|[this, is, his, j...|
|I'm always greete...|[i'm, always, gre...|
+--------------------+--------------------+



In [10]:
# Configure a stop-word remover that reads the "Words" column and writes the
# remaining tokens to "filtered". No explicit stop-word list is passed, so the
# remover's default list is used. Nothing is computed until .transform() runs.
remover = StopWordsRemover(inputCol="Words", outputCol="filtered")

In [11]:
# Apply the stop-word remover to the tokenized reviews: the result keeps
# "Reviews" and "Words" and adds "filtered" with the stop words dropped.
newFrame = remover.transform(reviewed)
newFrame.show()

+--------------------+--------------------+--------------------+
|             Reviews|               Words|            filtered|
+--------------------+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|       [pasta, dish]|
|We ate the fish i...|[we, ate, the, fi...|  [ate, fish, tasty]|
|My family did not...|[my, family, did,...|[family, like, food]|
|The girl even tri...|[the, girl, even,...|[girl, even, trie...|
|this is his job a...|[this, is, his, j...|[job, since, prob...|
|I'm always greete...|[i'm, always, gre...|[always, greeted,...|
+--------------------+--------------------+--------------------+



In [12]:
# Show only the filtered tokens; truncate=False prints each token list in full
# instead of cutting it off with "...".
newFrame.select("filtered").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filtered                                                                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[pasta, dish]                                                                                                                                                                |
|[ate, fish, tasty]                                                                                                                                                           |
|[family, like, food]                                                                                                   