In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version

update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)


In [0]:
# Locate the Spark installation and start a local Spark session.
import findspark

# Called without an argument, findspark falls back to the SPARK_HOME environment
# variable, so the install location is defined in exactly one place and does not
# depend on the notebook's current working directory.
findspark.init()

# findspark.init() adds pyspark to sys.path, so these imports must follow it.
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SparkSession

# "local[*]" runs Spark locally with one worker thread per logical core.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:

# Fetch the food-reviews CSV from S3 and load it into a Spark DataFrame
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/dataviz-curriculum/day_2/food_reviews.csv"

# addFile downloads the file; SparkFiles.get resolves its local path by file name
spark.sparkContext.addFile(url)
df = spark.read.csv(
    SparkFiles.get("food_reviews.csv"),
    header=True,
    sep=",",
)

# Preview the loaded rows
df.show()

+--------------------+
|             Reviews|
+--------------------+
|The pasta was a d...|
|We ate the fish i...|
|My family did not...|
|The girl even tri...|
|this is his job a...|
|I'm always greete...|
+--------------------+



In [0]:

# Tokenizer that reads the "Reviews" column and writes the tokens to "Words"
review_data = Tokenizer(
    outputCol="Words",
    inputCol="Reviews",
)

In [6]:

# Run the tokenizer over the reviews; the result gains a "Words" column
reviewed = review_data.transform(df)

# Preview with show()'s defaults spelled out (20 rows, long cells truncated)
reviewed.show(n=20, truncate=True)

+--------------------+--------------------+
|             Reviews|               Words|
+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|
|We ate the fish i...|[we, ate, the, fi...|
|My family did not...|[my, family, did,...|
|The girl even tri...|[the, girl, even,...|
|this is his job a...|[this, is, his, j...|
|I'm always greete...|[i'm, always, gre...|
+--------------------+--------------------+



In [0]:

# Stop-word filter that reads the "Words" column and writes the result to "filtered"
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="Words",
)

In [8]:

# Apply the stop-word filter to the tokenized reviews; adds the "filtered" column
newFrame = remover.transform(reviewed)

# Preview with show()'s defaults spelled out (20 rows, long cells truncated)
newFrame.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+
|             Reviews|               Words|            filtered|
+--------------------+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|       [pasta, dish]|
|We ate the fish i...|[we, ate, the, fi...|  [ate, fish, tasty]|
|My family did not...|[my, family, did,...|[family, like, food]|
|The girl even tri...|[the, girl, even,...|[girl, even, trie...|
|this is his job a...|[this, is, his, j...|[job, since, prob...|
|I'm always greete...|[i'm, always, gre...|[always, greeted,...|
+--------------------+--------------------+--------------------+



In [9]:

# Display only the stop-word-filtered tokens, without truncating cell contents
newFrame.select("filtered").show(n=20, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filtered                                                                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[pasta, dish]                                                                                                                                                                |
|[ate, fish, tasty]                                                                                                                                                           |
|[family, like, food]                                                                                                   