# Tokenizing

In [1]:
# Install the additional Python libraries listed in requirements.txt.
# NOTE(review): `!pip` runs pip through a shell escape; `%pip` would target the
# kernel's own environment — confirm which interpreter this should install into.
!pip install -r requirements.txt

In [2]:
# Register the spark-csv package with PySpark before the Spark session is created.
# Per the recorded output, spark_submit sets the PYSPARK_SUBMIT_ARGS environment
# variable to "--packages <coordinates> pyspark-shell".
from spark_libs import spark_submit
packages = ["com.databricks:spark-csv_2.11:1.5.0"]  # coordinate passed to --packages
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import RegexTokenizer, Tokenizer
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark import SparkFiles


In [4]:
# Reuse the active Spark session when there is one; otherwise start a new one
# under this notebook's application name.

app_name = "tokenizing-data"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [5]:
# Fetch the CSV from S3 via SparkContext.addFile, then load the local copy
# with a header row and inferred column types.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_2/data.csv"
spark.sparkContext.addFile(url)

df = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header='true', inferSchema="true")
    .load(SparkFiles.get("data.csv"))
)
df.printSchema()

root
 |-- Poem: string (nullable = true)



In [6]:
# Show DataFrame rows with show()'s default settings; as the recorded output
# shows, long strings are cut off with "..."
df.show()

+--------------------+
|                Poem|
+--------------------+
|This Autumn midnight|
|Orion’s at my window|
|shouting for his ...|
+--------------------+



In [7]:
# Build the tokenizer stage: it reads the "Poem" column and writes tokens to "words".
# Nothing is tokenized yet; the DataFrame is only transformed when .transform() is called.
tokened = Tokenizer(inputCol="Poem", outputCol="words")

In [8]:
# Apply the tokenizer to df, which adds the "words" array column next to "Poem",
# then preview the result.
tokenized = tokened.transform(df)
tokenized.show()

+--------------------+--------------------+
|                Poem|               words|
+--------------------+--------------------+
|This Autumn midnight|[this, autumn, mi...|
|Orion’s at my window|[orion’s, at, my,...|
|shouting for his ...|[shouting, for, h...|
+--------------------+--------------------+



In [9]:
# Create a Function to count vowels
@F.udf(returnType=IntegerType())
def vowel_counter(words):
    """Count the vowels (a, e, i, o, u) across every token in ``words``.

    Registered as a Spark UDF that produces an integer column.

    Args:
        words: Iterable of string tokens (for example the array column
            produced by ``Tokenizer``), or ``None`` for a null row.

    Returns:
        The total number of vowels found in all tokens, ignoring case.
        ``None`` when ``words`` itself is null, so the null propagates to
        the result column instead of failing the job.
    """
    # Spark passes SQL NULL to Python UDFs as None; iterating over it would
    # raise TypeError and abort the whole job.
    if words is None:
        return None

    # Both cases are listed so the count is correct even when the input was
    # not lowercased beforehand.
    vowels = frozenset("aeiouAEIOU")

    return sum(
        1
        for word in words
        if word is not None  # arrays may contain null elements
        for letter in word
        if letter in vowels
    )

In [10]:
# Store a user defined function
# non-decorator approach: wrap the plain (undecorated) function with F.udf
# count_vowels = F.udf(vowel_counter, IntegerType())
# count_vowels

In [11]:
# Add a "vowels" column computed by the UDF from "words" and print the rows
# without truncating long values.
(
    tokenized
    .select("Poem", "words")
    .withColumn("vowels", vowel_counter(F.col("words")))
    .show(truncate=False)
)

+---------------------+--------------------------+------+
|Poem                 |words                     |vowels|
+---------------------+--------------------------+------+
|This Autumn midnight |[this, autumn, midnight]  |6     |
|Orion’s at my window |[orion’s, at, my, window] |6     |
|shouting for his dog.|[shouting, for, his, dog.]|6     |
+---------------------+--------------------------+------+

