In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [3]:
# Create (or reuse) the SparkSession for this notebook, registered under the app name 'tokenizer'
spark = (
    SparkSession.builder
    .appName('tokenizer')
    .getOrCreate()
)

In [4]:
# Load data.csv through the CSV reader without setting any reader options.
# NOTE(review): `data` is not referenced again in the visible cells, and the next
# cell re-reads the same file with header='true' -- confirm this read is still needed.
data = spark.read.csv(path='data.csv')

In [6]:
# Read data.csv with the 'header' option set to 'true', then display the dataframe
dataframe = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('data.csv')
)
dataframe.show()

+--------------------+
|                Poem|
+--------------------+
|This Autumn midnight|
|Orion’s at my window|
|shouting for his ...|
+--------------------+



In [7]:
# Build the Tokenizer: it reads the 'Poem' column and writes its output to 'words'.
# Nothing is transformed yet; this only configures the stage.
tokened = Tokenizer(
    inputCol='Poem',
    outputCol='words',
)

In [9]:
# Apply the tokenizer to the dataframe and display the result,
# which now carries the additional 'words' column
tokenized = tokened.transform(dataset=dataframe)
tokenized.show()

+--------------------+--------------------+
|                Poem|               words|
+--------------------+--------------------+
|This Autumn midnight|[this, autumn, mi...|
|Orion’s at my window|[orion’s, at, my,...|
|shouting for his ...|[shouting, for, h...|
+--------------------+--------------------+



In [10]:
# Plain-Python vowel counter; a later cell wraps it as a Spark UDF
def vowel_counter(words):
    """Count the vowels (a, e, i, o, u) across a collection of words.

    Matching is case-insensitive, so the result does not depend on the
    input having been lowercased beforehand.

    Args:
        words: Iterable of strings (for example the token list held in the
            'words' column), or None.

    Returns:
        int: Total number of vowel characters over all words. Returns 0
        when ``words`` is None or empty; None entries inside ``words``
        are skipped.
    """
    # A null value would otherwise raise TypeError when iterated below.
    if words is None:
        return 0

    vowels = frozenset('aeiouAEIOU')

    return sum(
        1
        for word in words
        if word is not None  # tolerate null entries inside the list
        for letter in word
        if letter in vowels
    )

In [11]:
# Wrap vowel_counter as a Spark UDF whose declared return type is IntegerType,
# then echo the resulting UDF object as the cell output
count_vowels = udf(f=vowel_counter, returnType=IntegerType())
count_vowels

<function __main__.vowel_counter>

In [12]:
# Add a 'vowels' column by applying the UDF to 'words' and show the rows untruncated.
# The result is only displayed; it is not assigned to a variable.
(
    tokenized
    .select('Poem', 'words')
    .withColumn('vowels', count_vowels(col('words')))
    .show(truncate=False)
)

+---------------------+--------------------------+------+
|Poem                 |words                     |vowels|
+---------------------+--------------------------+------+
|This Autumn midnight |[this, autumn, midnight]  |6     |
|Orion’s at my window |[orion’s, at, my, window] |6     |
|shouting for his dog.|[shouting, for, his, dog.]|6     |
+---------------------+--------------------------+------+



In [13]:
# Stop the Spark session now that the notebook's work is finished
spark.stop()