In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
from spark_libs import spark_submit

# Maven coordinates of the extra Spark packages to load.
# Judging by the message this cell prints, spark_submit() places them in the
# PYSPARK_SUBMIT_ARGS environment variable as a `--packages` option.
packages = [
    "com.databricks:spark-csv_2.11:1.5.0",
]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [5]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer
from pyspark.sql.types import IntegerType

In [4]:
# Reuse the active Spark session if there is one; otherwise create a new one
# under the application name below.

app_name = "spark-udf"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [6]:
# Small demo DataFrame: one row per line of the rhyme, keyed by an integer id.
dataframe = spark.createDataFrame(
    data=[
        (0, "Mary had a little lamb"),
        (1, "It's fleece was white as snow"),
        (2, "And everywhere Mary went"),
        (3, "The lamb was sure to go"),
    ],
    schema=["id", "Nursery Rhyme"],
)
dataframe.show()

+---+--------------------+
| id|       Nursery Rhyme|
+---+--------------------+
|  0|Mary had a little...|
|  1|It's fleece was w...|
|  2|And everywhere Ma...|
|  3|The lamb was sure...|
+---+--------------------+



In [17]:
# Write a function to count occurrences of Mary
def mary_counter(rhyme):
    """Count the words in ``rhyme`` that equal "mary", ignoring case.

    Words are separated by any run of whitespace (spaces, tabs, newlines).
    Only whole words match, so a token with attached punctuation such as
    "Mary," is not counted.

    Args:
        rhyme: The text to scan, or None for a missing value.

    Returns:
        The number of matching words as an int, or None when ``rhyme`` is
        None, so that a null input row yields a null count instead of an
        exception.
    """
    # A null cell arrives as None; guard here because the caller cannot
    # short-circuit around the function once it is wrapped as a UDF.
    if rhyme is None:
        return None
    return sum(1 for word in rhyme.split() if word.lower() == "mary")

In [19]:
# Wrap the plain Python function as a Spark UDF that yields an integer column,
# then add its result for each rhyme as the "mary count" column.
mary_udf = F.udf(f=mary_counter, returnType=IntegerType())
(
    dataframe
    .withColumn("mary count", mary_udf("Nursery Rhyme"))
    .show()
)

+---+--------------------+----------+
| id|       Nursery Rhyme|mary count|
+---+--------------------+----------+
|  0|Mary had a little...|         1|
|  1|It's fleece was w...|         0|
|  2|And everywhere Ma...|         1|
|  3|The lamb was sure...|         0|
+---+--------------------+----------+



In [20]:
# Decorators offer a more concise method
# Documentation: https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/functions.html#udf
# Print the built-in help text for F.udf.
help(F.udf)

Help on function udf in module pyspark.sql.functions:

udf(f=None, returnType=StringType)
    Creates a user defined function (UDF).
    
    .. note:: The user-defined functions are considered deterministic by default. Due to
        optimization, duplicate invocations may be eliminated or the function may even be invoked
        more times than it is present in the query. If your function is not deterministic, call
        `asNondeterministic` on the user defined function. E.g.:
    
    >>> from pyspark.sql.types import IntegerType
    >>> import random
    >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()
    
    .. note:: The user-defined functions do not support conditional expressions or short circuiting
        in boolean expressions and it ends up with being executed all internally. If the functions
        can fail on special rows, the workaround is to incorporate the condition into the functions.
    
    .. note:: The user-defined

In [21]:
# NOTE: this rebinds the name `mary_counter` from the plain function defined
# in an earlier cell to a Spark UDF object.
@F.udf(returnType=IntegerType())
def mary_counter(rhyme):
    """Count the words in ``rhyme`` that equal "mary", ignoring case.

    Words are separated by any run of whitespace. Only whole words match, so
    a token with attached punctuation such as "Mary," is not counted.

    Args:
        rhyme: The text of one row, or None when the cell is null.

    Returns:
        The number of matching words as an int, or None for a null input so
        that the row yields a null count instead of failing the job.
    """
    # Spark cannot short-circuit around a UDF, so the null check lives here.
    if rhyme is None:
        return None
    return sum(1 for word in rhyme.split() if word.lower() == "mary")


dataframe.withColumn("mary count", mary_counter("Nursery Rhyme")).show()

+---+--------------------+----------+
| id|       Nursery Rhyme|mary count|
+---+--------------------+----------+
|  0|Mary had a little...|         1|
|  1|It's fleece was w...|         0|
|  2|And everywhere Ma...|         1|
|  3|The lamb was sure...|         0|
+---+--------------------+----------+



In [11]:
# Tokenizer that reads the "Nursery Rhyme" column and writes its tokens to a
# new "words" column when transform() is called.
tokenizer = Tokenizer(
    inputCol="Nursery Rhyme",
    outputCol="words",
)
tokenizer

Tokenizer_c9dc6e547fa3

In [9]:
# Create a function to return the length of a list
@F.udf(returnType=IntegerType())
def word_list_length(word_list):
    """Return the number of elements in ``word_list``.

    Args:
        word_list: A list of tokens for one row, or None when the cell is null.

    Returns:
        ``len(word_list)`` as an int, or None for a null input so that the
        row yields a null length instead of raising TypeError.
    """
    # Spark cannot short-circuit around a UDF, so the null check lives here.
    if word_list is None:
        return None
    return len(word_list)

In [14]:
# Run the tokenizer over the DataFrame to add the "words" column
tokenized = tokenizer.transform(dataframe)

# Keep only the rhyme and its tokens, add the UDF-computed token count,
# and print full cell contents (no truncation)
(
    tokenized
    .select("Nursery Rhyme", "words")
    .withColumn("tokens", word_list_length(F.col("words")))
    .show(truncate=False)
)

+-----------------------------+------------------------------------+------+
|Nursery Rhyme                |words                               |tokens|
+-----------------------------+------------------------------------+------+
|Mary had a little lamb       |[mary, had, a, little, lamb]        |5     |
|It's fleece was white as snow|[it's, fleece, was, white, as, snow]|6     |
|And everywhere Mary went     |[and, everywhere, mary, went]       |4     |
|The lamb was sure to go      |[the, lamb, was, sure, to, go]      |6     |
+-----------------------------+------------------------------------+------+



In [16]:
# User-Defined Functions are not optimized
# Always try using the built-in Spark functions before writing your own
# pyspark.sql.functions.size also counts the length of a list
(
    tokenized
    .select("Nursery Rhyme", "words")
    .withColumn("tokens", F.size("words"))
    .show(truncate=False)
)

+-----------------------------+------------------------------------+------+
|Nursery Rhyme                |words                               |tokens|
+-----------------------------+------------------------------------+------+
|Mary had a little lamb       |[mary, had, a, little, lamb]        |5     |
|It's fleece was white as snow|[it's, fleece, was, white, as, snow]|6     |
|And everywhere Mary went     |[and, everywhere, mary, went]       |4     |
|The lamb was sure to go      |[the, lamb, was, sure, to, go]      |6     |
+-----------------------------+------------------------------------+------+

