In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Get or create the notebook's SparkSession, registered under the app name "Tokens"
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Tokens")
    .getOrCreate()
)

In [0]:
# Import the Tokenizer library
from pyspark.ml.feature import Tokenizer

In [4]:
# Build a three-row sample DataFrame: an integer "id" and one "sentence" per row
dataframe = spark.createDataFrame(
    [
        (0, "Spark is great."),
        (1, "We are learning Spark."),
        (2, "Spark is better than Hadoop no doubt."),
    ],
    ["id", "sentence"],
)

# Preview the rows (show() truncates long cell values by default)
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|     Spark is great.|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+



In [5]:
# Configure a Tokenizer that reads the "sentence" column and writes tokens to "words"
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
# Bare last expression so the notebook echoes the Tokenizer's repr
tokenizer

Tokenizer_34ff5c4736b8

The `Tokenizer` is configured with an input column and an output column. `inputCol` is the name of the column we want tokenized, and `outputCol` is the name to give the new column that will hold the resulting tokens.

In [6]:
# Apply the tokenizer to the sample DataFrame; the result gains a "words" column
tokenized_df = tokenizer.transform(dataframe)
# Display every column in full rather than truncating long values
tokenized_df.show(truncate=False)

+---+-------------------------------------+---------------------------------------------+
|id |sentence                             |words                                        |
+---+-------------------------------------+---------------------------------------------+
|0  |Spark is great.                      |[spark, is, great.]                          |
|1  |We are learning Spark.               |[we, are, learning, spark.]                  |
|2  |Spark is better than Hadoop no doubt.|[spark, is, better, than, hadoop, no, doubt.]|
+---+-------------------------------------+---------------------------------------------+



**User-defined functions (UDFs)** are functions created by the user to add custom output columns.

In [0]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection (for example a list of tokens), or
            ``None``.

    Returns:
        ``len(word_list)``, or ``None`` when ``word_list`` is ``None``.
    """
    # When this function is wrapped as a Spark UDF, SQL NULL values arrive as
    # None. Returning None keeps the result NULL instead of failing the job
    # with a TypeError from len(None).
    if word_list is None:
        return None
    return len(word_list)

In [0]:
# Import the udf function, the col function, and IntegerType
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

Import the `udf` function, the `col` function (used to select the column that is passed into a function), and the `IntegerType` type, which our UDF will use to declare the data type of its output.

In [0]:
# Wrap word_list_length as a Spark UDF whose declared return type is an integer
count_tokens = udf(
    f=word_list_length,
    returnType=IntegerType(),
)

`udf` takes the function to wrap and the data type of its output, which here is the `IntegerType` that was just imported.

In [0]:
# Create tokenizer (same input/output columns as the tokenizer built earlier in this notebook)
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

In [0]:
# Transform the DataFrame; the tokens land in the "words" column (this cell does not display the result)
tokenized_df = tokenizer.transform(dataframe)

In [12]:
# Add a "tokens" column holding each row's word count, then display all
# columns without truncating long values
(
    tokenized_df
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+---+-------------------------------------+---------------------------------------------+------+
|id |sentence                             |words                                        |tokens|
+---+-------------------------------------+---------------------------------------------+------+
|0  |Spark is great.                      |[spark, is, great.]                          |3     |
|1  |We are learning Spark.               |[we, are, learning, spark.]                  |4     |
|2  |Spark is better than Hadoop no doubt.|[spark, is, better, than, hadoop, no, doubt.]|7     |
+---+-------------------------------------+---------------------------------------------+------+

