In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# Tell the process where the Java 8 runtime and the unpacked Spark distribution live
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
    }
)

In [0]:
# Initialise findspark so that pyspark can be imported from this kernel
# (this cell does not start a SparkSession).
import findspark
# No path argument is passed — presumably findspark falls back to the
# SPARK_HOME environment variable; confirm against the findspark docs.
findspark.init()

In [0]:
# Obtain the SparkSession for this notebook under the application name "Tokens"
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Tokens")
    .getOrCreate()
)

In [0]:
# Import tokenizer library
from pyspark.ml.feature import Tokenizer

In [11]:
# Build a small demo DataFrame: one (id, sentence) row per example
dataframe = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "I am learning Spark"),
        (2, "Spark is better than Hadoop"),
    ],
    schema=["id", "sentence"],
)

# Preview the rows; with default arguments show() shortens long strings
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1| I am learning Spark|
|  2|Spark is better t...|
+---+--------------------+



In [12]:
# Configure a Tokenizer that reads the "sentence" column and writes to "words"
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
# Bare expression: the notebook displays the transformer's repr
tokenizer

Tokenizer_a978056ecce8

In [13]:
# Apply the tokenizer to the sample DataFrame; the result carries an extra
# "words" column (in the output below the tokens appear lower-cased).
tokenized_df = tokenizer.transform(dataframe)
# truncate=False prints full cell contents instead of shortening them
tokenized_df.show(truncate=False)

+---+---------------------------+---------------------------------+
|id |sentence                   |words                            |
+---+---------------------------+---------------------------------+
|0  |Spark is great             |[spark, is, great]               |
|1  |I am learning Spark        |[i, am, learning, spark]         |
|2  |Spark is better than Hadoop|[spark, is, better, than, hadoop]|
+---+---------------------------+---------------------------------+



In [0]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection (for example a list of tokens), or
            ``None``.

    Returns:
        ``len(word_list)``, or ``None`` when ``word_list`` is ``None``.
    """
    # When this function backs a Spark UDF, null column values arrive as None;
    # propagate the null instead of letting len(None) raise TypeError.
    if word_list is None:
        return None
    return len(word_list)

In [0]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [0]:
# Wrap word_list_length as a Spark UDF that yields an integer column
count_tokens = udf(f=word_list_length, returnType=IntegerType())

In [18]:
# Build the tokenizer ("sentence" in, "words" out)
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

# Tokenize every sentence of the sample DataFrame
tokenized = tokenizer.transform(dataframe)

# Append a "tokens" column holding each row's word count, then print untruncated
(
    tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+---+---------------------------+---------------------------------+------+
|id |sentence                   |words                            |tokens|
+---+---------------------------+---------------------------------+------+
|0  |Spark is great             |[spark, is, great]               |3     |
|1  |I am learning Spark        |[i, am, learning, spark]         |4     |
|2  |Spark is better than Hadoop|[spark, is, better, than, hadoop]|5     |
+---+---------------------------+---------------------------------+------+

