# Tokenization
Tokenization is an algorithm (or set of algorithms) for splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words, bigrams, or terms.

Each of these smaller units is called a token. For example, a lexical analyzer (the first stage of a compiler) breaks source code into a series of tokens, discarding whitespace and comments along the way.

Therefore, tokenization is the process of splitting a string into words, symbols, or any other meaningful tokens (such as bigrams or n-grams).

In [2]:
from pyspark.sql import SparkSession
# Get the active Spark session or create one, registered under this
# notebook's application name.
spark = (
    SparkSession.builder
    .appName("pyspark-ml-tokenizer")
    .getOrCreate()
)

In [3]:
# Two small sample documents as (id, text) rows.
docs = [
    (1, "a Fox jumped over FOX"),
    (2, "RED of fox jumpled"),
]
df = spark.createDataFrame(docs, schema=["id", "text"])

# truncate=False prints the full text column instead of cutting long values.
df.show(truncate=False)

+---+---------------------+
|id |text                 |
+---+---------------------+
|1  |a Fox jumped over FOX|
|2  |RED of fox jumpled   |
+---+---------------------+



In [7]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# UDF that returns the length of a token list as an integer column.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

# Read the "text" column and write the resulting tokens to a new "tokens" column.
tokenizer = Tokenizer(outputCol="tokens", inputCol="text")
tokenized = tokenizer.transform(df)

# Show each text next to its tokens and the number of tokens produced.
(
    tokenized
    .select("text", "tokens")
    .withColumn("tokens_length", countTokens(col("tokens")))
    .show(truncate=False)
)

+---------------------+---------------------------+-------------+
|text                 |tokens                     |tokens_length|
+---------------------+---------------------------+-------------+
|a Fox jumped over FOX|[a, fox, jumped, over, fox]|5            |
|RED of fox jumpled   |[red, of, fox, jumpled]    |4            |
+---------------------+---------------------------+-------------+



# RegexTokenizer

In [8]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# Regex-driven tokenizer using the non-word-character pattern "\W";
# minTokenLength=3 drops tokens shorter than three characters.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern="\\W",
    minTokenLength=3,
)
regex_tokenized = regexTokenizer.transform(df)

# countTokens is the token-counting UDF defined in an earlier cell.
(
    regex_tokenized
    .select("text", "tokens")
    .withColumn("tokens_length", countTokens(col("tokens")))
    .show(truncate=False)
)

+---------------------+------------------------+-------------+
|text                 |tokens                  |tokens_length|
+---------------------+------------------------+-------------+
|a Fox jumped over FOX|[fox, jumped, over, fox]|4            |
|RED of fox jumpled   |[red, fox, jumpled]     |3            |
+---------------------+------------------------+-------------+



# Tokenization with Pipeline

In [9]:
# Sample documents containing punctuation, used by the pipeline cell below.
# NOTE: this rebinds `docs` and `df` from the earlier cells.
docs = [
    (1, "a Fox jumped, over, the fence?"),
    (2, "a RED, of fox?"),
]
df = spark.createDataFrame(docs, schema=["id", "text"])

# truncate=False prints the full text column instead of cutting long values.
df.show(truncate=False)

+---+------------------------------+
|id |text                          |
+---+------------------------------+
|1  |a Fox jumped, over, the fence?|
|2  |a RED, of fox?                |
+---+------------------------------+



In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover

# Stage 1: regex tokenizer whose pattern matches runs of punctuation and/or
# whitespace ("text" -> "text2").
tk = RegexTokenizer(
    inputCol="text",
    outputCol="text2",
    pattern=r'(?:\p{Punct}|\s)+',
)

# Stage 2: remove stop words from the token list ("text2" -> "text3").
sw = StopWordsRemover(inputCol="text2", outputCol="text3")

# Chain both stages, fit on df, then apply the fitted pipeline to the same frame.
pipeline = Pipeline(stages=[tk, sw])
df4 = pipeline.fit(df).transform(df)
df4.show(truncate=False)

+---+------------------------------+----------------------------------+--------------------+
|id |text                          |text2                             |text3               |
+---+------------------------------+----------------------------------+--------------------+
|1  |a Fox jumped, over, the fence?|[a, fox, jumped, over, the, fence]|[fox, jumped, fence]|
|2  |a RED, of fox?                |[a, red, of, fox]                 |[red, fox]          |
+---+------------------------------+----------------------------------+--------------------+

