In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, if one is already running) the Spark session named 'nlp'
# that backs every DataFrame built in this notebook.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

## Tokenizer and tokenization
### Tokenization
Tokenization is the process of taking text, such as a sentence, and breaking it into individual terms (usually words).

The simple **Tokenizer** class provides this functionality.

### Regular expression tokenizer
The **RegexTokenizer** class allows more advanced tokenization based on a regular expression.

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [4]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [5]:
# Toy corpus: two space-separated sentences and one comma-separated sentence.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [7]:
# Preview the raw sentences; long cell values are truncated in the printout.
sen_df.show(truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



### Create the tokenizer objects

In [8]:
# Basic tokenizer: reads the 'sentence' column and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [10]:
# Regex-based tokenizer: same input/output columns as the basic tokenizer,
# configured with the non-word-character pattern \W.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [11]:
def _count_tokens(words):
    """Return the number of tokens in ``words``.

    A null array cell reaches a Python UDF as ``None``. Return ``None``
    (SQL NULL) for it instead of letting ``len(None)`` raise ``TypeError``
    and fail the whole job.
    """
    return None if words is None else len(words)


# Column-level UDF mapping an array-of-tokens column to its integer length.
count_tokens = udf(_count_tokens, IntegerType())

In [12]:
# Apply the basic tokenizer; the result keeps the input columns and adds 'words'.
tokenized = tokenizer.transform(dataset=sen_df)

In [13]:
# Inspect the tokens produced by the basic tokenizer (long values are truncated).
tokenized.show(truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [14]:
# Add a per-row token count next to the basic tokenizer's output and display it.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [15]:
# Apply the regex tokenizer to the same sentences for comparison.
rg_tokenized = regex_tokenizer.transform(dataset=sen_df)

In [16]:
# Add a per-row token count next to the regex tokenizer's output and display it.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



### Stop word removal
A stop word is a word that should be removed from the input, typically because it appears frequently and doesn't carry much meaning.

In [17]:
from pyspark.ml.feature import StopWordsRemover

In [18]:
# Already-tokenized sentences used as input for stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [19]:
# Preview the token arrays before any filtering (long values are truncated).
sentenceDataFrame.show(truncate=True)

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [20]:
# Stop-word filter: reads 'tokens' and writes the surviving words to 'filtered'.
# No custom stop-word list is passed, so the transformer's default list applies.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [21]:
# Run the stop-word filter and show the original and filtered tokens side by side.
(
    remover
    .transform(sentenceDataFrame)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



### N-gram
An n-gram is a sequence of tokens (typically words) whose length is some integer.
<br> That is, it is a sequence of N tokens for some integer N,
<br> and the **NGram** class can be used to transform input features into n-grams.

NGram takes a sequence of strings as input (for example, the output of a tokenizer). The parameter `n` determines the number of terms in each n-gram. The output consists of a sequence of these n-grams, where each n-gram is represented by a space-delimited string of `n` consecutive words.

In [22]:
from pyspark.ml.feature import NGram

In [23]:
# Pre-tokenized sentences used as input for the n-gram transformer.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [29]:
# Bigram transformer: groups each run of 2 consecutive words into one string.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [30]:
# Show only the generated 'grams' column, untruncated so every n-gram is readable.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [27]:
# Trigram transformer: groups each run of 3 consecutive words into one string.
# NOTE: this rebinds the name `ngram`, so cells that use it depend on run order.
ngram = NGram(
    n=3,
    inputCol='words',
    outputCol='grams',
)

In [28]:
# Show only the generated 'grams' column, untruncated so every n-gram is readable.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+--------------------------------------------------------------------------------+
|grams                                                                           |
+--------------------------------------------------------------------------------+
|[Hi I heard, I heard about, heard about Spark]                                  |
|[I wish Java, wish Java could, Java could use, could use case, use case classes]|
|[Logistic regression models, regression models are, models are neat]            |
+--------------------------------------------------------------------------------+

