### 1. Import libraries

In [4]:
from pyspark.sql import SparkSession

    # feature module
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

### 2. Start a Spark session

In [5]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

### 3. Create a sample dataframe

In [49]:
# A simple dataframe containing text data

sen_df = spark.createDataFrame([
    (0, 'Hi I heard about Spark'),
    (1, 'I wish java could use case classes'),
    (2, 'Logistic,regression,model,are,neat')
], ['id', 'sentence'])

sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



### 4.Tokenizer and Regex tokenizer

Tokenizer: A tokenizer converts the input string to lowercase and then splits it by white spaces

Regex tokenizer: A regex based tokenizer extracts tokens by using the provided regex pattern

In [37]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

tokenizer = Tokenizer(inputCol = 'sentence', outputCol = 'words')

In [13]:
tokenized = tokenizer.transform(sen_df)
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [14]:
tokenized.take(1)

[Row(id=0, sentence='Hi I heard about Spark', words=['hi', 'i', 'heard', 'about', 'spark'])]

In [17]:
count_tokens = udf(lambda words: len(words), IntegerType())
tokenized.withColumn('tokens', count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [40]:
regex_tokenizer = RegexTokenizer(inputCol = 'sentence', outputCol = 'words')
regex_tokenized = regex_tokenizer.transform(sen_df)
regex_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [53]:
regex_tokenized.select("words").take(3)

[Row(words=['hi', 'i', 'heard', 'about', 'spark']),
 Row(words=['i', 'wish', 'java', 'could', 'use', 'case', 'classes']),
 Row(words=['logistic', 'regression', 'model', 'are', 'neat'])]

\\W matches for any characters which is not a letter, digit or underscore

In [46]:
regex_tokenizer = RegexTokenizer(inputCol = 'sentence', outputCol = 'words', pattern = '\\W')
regex_tokenized = regex_tokenizer.transform(sen_df)
regex_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic&regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [None]:
# Changing tokenizing pattern

sen_df = spark.createDataFrame([
    (0, 'Hi I heard about Spark'),
    (1, 'I wish java could use case classes'),
    (2, 'Logistic&regression&model&are&neat')
], ['id', 'sentence'])

sen_df.show()

In [50]:
regex_tokenizer = RegexTokenizer(inputCol = 'sentence', outputCol = 'words', pattern = '\\W')
regex_tokenized = regex_tokenizer.transform(sen_df)
regex_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [52]:
regex_tokenized.select("words").take(3)

[Row(words=['hi', 'i', 'heard', 'about', 'spark']),
 Row(words=['i', 'wish', 'java', 'could', 'use', 'case', 'classes']),
 Row(words=['logistic', 'regression', 'model', 'are', 'neat'])]

In [44]:
regex_tokenized.withColumn('tokens', count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



### 5.Removing stop words

Stop words are a set of commonly used words in a language. Examples of stop words in English are “a,” “the,” “is,” “are,” etc. Stop words are commonly used in Text Mining and Natural Language Processing (NLP) to eliminate words that are so widely used that they carry very little useful information

In [21]:
from pyspark.ml.feature import StopWordsRemover

sentenceDataFrame = spark.createDataFrame([
    (0, ['I', 'saw', 'the', 'green', 'horse']),
    (1, ['Mary', 'had', 'a', 'little', 'lamb'])
], ['id', 'tokens'])

sentenceDataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [22]:
remover = StopWordsRemover(inputCol = 'tokens', outputCol = 'filtered')
remover.transform(sentenceDataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



### 6.Ngram

An n-gram is a collection of n successive items in a text document that may include words, numbers, symbols, and punctuation. N-gram models are useful in many text analytics applications where sequences of words are relevant, such as in sentiment analysis, text classification, and text generation. N-gram modeling is one of the many techniques used to convert text from an unstructured format to a structured forma

In [24]:
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame([
    (0, ['Hi', 'I', 'heard', 'about', 'Spark']), 
    (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
    (2, ['Logistic', 'regression', 'models', 'are', 'neat'])
], ['id', 'words'])

wordDataFrame.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[Hi, I, heard, ab...|
|  1|[I, wish, java, c...|
|  2|[Logistic, regres...|
+---+--------------------+



In [55]:
ngram = NGram(inputCol = 'words', outputCol = 'grams')
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi, I, heard, ab...|[Hi I, I heard, h...|
|  1|[I, wish, java, c...|[I wish, wish jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+



In [58]:
ngram.transform(wordDataFrame).select('grams').show(truncate=False)

+-------------------------------------------------------------------------------------+
|grams                                                                                |
+-------------------------------------------------------------------------------------+
|[Hi I heard about, I heard about Spark]                                              |
|[I wish java could, wish java could use, java could use case, could use case classes]|
|[Logistic regression models are, regression models are neat]                         |
+-------------------------------------------------------------------------------------+



In [57]:
ngram.setParams(n=4).transform(wordDataFrame).select('grams').show(truncate=False)

+-------------------------------------------------------------------------------------+
|grams                                                                                |
+-------------------------------------------------------------------------------------+
|[Hi I heard about, I heard about Spark]                                              |
|[I wish java could, wish java could use, java could use case, could use case classes]|
|[Logistic regression models are, regression models are neat]                         |
+-------------------------------------------------------------------------------------+



### 7.Hashing

Hashing is the process of converting data — text, numbers, files, or anything, really — into a fixed-length string of letters and numbers.

In [59]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceDataFrame = spark.createDataFrame([
    (0.0, 'Hi I heard about Spark'), 
    (0.0, 'I wish Java could use case classes'),
    (1.0, 'Logistic regression models are neat')
], ['label', 'sentence'])

sentenceDataFrame.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [60]:
tokenizer = Tokenizer(inputCol = 'sentence', outputCol = 'words')
word_df = tokenizer.transform(sentenceDataFrame)
word_df.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [61]:
hashing_tf = HashingTF(inputCol = 'words', outputCol = 'rawFeatures')
featurized_df = hashing_tf.transform(word_df)
featurized_df.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|
+-----+--------------------+--------------------+--------------------+



In [31]:
featurized_df.select("rawFeatures").take(1)

[Row(rawFeatures=SparseVector(262144, {24417: 1.0, 49304: 1.0, 73197: 1.0, 91137: 1.0, 234657: 1.0}))]

IDF function stands for inverse document frequency and calculates frequency of occurance of a term to the total number of terms in a corpus / collection

In [66]:
idf = IDF(inputCol = 'rawFeatures', outputCol = 'features')
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)
rescaled_df.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|(262144,[24417,49...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|(262144,[20719,24...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|(262144,[13671,91...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [67]:
rescaled_df.take(1)

[Row(label=0.0, sentence='Hi I heard about Spark', words=['hi', 'i', 'heard', 'about', 'spark'], rawFeatures=SparseVector(262144, {24417: 1.0, 49304: 1.0, 73197: 1.0, 91137: 1.0, 234657: 1.0}), features=SparseVector(262144, {24417: 0.2877, 49304: 0.6931, 73197: 0.6931, 91137: 0.6931, 234657: 0.6931}))]

### 8.Count vectorization

In [62]:
from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([
    (0, 'a b c'.split(' ')),
    (1, 'a b b c a'.split(' '))
], ['id', 'words'])

df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [63]:
cv = CountVectorizer(inputCol = 'words', outputCol = 'features', vocabSize = 3, minDF = 2.0)

In [64]:
cv_model = cv.fit(df)
cv_df = cv_model.transform(df)
cv_df.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

