In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [3]:
# Toy corpus: two space-separated sentences and one comma-separated one.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I hear about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Defaults spelled out: print up to 20 rows and truncate long cell values.
sen_df.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I hear about S...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [5]:
# Plain tokenizer: the comma-separated sentence stays a single token (see the output below).
token = Tokenizer(inputCol='sentence', outputCol='words')

# Regex tokenizer: every non-word character (\W) is a delimiter, so commas split too.
regextoken = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern=r'\W',
)

In [6]:
def count_token(words):
    """Return a Column with the number of elements in an array column.

    Drop-in replacement for the previous Python UDF
    ``udf(lambda words: len(words), IntegerType())``: it is still called as
    ``count_token(col('words'))`` and still produces an integer column, but it
    delegates to Spark's built-in ``size`` function so the count is evaluated
    by Spark itself instead of shipping every row to a Python worker.

    Args:
        words: Column (or column name) holding an array of tokens.

    Returns:
        Column: the integer length of each row's array.

    Note:
        For a null array ``size`` returns -1 or null depending on the Spark
        configuration, whereas the old lambda would have raised on ``None``.
    """
    return size(words)

In [7]:
# Run the plain Tokenizer; despite its name, `tokenizer` is the resulting DataFrame
# (the input columns plus 'words').
tokenizer = token.transform(dataset=sen_df)
tokenizer.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I hear about S...|[hi, i, hear, abo...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [8]:
# Append the per-row token count in a column named 'token'.
# Row 2 counts as 1 because the plain Tokenizer did not split on commas.
tokenized = tokenizer.withColumn(
    'token',
    count_token(col('words')),
)
tokenized.show(n=20, truncate=True)

+---+--------------------+--------------------+-----+
| id|            sentence|               words|token|
+---+--------------------+--------------------+-----+
|  0|Hi I hear about S...|[hi, i, hear, abo...|    5|
|  1|I wish java could...|[i, wish, java, c...|    7|
|  2|Logistic,regressi...|[logistic,regress...|    1|
+---+--------------------+--------------------+-----+



In [9]:
# Run the regex tokenizer; `retokenizer` is the resulting DataFrame,
# in which the comma-separated sentence is split into separate words.
retokenizer = regextoken.transform(dataset=sen_df)
retokenizer.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I hear about S...|[hi, i, hear, abo...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [10]:
# Same token count as before, now on the regex-tokenized words:
# row 2 yields 5 tokens instead of 1.
retokenized = retokenizer.withColumn(
    'token',
    count_token(col('words')),
)
retokenized.show(n=20, truncate=True)

+---+--------------------+--------------------+-----+
| id|            sentence|               words|token|
+---+--------------------+--------------------+-----+
|  0|Hi I hear about S...|[hi, i, hear, abo...|    5|
|  1|I wish java could...|[i, wish, java, c...|    7|
|  2|Logistic,regressi...|[logistic, regres...|    5|
+---+--------------------+--------------------+-----+



# Removing common words (stop words) from a tokenized DataFrame

In [11]:
from pyspark.ml.feature import StopWordsRemover

In [12]:
# Create a new PySpark DataFrame whose 'tokens' column already holds word lists.
dataframe = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Marry', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)
dataframe.show(n=20, truncate=True)

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Marry, had, a, l...|
+---+--------------------+



In [13]:
# Filter 'tokens' with the remover's default stop-word list;
# the surviving words are written to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)
remover.transform(dataset=dataframe).show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Marry, had, a, l...|[Marry, little, l...|
+---+--------------------+--------------------+



# N-grams: sequences of n consecutive tokens, for some integer n

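n-grams:
`NGram` takes a sequence of tokens (for example, the output of a tokenizer) as input and produces n-grams from it, where each n-gram is a space-delimited string of n consecutive tokens. For instance, with n = 2 the tokens `[hi, i, hear]` become `[hi i, i hear]`.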

In [14]:
from pyspark.ml.feature import NGram

In [15]:
# Recap of the regex-tokenized frame that feeds the n-gram step below.
retokenized.show(n=20, truncate=True)

+---+--------------------+--------------------+-----+
| id|            sentence|               words|token|
+---+--------------------+--------------------+-----+
|  0|Hi I hear about S...|[hi, i, hear, abo...|    5|
|  1|I wish java could...|[i, wish, java, c...|    7|
|  2|Logistic,regressi...|[logistic, regres...|    5|
+---+--------------------+--------------------+-----+



In [19]:
# Bigram transformer: reads the 'words' arrays and writes the 2-grams to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [21]:
# Apply the bigram transformer to the regex-tokenized frame.
noutput = ngram.transform(dataset=retokenized)

In [23]:
# Keep only the n-gram column and print it without truncation.
output = noutput.select(noutput['grams'])
output.show(n=20, truncate=False)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[hi i, i hear, hear about, about spark]                           |
|[i wish, wish java, java could, could use, use case, case classes]|
|[logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



# Feature Extractors

<h2 id="tf-idf">TF-IDF</h2>

<p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-inverse document frequency (TF-IDF)</a>
is a feature vectorization method widely used in text mining to reflect the importance of a term
to a document in the corpus. Denote a term by $t$, a document by $d$, and the corpus by $D$.
Term frequency $TF(t, d)$ is the number of times that term $t$ appears in document $d$, while
document frequency $DF(t, D)$ is the number of documents that contain term $t$. If we only use
term frequency to measure the importance, it is very easy to over-emphasize terms that appear very
often but carry little information about the document, e.g. &#8220;a&#8221;, &#8220;the&#8221;, and &#8220;of&#8221;. If a term appears
very often across the corpus, it means it doesn&#8217;t carry special information about a particular document.
Inverse document frequency is a numerical measure of how much information a term provides:</p>

$$ IDF(t, D) = \log \frac{|D| + 1}{DF(t, D) + 1} $$

where $|D|$ is the total number of documents in the corpus. Since a logarithm is used, if a term
appears in all documents, its IDF value becomes 0. Note that a smoothing term (the $+1$ in both the numerator and the denominator) is applied to avoid
dividing by zero for terms outside the corpus. The TF-IDF measure is simply the product of TF and IDF:
$$ TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). $$


In [24]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Labelled toy sentences used as the corpus for the TF-IDF example.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

sentenceData.show(n=20, truncate=True)

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [25]:
# NOTE: this rebinds `tokenizer`, which in cell In [7] held a DataFrame;
# from here on it is a Tokenizer writing its output to the 'token' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='token',
)
words_data = tokenizer.transform(dataset=sentenceData)
words_data.show(n=20, truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |token                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [29]:
# Hash each token list into a sparse term-frequency vector stored in 'rawFeatures'.
hashingtf = HashingTF(
    inputCol='token',
    outputCol='rawFeatures',
)
featurized_data = hashingtf.transform(dataset=words_data)

In [31]:
# IDF is fitted on the term-frequency vectors first; the fitted model then
# rescales those same vectors into the 'features' column.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idf_model = idf.fit(dataset=featurized_data)

# Despite its name, `rescaled_model` is the transformed DataFrame, not a model.
rescaled_model = idf_model.transform(dataset=featurized_data)
rescaled_model.select('label', 'features').show(n=20, truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0.0  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

## CountVectorizer
CountVectorizer and CountVectorizerModel aim to help convert a collection of text documents to vectors of token counts. When an a-priori dictionary is not available, CountVectorizer can be used as an Estimator to extract the vocabulary, and generates a CountVectorizerModel. The model produces sparse representations for the documents over the vocabulary, which can then be passed to other algorithms like LDA.

During the fitting process, CountVectorizer will select the top vocabSize words ordered by term frequency across the corpus. An optional parameter minDF also affects the fitting process by specifying the minimum number (or fraction if < 1.0) of documents a term must appear in to be included in the vocabulary. Another optional binary toggle parameter controls the output vector. If set to true all nonzero counts are set to 1. This is especially useful for discrete probabilistic models that model binary, rather than integer, counts.

In [32]:
from pyspark.ml.feature import CountVectorizer

In [33]:
# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

# Estimator settings: a vocabulary of at most 3 terms, each of which must
# appear in at least 2 documents.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

# Fit a CountVectorizerModel from the corpus, then vectorize the same rows.
model = cv.fit(dataset=df)
result = model.transform(dataset=df)
result.show(n=20, truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

