In [None]:
# Optional one-time environment setup, kept commented out so it does not run
# on every execution. Uncomment when Java/Spark are not installed yet.
# Steps: install OpenJDK 11, download and unpack Spark 3.3.0 (Hadoop 3 build),
# install findspark, then point JAVA_HOME and SPARK_HOME at those installs.
# NOTE(review): the /content path suggests a Colab-style runtime - adjust the
# paths for other environments.
# !apt update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# !tar -xvf spark-3.3.0-bin-hadoop3.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"

In [1]:
# Initialise findspark before the pyspark imports in the following cell.
# Presumably this locates the Spark installation (e.g. via SPARK_HOME) so that
# `pyspark` becomes importable - confirm for your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF, Tokenizer
from pyspark.ml.feature import NGram

In [3]:
# Start (or reuse) the Spark session shared by every example in this notebook.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

## Tokenizer

In [4]:
# Three sample sentences: two are space-separated, the last one is
# comma-separated with no spaces at all.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I know Spark can work well with NLP"),
        (2, "Logistic,regression,models,are,supervised"),
    ],
    schema=["id", "sentence"],
)

In [5]:
# truncate=False prints every sentence in full instead of cutting long values short.
sentenceDataFrame.show(truncate=False)

+---+-----------------------------------------+
|id |sentence                                 |
+---+-----------------------------------------+
|0  |Hi I heard about Spark                   |
|1  |I know Spark can work well with NLP      |
|2  |Logistic,regression,models,are,supervised|
+---+-----------------------------------------+



In [6]:
# Plain tokenizer: in the recorded output it lowercases the text and leaves the
# comma-separated sentence as a single token.
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")

# Regex tokenizer: the pattern "\\W" treats every non-word character as a separator.
# Alternative: pattern="\\w+" together with gaps=False to match the tokens themselves.
regexTokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

# UDF returning the number of elements in an array column.
countTokens = udf(lambda token_list: len(token_list), returnType=IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+-----------------------------------------+--------------------------------------------+------+
|sentence                                 |words                                       |tokens|
+-----------------------------------------+--------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]|8     |
|Logistic,regression,models,are,supervised|[logistic,regression,models,are,supervised] |1     |
+-----------------------------------------+--------------------------------------------+------+



In [7]:
# Same token count, this time on the regex tokenizer's output: the
# comma-separated sentence is split into individual words.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+-----------------------------------------+-----------------------------------------------+------+
|sentence                                 |words                                          |tokens|
+-----------------------------------------+-----------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                   |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]   |8     |
|Logistic,regression,models,are,supervised|[logistic, regression, models, are, supervised]|5     |
+-----------------------------------------+-----------------------------------------------+------+



## StopWordsRemover

In [None]:
# Example rows: an id plus an already-tokenized sentence. Each row is its own
# tuple (comma-separated) and the column names are passed as the schema.
sentenceData = spark.createDataFrame([
    (0, ['I', 'go', 'to', 'school', 'by', 'bus']),
    (1, ['I', 'go', 'to', 'school', 'by', 'bus'])
], ["id", "raw"])

# Remove stop words from the "raw" column and write the result to "filtered".
# StopWordsRemover is already imported in the notebook's import cell.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)

## N-Gram

In [8]:
# Pre-tokenized sentences (original casing kept) used as input for the n-gram example.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "know", "Spark", "can", "work", "well", "with", "NLP"]),
        (2, ["Logistic", "regression", "models", "are", "supervised"]),
    ],
    schema=["id", "words"],
)

In [9]:
# n=2 produces bigrams: in the recorded output each element is two consecutive
# words joined by a single space.
ngram = NGram(inputCol="words", outputCol="ngrams", n=2)

ngramDataFrame = ngram.transform(wordDataFrame)
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

+-------------------------------------------------------------------------+
|ngrams                                                                   |
+-------------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                                |
|[I know, know Spark, Spark can, can work, work well, well with, with NLP]|
|[Logistic regression, regression models, models are, are supervised]     |
+-------------------------------------------------------------------------+



## CountVectorizer

In [10]:
# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
        (2, ["a", "b", "d", "d", "a", "c", "c"]),
    ],
    schema=["id", "words"],
)

In [11]:
# Fit a CountVectorizerModel on the corpus, keeping a vocabulary of at most
# 4 terms, then turn each bag of words into a sparse count vector.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=4,
    minDF=1,
)

model = cv.fit(df)
result = model.transform(df)
result.show(truncate=False)

+---+---------------------+-------------------------------+
|id |words                |features                       |
+---+---------------------+-------------------------------+
|0  |[a, b, c]            |(4,[0,1,2],[1.0,1.0,1.0])      |
|1  |[a, b, b, c, a]      |(4,[0,1,2],[2.0,1.0,2.0])      |
|2  |[a, b, d, d, a, c, c]|(4,[0,1,2,3],[2.0,2.0,1.0,2.0])|
+---+---------------------+-------------------------------+



## TF-IDF

In [12]:
# Small labelled corpus; only the last sentence contains the word "d".
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "a b c"),
        (0.0, "a b c a"),
        (1.0, "a b d d a c c"),
    ],
    schema=["label", "sentence"],
)
sentenceData.show(truncate=False)

+-----+-------------+
|label|sentence     |
+-----+-------------+
|0.0  |a b c        |
|0.0  |a b c a      |
|1.0  |a b d d a c c|
+-----+-------------+



In [13]:
from pyspark.ml.feature import IDF, Tokenizer

# Rebuild the labelled corpus and display it.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "a b c"),
        (0.0, "a b c a"),
        (1.0, "a b d d a c c"),
    ],
    schema=["label", "sentence"],
)
sentenceData.show(truncate=False)

# Split each sentence into a "words" array column.
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")
wordsData = tokenizer.transform(sentenceData)
wordsData.show(truncate=False)

+-----+-------------+
|label|sentence     |
+-----+-------------+
|0.0  |a b c        |
|0.0  |a b c a      |
|1.0  |a b d d a c c|
+-----+-------------+

+-----+-------------+---------------------+
|label|sentence     |words                |
+-----+-------------+---------------------+
|0.0  |a b c        |[a, b, c]            |
|0.0  |a b c a      |[a, b, c, a]         |
|1.0  |a b d d a c c|[a, b, d, d, a, c, c]|
+-----+-------------+---------------------+



In [14]:
# Term frequencies: count every vocabulary word per sentence.
# vocabSize is the maximum number of features to keep (i.e. the number of columns).
cv_TF = CountVectorizer(
    inputCol="words",
    outputCol="rawFeatures",
    vocabSize=4,
    minDF=1,
)
model_cv_TF = cv_TF.fit(wordsData)
featurizedData = model_cv_TF.transform(wordsData)
featurizedData.show(truncate=False)

# Inverse document frequency: rescale the raw counts. In the recorded output,
# words that occur in every sentence get weight 0.0 and only "d" stays non-zero.
idf = IDF(outputCol="features", inputCol="rawFeatures")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
(
    rescaledData
    .select("label", "features")
    .show(truncate=False)
)

+-----+-------------+---------------------+-------------------------------+
|label|sentence     |words                |rawFeatures                    |
+-----+-------------+---------------------+-------------------------------+
|0.0  |a b c        |[a, b, c]            |(4,[0,1,2],[1.0,1.0,1.0])      |
|0.0  |a b c a      |[a, b, c, a]         |(4,[0,1,2],[2.0,1.0,1.0])      |
|1.0  |a b d d a c c|[a, b, d, d, a, c, c]|(4,[0,1,2,3],[2.0,2.0,1.0,2.0])|
+-----+-------------+---------------------+-------------------------------+

+-----+----------------------------------------------+
|label|features                                      |
+-----+----------------------------------------------+
|0.0  |(4,[0,1,2],[0.0,0.0,0.0])                     |
|0.0  |(4,[0,1,2],[0.0,0.0,0.0])                     |
|1.0  |(4,[0,1,2,3],[0.0,0.0,0.0,1.3862943611198906])|
+-----+----------------------------------------------+

