# NLP on Spark

In [36]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import pyspark.ml.feature as pmf
from pyspark.ml.clustering import LDA

In [4]:
# Reuse the running SparkContext when this cell is executed again; building a
# second context in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
spark = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("sqlContext")
)
# SQL entry point used by every cell below to build and read DataFrames.
sql = SQLContext(spark)

## Tf-Idf

In [26]:
# Toy labelled corpus: one (label, sentence) pair per row.
sentenceData = sql.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    ["label", "sentence"],
)

# Split each sentence into a "words" array column.
tokenizer = pmf.Tokenizer(inputCol="sentence", outputCol="words")
words = tokenizer.transform(sentenceData)

# Hash the tokens into a 20-dimensional term-frequency vector.
# HashingTF is reached through the `pmf` alias: only the module alias is
# imported, so the bare name would raise NameError on a fresh kernel.
hashingTF = pmf.HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
features = hashingTF.transform(words)
features.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[6,8,13,16],[...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+



In [27]:
# Fit the IDF estimator on the "rawFeatures" column, then write the
# rescaled vectors to a new "features" column.
idf = pmf.IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(features)
rescaledData = idfModel.transform(features)

# Display only the first row's label and rescaled vector.
rescaledData.select(["label", "features"]).take(1)

[Row(label=0.0, features=SparseVector(20, {6: 0.2877, 8: 0.6931, 13: 0.2877, 16: 0.5754}))]

## Embeddings

In [28]:
# Each row holds one pre-tokenised document in a single "text" column.
doc = sql.createDataFrame(
    [
        ("Hi I heard about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    ["text"],
)

# Fit a Word2Vec model with 3-dimensional vectors and write the transformed
# output to a "result" column.
w2v = pmf.Word2Vec(
    inputCol="text",
    outputCol="result",
    vectorSize=3,
    minCount=0,
)
w2v_fit = w2v.fit(doc)
res = w2v_fit.transform(doc)

# Rows have exactly two fields (text, result), so unpack them in the loop header.
for text, vector in res.collect():
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.08372311890125275,-0.03666453883051873,0.027572209388017657]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.03136832533138139,0.026593888444559913,-0.049365587665566375]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.02151056742295623,-0.008449985273182392,-0.012495249882340432]



#### CountVectorizer

In [29]:
# Two tiny tokenised documents keyed by id.
df = sql.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    ["id", "words"],
)

# Fit a CountVectorizer (vocabSize=3, minDF=2.0) and write the per-document
# term counts to a "features" column.
cv = pmf.CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)
cv_fit = cv.fit(df)

res = cv_fit.transform(df)
res.show(truncate=False)
# The bare last expression renders the first Row as the cell output.
res.take(1)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



[Row(id=0, words=['a', 'b', 'c'], features=SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}))]

#### Feature Hasher

In [30]:
# Mixed-type rows: a float, a boolean, a numeric string and a plain string.
df = sql.createDataFrame(
    [
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo"),
    ],
    ["real", "bool", "stringNum", "string"],
)

# Hash all four columns into a single sparse "features" vector.
fh = pmf.FeatureHasher(
    inputCols=["real", "bool", "stringNum", "string"],
    outputCol="features",
)

df_hash = fh.transform(df)
df_hash.show(truncate=False)

+----+-----+---------+------+--------------------------------------------------------+
|real|bool |stringNum|string|features                                                |
+----+-----+---------+------+--------------------------------------------------------+
|2.2 |true |1        |foo   |(262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0])|
|3.3 |false|2        |bar   |(262144,[70644,89673,173866,174475],[1.0,1.0,1.0,3.3])  |
|4.4 |false|3        |baz   |(262144,[22406,70644,174475,187923],[1.0,1.0,4.4,1.0])  |
|5.5 |false|4        |foo   |(262144,[70644,101499,174475,257907],[1.0,1.0,5.5,1.0]) |
+----+-----+---------+------+--------------------------------------------------------+



## Stop words

In [32]:
# Two pre-tokenised sentences in a "raw" column.
df = sql.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    ["id", "raw"],
)

# Run StopWordsRemover with its default settings; the result goes to "filtered".
sw = pmf.StopWordsRemover(inputCol="raw", outputCol="filtered")
df_sw = sw.transform(df)
df_sw.show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



## N-gram

In [35]:
# Three pre-tokenised sentences keyed by id.
df = sql.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    ["id", "words"],
)

# Build bigrams (n=2) from each token list into an "ngrams" column.
ng = pmf.NGram(
    inputCol="words",
    outputCol="ngrams",
    n=2,
)

df_ng = ng.transform(df)
df_ng.select("ngrams").show(truncate=False)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



## LDA

In [40]:
# Locate the sample data under data/mllib/ of a Spark installation.
# SPARK_HOME is honoured when set, so the notebook is not tied to one machine;
# otherwise fall back to the original local install location.
spark_home = os.environ.get("SPARK_HOME") or "D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7"
path = spark_home.rstrip("/\\") + "/data/mllib/"
df = sql.read.format("libsvm").load(path + "sample_lda_libsvm_data.txt")

# Fit a 10-topic LDA model, capped at 10 iterations.
lda = LDA(k=10, maxIter=10)
lda_fit = lda.fit(df)

# Evaluate the fitted model on the same corpus it was trained on.
ll = lda_fit.logLikelihood(df)
lp = lda_fit.logPerplexity(df)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -797.5200544004555
The upper bound on perplexity: 3.0673848246171365


In [42]:
# Top three terms for each learned topic.
topics = lda_fit.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Run the fitted model over the corpus and display the resulting rows.
transformed = lda_fit.transform(df)
transformed.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[1, 3, 4]  |[0.10368082673401043, 0.10246920001021176, 0.09945342991888821]|
|1    |[0, 5, 9]  |[0.1076176051626867, 0.09801051465962088, 0.09705006627909334] |
|2    |[5, 10, 9] |[0.09817190617101527, 0.0981118296756393, 0.09564406780739915] |
|3    |[5, 10, 2] |[0.10428733568856435, 0.10200642097504846, 0.09787247160826047]|
|4    |[5, 8, 2]  |[0.10610350651837704, 0.10225089640708551, 0.0969920925809682] |
|5    |[2, 1, 5]  |[0.1017814308587037, 0.09673789329662605, 0.09602700830858574] |
|6    |[3, 5, 4]  |[0.1616436533169385, 0.1391919780709568, 0.11423150148553422]  |
|7    |[8, 3, 5]  |[0.10449091906046457, 0.09702934195910436, 0.09685753582283695]|
|8    |[2, 10, 5] |[0.2048

## Credits & Links

- <http://spark.apache.org/docs/latest/ml-classification-regression.html>
- <http://spark.apache.org/docs/latest/ml-clustering.html>