In [1]:
import re

import pylab as pl
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id, udf
from pyspark.sql.types import ArrayType, IntegerType, StringType

In [2]:
def to_word(termIndices, vocab=None):
  """Translate vocabulary indices into the words they stand for.

  Args:
    termIndices: iterable of term IDs, each usable as an index into the
      vocabulary.
    vocab: optional indexable vocabulary (term ID -> word). When omitted the
      lookup goes through the global ``vocab_broadcast.value``, exactly as
      before, so existing single-argument callers are unaffected.

  Returns:
    list with one word per entry of ``termIndices``, in the same order.

  Raises:
    NameError: if ``vocab`` is omitted, ``termIndices`` is non-empty and the
      global ``vocab_broadcast`` has not been defined yet.
    IndexError / KeyError: if a term ID is not present in the vocabulary.
  """
  if vocab is None:
    # Default path: the global is only touched when there is something to look up.
    return [vocab_broadcast.value[termID] for termID in termIndices]
  return [vocab[termID] for termID in termIndices]

In [3]:
#Load your document dataframe here
#================your code here==================
# Input file: headerless CSV whose first column holds one document per row.
DATA_PATH = "stream_data.csv"

# Obtain the session through the builder API; getOrCreate() reuses an already
# running session/context instead of constructing a second one.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Build the raw document frame in a single expression so re-running the cell
# always starts from the file rather than from a previously mutated frame.
raw_df = (
    spark.read.format("csv")
    .option("header", "false")
    .load(DATA_PATH)
    .withColumnRenamed("_c0", "words")
    # Tokenizer is not expected to handle null text, so rows without a
    # document are dropped up front (no-op when the file has none).
    .na.drop(subset=["words"])
    # Unique (not necessarily consecutive) row id, used to identify documents.
    .withColumn("id", monotonically_increasing_id())
)

# Lower-cases and splits on whitespace; punctuation stays attached to tokens.
tokenizer = Tokenizer(inputCol="words", outputCol="token_words")
spark_df = tokenizer.transform(raw_df)
#==================================================
spark_df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/24 02:14:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/24 02:14:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+---+--------------------+
|               words| id|         token_words|
+--------------------+---+--------------------+
|I absolutely ADOR...|  0|[i, absolutely, a...|
|Java Vs Python Fo...|  1|[java, vs, python...|
|voulu un grec pui...|  2|[voulu, un, grec,...|
|Pareil Il pris de...|  3|[pareil, il, pris...|
|Music Academy Blo...|  4|[music, academy, ...|
|Tarps, tents, and...|  5|[tarps,, tents,, ...|
|voulu un grec pui...|  6|[voulu, un, grec,...|
|We drive efficien...|  7|[we, drive, effic...|
|Check out my Gig ...|  8|[check, out, my, ...|
|Hey, nice bones y...|  9|[hey,, nice, bone...|
|lembro como sofri...| 10|[lembro, como, so...|
|WHO WITH A DEEP T...| 11|[who, with, a, de...|
|@Tina69911364 @As...| 12|[@tina69911364, @...|
|alguem cria um ap...| 13|[alguem, cria, um...|
|@Neptvn08 Comment...| 14|[@neptvn08, comme...|
|une dinguerie de ...| 15|[une, dinguerie, ...|
|Y a une grosse mo...| 16|[y, a, une, gross...

In [4]:
#CountVectorizer
#================your code here==================
# TF: raw term counts. The vocabulary is capped at 5000 terms, and minDF=10.0
# (a value >= 1 is an absolute count) keeps only terms that occur in at least
# 10 documents.
cv = CountVectorizer(inputCol='token_words', outputCol='raw_features', vocabSize=5000, minDF=10.0)
model = cv.fit(spark_df)
result = model.transform(spark_df)

# Publish the fitted vocabulary under the name `to_word` looks up, so term
# indices can be translated back to words after a fresh kernel run.
vocab_broadcast = sc.broadcast(model.vocabulary)

# IDF: re-weight the raw counts. The estimator is fitted exactly once; the
# previous second `idf.fit(result)` only repeated the job and was overwritten.
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result)
result_tfidf = idfModel.transform(result)
result_tfidf.show()

# Only the document id and the weighted term vector are needed downstream.
cvResult = result_tfidf.select("id", "features")
#==================================================

                                                                                

+--------------------+---+--------------------+--------------------+--------------------+
|               words| id|         token_words|        raw_features|            features|
+--------------------+---+--------------------+--------------------+--------------------+
|I absolutely ADOR...|  0|[i, absolutely, a...|(145,[1,8,13,20,5...|(145,[1,8,13,20,5...|
|Java Vs Python Fo...|  1|[java, vs, python...|(145,[19,74],[1.0...|(145,[19,74],[2.5...|
|voulu un grec pui...|  2|[voulu, un, grec,...|(145,[7,15,16,54,...|(145,[7,15,16,54,...|
|Pareil Il pris de...|  3|[pareil, il, pris...|(145,[2,11,15,25,...|(145,[2,11,15,25,...|
|Music Academy Blo...|  4|[music, academy, ...|(145,[1,3,4,34,12...|(145,[1,3,4,34,12...|
|Tarps, tents, and...|  5|[tarps,, tents,, ...|(145,[3,5,20,111]...|(145,[3,5,20,111]...|
|voulu un grec pui...|  6|[voulu, un, grec,...|(145,[7,15,16,54,...|(145,[7,15,16,54,...|
|We drive efficien...|  7|[we, drive, effic...|(145,[4,70],[1.0,...|(145,[4,70],[1.83...|
|Check out

In [5]:
#train LDA model, cluster the documents into 10 topics 
#================your code here==================
num_topics = 10
max_iterations = 100

# All hyper-parameters are passed in one place; the fixed seed keeps the fit
# reproducible across runs. The former bare `lda.getMaxIter()` call was a
# discarded no-op and has been removed.
# NOTE(review): `cvResult["features"]` holds TF-IDF weights, not raw counts;
# LDA is usually fit on term counts - confirm this is intended.
lda = LDA(k=num_topics, maxIter=max_iterations, seed=1, optimizer="em")
ldaModel = lda.fit(cvResult)
#==================================================

                                                                                

In [6]:
# Per-topic summary table (term indices and their weights for each topic).
topic_summary = ldaModel.describeTopics()
topic_summary.show()
# Left as the cell's last expression so the notebook renders the matrix.
ldaModel.topicsMatrix()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[5, 6, 21, 47, 20...|[0.13557348228467...|
|    1|[9, 23, 25, 32, 4...|[0.16107238029942...|
|    2|[22, 27, 29, 40, ...|[0.10496751333839...|
|    3|[16, 15, 7, 36, 4...|[0.14013980720935...|
|    4|[11, 2, 18, 37, 3...|[0.15383337414683...|
|    5|[3, 17, 19, 28, 3...|[0.14737961326364...|
|    6|[13, 0, 34, 38, 4...|[0.11986345639154...|
|    7|[10, 12, 30, 33, ...|[0.15137624573375...|
|    8|[4, 8, 0, 24, 56,...|[0.15226462355724...|
|    9|[1, 14, 26, 7, 61...|[0.26968790564013...|
+-----+--------------------+--------------------+



DenseMatrix(145, 10, [0.7447, 0.4882, 0.0819, 0.5967, 1.3057, 208.6201, 202.3025, 0.0691, ..., 0.0862, 0.1165, 0.1009, 0.0161, 0.0161, 0.0441, 0.0161, 0.1444], 0)

In [7]:
# Score every document with the fitted model, then keep only the column that
# carries the per-document topic weights.
doc_topics = ldaModel.transform(cvResult)
transformed = doc_topics.select("topicDistribution")
# truncate=False prints each weight vector in full instead of abbreviating it.
transformed.show(truncate=False)

22/10/24 02:15:40 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/24 02:15:40 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topicDistribution                                                                                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.08388544767655748,0.06655225572664249,0.10629971625348385,0.06655293172997798,0.066551504440452,0.07930907079684008,0.2759582555444431,0.06655557580314661,0.1079101277542602

In [8]:
# Fit metrics evaluated on the training documents: a higher log-likelihood (ll)
# and a lower log-perplexity (lp) both indicate a better model.
ll = ldaModel.logLikelihood(cvResult)
lp = ldaModel.logPerplexity(cvResult)
for label, value in (("ll: ", ll), ("lp: ", lp)):
  print(label, value)

                                                                                

ll:  -86694.88478220982
lp:  6.524969381452884


In [9]:
# Print the learned topic-term matrix together with the vocabulary size it spans.
# NOTE(review): the recorded values are not normalised to sum to 1 per topic,
# so read them as term weights rather than probabilities.
vocab_size = ldaModel.vocabSize()
header = "Learned topics (as distributions over vocab of " + str(vocab_size) + " words):"
print(header)
topics = ldaModel.topicsMatrix()
print(topics)

Learned topics (as distributions over vocab of 145 words):
DenseMatrix([[7.44650309e-01, 1.54204742e-01, 1.57275957e-01, ...,
              5.36361818e+00, 1.68619487e+02, 3.84673984e-01],
             [4.88228759e-01, 2.85729202e-01, 1.85765381e-01, ...,
              4.65727450e-01, 3.41844324e-01, 2.94660508e+02],
             [8.18953639e-02, 2.69081593e+01, 5.70656668e+01, ...,
              2.24942295e-01, 1.02318195e-01, 2.01310157e+01],
             ...,
             [2.04559343e-02, 4.54045489e-02, 3.94413044e-02, ...,
              3.10506245e-02, 2.22042280e-02, 4.40750790e-02],
             [1.34201901e-02, 1.34696789e-02, 1.36809182e-02, ...,
              4.11737870e+01, 1.70465397e-02, 1.60909869e-02],
             [1.04869259e-01, 6.84187746e-02, 7.00306313e-02, ...,
              1.20872916e-01, 1.91578228e-01, 1.44446387e-01]])
