In [0]:
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [0]:
# Load the 2014-2 submissions parquet file; `spark` is the session provided by the notebook runtime.
submissions_path = "/mnt/lsde/group05/submissions_tree_shaking/2014-2.parquet"
sub_2014 = spark.read.parquet(submissions_path)

In [0]:
# Text-preprocessing pipeline over the `title` column:
# document -> token -> normalized -> cleanTokens (stop words removed) -> stem -> tokens (plain array).
document_assembler = (
    DocumentAssembler()
    .setInputCol("title")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")
# Input columns are given as a list, matching every other annotator in this pipeline;
# a bare string is not accepted by every Spark NLP release.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)
stemmer = Stemmer().setInputCols(["cleanTokens"]).setOutputCol("stem")
# The finisher exposes the stems as an array column named `tokens`;
# setCleanAnnotations(False) leaves the intermediate annotation columns in the output.
finisher = (
    Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["tokens"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)
nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        finisher,
    ]
)
# Fit and apply on the same frame: `processed_df` is `sub_2014` plus the pipeline's output columns.
nlp_model = nlp_pipeline.fit(sub_2014)
processed_df = nlp_model.transform(sub_2014)

In [0]:
# Preview each submission id next to its finished token array.
token_preview = processed_df.select("id", "tokens")
token_preview.show()

In [0]:
from pyspark.ml.feature import CountVectorizer

# Bag-of-words encoder over the `tokens` column, writing term-count vectors to `features`.
# Vocabulary is capped at 500 terms and minDF is set to 3.0.
cv = (
    CountVectorizer()
    .setInputCol("tokens")
    .setOutputCol("features")
    .setVocabSize(500)
    .setMinDF(3.0)
)
# Learn the vocabulary from the processed titles.
cv_model = cv.fit(processed_df)
# Append the `features` column to every row.
vectorized_tokens = cv_model.transform(processed_df)

In [0]:
from pyspark.ml.clustering import LDA

num_topics = 2
# NOTE(review): a single iteration looks like a smoke-test setting; raise it before
# interpreting the topics or the metrics below.
max_iter = 1
# Fixed seed so that re-running the notebook reproduces the same topics and metrics.
lda_seed = 42

# LDA reads the term-count vectors from the default `features` column produced by CountVectorizer.
lda = LDA(k=num_topics, maxIter=max_iter, seed=lda_seed)
model = lda.fit(vectorized_tokens)

# Evaluate on the same data the model was fitted on.
ll = model.logLikelihood(vectorized_tokens)
lp = model.logPerplexity(vectorized_tokens)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

In [0]:
# Vocabulary learned by CountVectorizer; a term index is a position in this list.
vocab = cv_model.vocabulary
topics = model.describeTopics()
topics_rdd = topics.rdd
# Translate each topic's term indices into the corresponding vocabulary words.
topics_words = topics_rdd.map(
    lambda row: [vocab[term_idx] for term_idx in row['termIndices']]
).collect()

separator = "*"*25
for topic_no, topic_terms in enumerate(topics_words):
    print("topic: {}".format(topic_no))
    print(separator)
    for term in topic_terms:
        print(term)
    print(separator)