Used tutorial https://github.com/maobedkova/TopicModelling_PySpark_SparkNLP/blob/master/Topic_Modelling_with_PySpark_and_Spark_NLP.ipynb for reference

In [1]:
import sparknlp
from sparknlp.annotator import Tokenizer, PerceptronModel
from sparknlp.base import DocumentAssembler
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.clustering import LDA
from pyspark.sql import types as T

spark = sparknlp.start()

24/11/18 12:34:07 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
og_data = spark.read.csv("../data_topicmodel.csv", header= True).select(["id", "cleaned_text"])

In [3]:
#Remove sample
og_data.show(5)
sample_data = og_data.sample(0.001)

+-----+--------------------+
|   id|        cleaned_text|
+-----+--------------------+
|hk5r2|i had an appointm...|
|iqimz|i created this si...|
|pfzt5|hello everyone  i...|
|pk714|i grew up with bo...|
|q0q8x|i have to ask whe...|
+-----+--------------------+
only showing top 5 rows



## Preprocessing

In [4]:
documentAssembler = DocumentAssembler()\
     .setInputCol("cleaned_text")\
     .setOutputCol('document')

In [5]:
tokenizer = Tokenizer() \
            .setInputCols(['document'])\
            .setOutputCol('tokenized')

In [6]:
normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') 

In [7]:
english = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", 
    "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "cannot", "could", "did", 
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", 
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", 
    "its", "itself", "let", "me", "more", "most", "must", "my", "myself", "no", "nor", "not", "of", "off", "on", 
    "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "some", "such", 
    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", 
    "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", 
    "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself", "yourselves", "will", "ll", 
    "re", "ve", "d", "s", "m", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", 
    "s", "t", "u", "v", "w", "x", "y", "z", "many", "us", "ok", "hows", "ive", "ill", "im", "cant", "topics", "topic",
    "discuss", "thoughts", "yo", "thats", "whats", "lets", "nothing", "oh", "omg", 
         "things", "stuff", "yall", "haha", "yes", "no", "wo", "like", 'good', 
         'work', 'got', 'going', 'dont', 'really', 'want', 'make', 'think', 
         'know', 'feel', 'people', 'life', "getting", "lot" "great", "i", "me", 
         "my", "myself", "we", "our", "ours", "ourselves", 
        "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", 
        "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
        "they", "them", "their", "theirs","themselves", "what", "which", "who", 
        "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
        "were", "be", "been", "being", "have", "has", "had", "having", "do", 
        "does", "did", "doing", "will", "would", "should", "can", "could", "may",
        "might", "must", "shall", "ought", "about", "above", "across", "after", 
        "against", "along", "amid", "among", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "between", "beyond", "but", "by", 
        "concerning", "considering", "despite", "down", "during", "except", "for",
        "from", "in", "inside", "into", "like", "near", "next", "notwithstanding",
        "of", "off", "on", "onto", "opposite", "out", "outside", "over", "past",
        "regarding", "round", "since", "than", "through", "throughout", "till", 
        "to", "toward", "towards", "under", "underneath", "unlike", "until", "up",
        "upon", "versus", "via", "with", "within", "without", "cant", "cannot", 
        "couldve", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", 
        "havent", "hed", "hell", "hes", "howd", "howll", "hows", "id", "ill", 
        "im", "ive", "isnt", "itd", "itll", "its", "lets", "mightve", "mustve", 
        "mustnt", "shant", "shed", "shell", "shes", "shouldve", "shouldnt", 
        "thatll", "thats", "thered", "therell", "therere", "theres", "theyd", 
        "theyll", "theyre", "theyve", "wed", "well", "were", "weve", "werent", 
        "whatd", "whatll", "whatre", "whats", "whatve", "whend", "whenll", 
        "whens", "whered", "wherell", "wheres", "whichd", "whichll", "whichre", 
        "whichs", "whod", "wholl", "whore", "whos", "whove", "whyd", "whyll", 
        "whys", "wont", "wouldve", "wouldnt", "youd", "youll", "youre", "youve",
        "f", "m", "because", "go", "lot", "get", "still", "way", "something", "much",
        "thing", "someone", "person", "anything", "goes", "ok", "so", "just", "mostly", 
        "put", "also", "lots", "yet"]

time = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", 
        "sunday", "morning", "noon", "afternoon", "evening", "night", "midnight",
        "dawn", "dusk", "week", "weekend", "weekends","weekly", "today", 
        "yesterday", "tomorrow", "yesterdays", "todays", "mondays", "tuesdays",
        "wednesdays", "thursdays", "fridays", "saturdays", "sundays", "day",
        "everyday", "daily", "workday", 'time', 'month', 'year', 'pm', 'am', "ago",
        "year"]

reddit = ["welcome", "hi", "hello", "sub", "reddit", "thanks", "thank", "maybe",
          "wo30", "mods", "mod", "moderators", "subreddit", "btw", "aw", "aww", 
          "aww", "hey", "hello", "join", "joined", "post", "rselfimprovement"]

topic_specific = ["self", "improvement", "change", "action",
    'change', 'start', 'goal', 'habit', 'new', 'old', 
    'care', 'world', 'everyone', 'love', 'u', 'right', 'mean', 'matter',
    'best', 'step', 'focus', 'hard', 'small',
    'bad', 'help', 'time', 'problem', 'issue', 'advice',
    'bit', 'experience', 'different',
    'point', 'situation', 'negative', 'control', 'positive',
    'use', 'question', 'idea', 'amp', 'medium', 'hour', 'day', 'minute',
    'aaaaloot']

stopwords = english + time + reddit + topic_specific

from sparknlp.annotator import StopWordsCleaner

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['normalized']) \
     .setOutputCol('unigrams') \
     .setStopWords(stopwords)

In [8]:
from sparknlp.annotator import NGramGenerator

ngrammer = NGramGenerator() \
    .setInputCols(['normalized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

In [9]:
pos = PerceptronModel.load("/project/macs40123/spark-jars/pos_anc_en_3.0.0_3.0_1614962126490/")\
      .setInputCols("document", "unigrams")\
      .setOutputCol("pos")

                                                                                

In [10]:
from sparknlp.base import Finisher

finisher = Finisher().setInputCols(['unigrams', 'ngrams', 'pos'])

In [11]:
my_pipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          normalizer,
          stopwords_cleaner,
          ngrammer,
          pos,
          finisher
      ])

In [12]:
pipelineModel = my_pipeline.fit(sample_data)
processed_data = pipelineModel.transform(sample_data)
processed_data.show(1)



+------+--------------------+--------------------+--------------------+--------------------+
|    id|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos|
+------+--------------------+--------------------+--------------------+--------------------+
|2x6sjp|in my quest for s...|[quest, selfimpro...|[in, my, quest, f...|[NN, NN, NN, NNS,...|
+------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



In [13]:
processed_data.columns

['id', 'cleaned_text', 'finished_unigrams', 'finished_ngrams', 'finished_pos']

In [14]:
#data.select('document').show(5, truncate = 100)

In [15]:
#data.select('tokenized').show(1, truncate = 100)

In [16]:
#data.select('normalized').show(1, truncate = 100)

In [17]:
#data.select('lemmatized').show(1, truncate = 1000)

In [18]:
processed_data.select('finished_unigrams').show(10, truncate = 100)

+----------------------------------------------------------------------------------------------------+
|                                                                                   finished_unigrams|
+----------------------------------------------------------------------------------------------------+
|[quest, selfimprovement, course, failures, times, failed, slipped, back, habits, gut, reaction, t...|
|[throwaway, account, easterncentral, europe, english, second, third, language, looking, ideas, im...|
|[fairly, broad, two, main, points, individual, askbased, scenarios, projects, often, involve, cha...|
|[growing, outcast, school, society, awkward, shy, overweight, nerd, spent, hating, worrying, ever...|
|[early, stages, friends, wonderful, mentors, learned, now, mentors, close, find, one, please, adv...|
|[male, mainly, regards, better, long, practice, wildly, inconsistent, playing, saxophone, seven, ...|
|[try, others, try, laugh, value, become, arrogant, selfish, asshole, ign

Create pos-tags n-grams that correspond to words n-grams

In [19]:
#Merge POS tags as just one string to be able to take it as a document in the Spark NLP Pipeline
from pyspark.sql import functions as F
pos_as_string = F.udf(lambda x: ' '.join(x), T.StringType())
processed_data = processed_data.withColumn('finished_pos', pos_as_string(F.col('finished_pos')))

In [20]:
processed_data.show(3)

[Stage 8:>                                                          (0 + 1) / 1]

+------+--------------------+--------------------+--------------------+--------------------+
|    id|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos|
+------+--------------------+--------------------+--------------------+--------------------+
|2x6sjp|in my quest for s...|[quest, selfimpro...|[in, my, quest, f...|NN NN NN NNS NNS ...|
|4xu065|throwaway account...|[throwaway, accou...|[throwaway, accou...|NN NN JJ NNP NNP ...|
|543cwv|since this is a f...|[fairly, broad, t...|[since, this, is,...|RB JJ CD JJ NNS J...|
+------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



                                                                                

New pipeline for pos-tags

In [21]:
pos_documentAssembler = DocumentAssembler() \
     .setInputCol('finished_pos') \
     .setOutputCol('pos_document')

pos_tokenizer = Tokenizer() \
     .setInputCols(['pos_document']) \
     .setOutputCol('pos')
     
    
pos_ngrammer = NGramGenerator() \
    .setInputCols(['pos']) \
    .setOutputCol('pos_ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

pos_finisher = Finisher() \
     .setInputCols(['pos', 'pos_ngrams']) \

pos_pipeline = Pipeline() \
     .setStages([pos_documentAssembler,                  
                 pos_tokenizer,
                 pos_ngrammer,  
                 pos_finisher])

processed_data = pos_pipeline.fit(processed_data).transform(processed_data)

In [22]:
processed_data.columns

['id',
 'cleaned_text',
 'finished_unigrams',
 'finished_ngrams',
 'finished_pos',
 'finished_pos_ngrams']

In [23]:
processed_data.show(1)

+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    id|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos| finished_pos_ngrams|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|2x6sjp|in my quest for s...|[quest, selfimpro...|[in, my, quest, f...|[NN, NN, NN, NNS,...|[NN, NN, NN, NNS,...|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



                                                                                

In [24]:
def filter_unigrams(finished_unigrams, finished_pos):
    '''Filters individual words based on their POS tag'''
    return [word for word, pos in zip(finished_unigrams, finished_pos)
            if pos in ['JJ', 'NN', 'NNS', 'NNPS']]

udf_filter_unigrams = F.udf(filter_unigrams, T.ArrayType(T.StringType()))

In [25]:
processed_data = processed_data.withColumn('filtered_unigrams_by_pos', udf_filter_unigrams(
                                                   F.col('finished_unigrams'),
                                                   F.col('finished_pos')))

processed_data.show(5)

+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+
|    id|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos| finished_pos_ngrams|filtered_unigrams_by_pos|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+
|2x6sjp|in my quest for s...|[quest, selfimpro...|[in, my, quest, f...|[NN, NN, NN, NNS,...|[NN, NN, NN, NNS,...|    [quest, selfimpro...|
|4xu065|throwaway account...|[throwaway, accou...|[throwaway, accou...|[NN, NN, JJ, NNP,...|[NN, NN, JJ, NNP,...|    [throwaway, accou...|
|543cwv|since this is a f...|[fairly, broad, t...|[since, this, is,...|[RB, JJ, CD, JJ, ...|[RB, JJ, CD, JJ, ...|    [broad, main, poi...|
|66mavm|growing up i was ...|[growing, outcast...|[growing, up, i, ...|[VBG, JJ, NN, NN,...|[VBG, JJ, NN, NN,...|    [outcast, school,...|
|6g7bde|in the early stag..

                                                                                

In [26]:
def filter_pos_ngrams(finished_ngrams, finished_pos_tags):
    return [word for word, pos in zip(finished_ngrams, finished_pos_tags) 
            if (len(pos.split('_')) == 2 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS'])
            or (len(pos.split('_')) == 3 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                  pos.split('_')[2] in ['NN', 'NNS'])]
    
udf_filter_pos_ngrams = F.udf(filter_pos_ngrams, T.ArrayType(T.StringType()))

In [27]:
processed_data = processed_data.withColumn('filtered_ngrams_by_pos',
                       udf_filter_pos_ngrams(F.col('finished_ngrams'),
                                             F.col('finished_pos_ngrams')))

In [28]:
processed_data.show(5)

+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+----------------------+
|    id|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos| finished_pos_ngrams|filtered_unigrams_by_pos|filtered_ngrams_by_pos|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+----------------------+
|2x6sjp|in my quest for s...|[quest, selfimpro...|[in, my, quest, f...|[NN, NN, NN, NNS,...|[NN, NN, NN, NNS,...|    [quest, selfimpro...|  [not, helpful, th...|
|4xu065|throwaway account...|[throwaway, accou...|[throwaway, accou...|[NN, NN, JJ, NNP,...|[NN, NN, JJ, NNP,...|    [throwaway, accou...|  [com, vocabulary,...|
|543cwv|since this is a f...|[fairly, broad, t...|[since, this, is,...|[RB, JJ, CD, JJ, ...|[RB, JJ, CD, JJ, ...|    [broad, main, poi...|  [things, their, s...|
|66mavm|growing up i was ...

                                                                                

In [29]:
processed_data.select('id','filtered_ngrams_by_pos').show(10, truncate=100)

+------+----------------------------------------------------------------------------------------------------+
|    id|                                                                              filtered_ngrams_by_pos|
+------+----------------------------------------------------------------------------------------------------+
|2x6sjp|[not, helpful, thinking, sometimes, you, didnt, you, stress, like, excuse, to, not, this, is, jus...|
|4xu065|[com, vocabulary, my, vocabulary, around, second, take, when, i, good, as, i, dont, have, for, no...|
|543cwv|[things, their, subpar, previous, job, had, everybody, was, competitive, and, this, incessant, ne...|
|66mavm|[not, for, my, own, personal, gain, study, only, to, be, work, a, job, only, to, be, less, loser,...|
|6g7bde|                                                                     [mentors, for, me, that, no, i]|
|7oobzy|[will, dont, also, been, im, getting, touches, behind, back, had, a, beat, the, team, captain, th...|
|9sp10c|[h

                                                                                

In [30]:
processed_data.select('id','filtered_unigrams_by_pos').show(10, truncate=100)

+------+----------------------------------------------------------------------------------------------------+
|    id|                                                                            filtered_unigrams_by_pos|
+------+----------------------------------------------------------------------------------------------------+
|2x6sjp|[quest, selfimprovement, course, failures, times, habits, gut, reaction, try, pathetic, helpful, ...|
|4xu065|[throwaway, account, easterncentral, second, third, language, ideas, improve, tutor, available, s...|
|543cwv|[broad, main, points, individual, scenarios, projects, systems, teams, sign, stakeholders, teams,...|
|66mavm|[outcast, school, society, awkward, shy, overweight, nerd, everything, therapy, bpd, inferior, na...|
|6g7bde|                                                 [stages, friends, wonderful, mentors, mentors, thx]|
|7oobzy|[male, long, practice, inconsistent, saxophone, years, somedays, sightread, super, piece, days, f...|
|9sp10c|[o

In [31]:
processed_data.columns

['id',
 'cleaned_text',
 'finished_unigrams',
 'finished_ngrams',
 'finished_pos',
 'finished_pos_ngrams',
 'filtered_unigrams_by_pos',
 'filtered_ngrams_by_pos']

In [32]:
#Now that POS was done, lemmatization makes more sense at this point

#Merge tokens as just one string to be able to take it as a document in the new Pipeline
from pyspark.sql import functions as F
tokens_as_string = F.udf(lambda x: ' '.join(x), T.StringType())
processed_data = processed_data.withColumn('joined_tokens', tokens_as_string(F.col('filtered_unigrams_by_pos')))

last_documentAssembler = DocumentAssembler() \
     .setInputCol('joined_tokens') \
     .setOutputCol('joined_document')

last_tokenizer = Tokenizer() \
     .setInputCols(['joined_document']) \
     .setOutputCol('tokenized')
     
lemmatizer = LemmatizerModel.load("../models/lemma_ewt_en_3.4.3_3.0_1651416655397/")\
      .setInputCols("tokenized")\
      .setOutputCol("lemmatized")

#Delete these tokens that remained from the lemmatizer model and topic's n grams
last_stopwords = ["_", "self_improvement"]

last_stopwords_cleaner1 = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('cleaned_unigrams') \
     .setStopWords(last_stopwords)

last_finisher = Finisher() \
     .setInputCols(['cleaned_unigrams']) \

last_pipeline = Pipeline() \
     .setStages([last_documentAssembler,                  
                 last_tokenizer,
                 lemmatizer,
                 last_stopwords_cleaner1,
                 last_finisher])

final_data = last_pipeline.fit(processed_data).transform(processed_data)


Create one column merging unigrams and ngrams

In [33]:
from pyspark.sql.functions import concat
final_data = final_data.withColumn('final', concat(F.col('finished_cleaned_unigrams'), \
                                                   F.col('filtered_ngrams_by_pos')))\
                                                   .select('id','cleaned_text','final')
                                                                                                                          

In [34]:
final_data.select('final').show(50, truncate=100)

                                                                                

+----------------------------------------------------------------------------------------------------+
|                                                                                               final|
+----------------------------------------------------------------------------------------------------+
|[quest, selfimprovement, course, failure, time, habit, gut, reaction, try, pathetic, helpful, thi...|
|[throwaway, account, easterncentral, second, third, language, idea, improve, tutor, available, sk...|
|[broad, main, point, individual, scenario, project, system, team, sign, stakeholders, team, syste...|
|[outcast, school, society, awkward, shy, overweight, nerd, everything, therapy, bpd, inferior, na...|
|                            [stage, wonderful, mentors, mentors, thx, mentors, for, me, that, no, i]|
|[male, long, practice, inconsistent, saxophone, year, somedays, sightread, super, piece, day, fir...|
|[other, laugh, value, arrogant, selfish, asshole, joke, weak, af, batman

                                                                                

In [35]:
final_data.columns

['id', 'cleaned_text', 'final']

## Vectorization

In [36]:
#Apply TF-IDF filtering
tfizer = CountVectorizer(inputCol='final', outputCol='tf_features', minDF=0.01, maxDF=0.80)
tf_model = tfizer.fit(final_data)
tf_result = tf_model.transform(final_data)

idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

                                                                                

## LDA

In [37]:
num_topics = 10
max_iter = 100

lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features', seed=2503)
lda_model = lda.fit(tfidf_result)

                                                                                

24/11/18 12:34:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/11/18 12:34:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


LocalLDAModel: uid=LDA_3d988932cad9, k=10, numFeatures=941

In [38]:
vocab = tf_model.vocabulary

def get_words(token_list):
     return [vocab[token_id] for token_id in token_list]
       
udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [39]:
num_top_words = 15

topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=90)

+-----+------------------------------------------------------------------------------------------+
|topic|                                                                                topicWords|
+-----+------------------------------------------------------------------------------------------+
|    0|[depression, sleep, nice, bed, single, fuck, sad, course, phone, or, fix, smoke, one, t...|
|    1|[happiness, short, value, grade, therapist, school, therapy, career, high, she, test, c...|
|    2|[content, exercise, habit, talk, muscle, book, sure, interested, weight, cold, mindset,...|
|    3|[open, house, girlfriend, ready, lose, necessary, door, kid, awkward, party, care, resp...|
|    4|[decision, path, conversation, perfect, information, girl, intention, poor, type, face,...|
|    5|[relationship, job, i, anyone, drug, deep, drink, everything, my, school, home, success...|
|    6|[social, brain, money, you, are, media, free, easy, take, try, your, of, more, mental, ...|
|    7|[te

# REMEMBER TO CHANGE DATASET TO COMPLETE ONE
# NEED TO SEE WHAT'S GOING ON WITH THE STOP WORDS
# FIT MULTIPLE OPTION IN PYSPARK


In [40]:
lda_model.logPerplexity(tfidf_result)


                                                                                

6.527195331658404

In [41]:
lda_model.logLikelihood(tfidf_result)

                                                                                

-382833.9663836582

Try with different parameters

In [48]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lda.seed, [2503]) \
    .addGrid(lda.k, [5, 8]) \
    .addGrid(lda.maxIter, [10, 20]) \
    .build()

def evaluate_model(model, data):
    log_likelihood = model.logLikelihood(data)
    perplexity = model.logPerplexity(data)
    return log_likelihood, perplexity

best_perplexity = 100

for param_map in paramGrid:
    model = lda.copy(param_map).fit(tfidf_result)
    log_likelihood, perplexity = evaluate_model(model, tfidf_result)
    best_log_likelihood = (f"Params: {param_map}, Log Likelihood: {log_likelihood}")
    print(f"Params: {param_map}, Log Likelihood: {log_likelihood}, Perplexity: {perplexity}")
    if perplexity < best_perplexity: 
        best_perplexity = perplexity, params

print(best_perplexity)
print(best_log_likelihood)

                                                                                

Params: {Param(parent='LDA_3d988932cad9', name='seed', doc='random seed.'): 2503, Param(parent='LDA_3d988932cad9', name='k', doc='The number of topics (clusters) to infer. Must be > 1.'): 5, Param(parent='LDA_3d988932cad9', name='maxIter', doc='max number of iterations (>= 0).'): 10}, Log Likelihood: -400286.983939022, Perplexity: 6.824763637278805


                                                                                

Params: {Param(parent='LDA_3d988932cad9', name='seed', doc='random seed.'): 2503, Param(parent='LDA_3d988932cad9', name='k', doc='The number of topics (clusters) to infer. Must be > 1.'): 5, Param(parent='LDA_3d988932cad9', name='maxIter', doc='max number of iterations (>= 0).'): 20}, Log Likelihood: -395287.41595394246, Perplexity: 6.739522619819511


                                                                                

Params: {Param(parent='LDA_3d988932cad9', name='seed', doc='random seed.'): 2503, Param(parent='LDA_3d988932cad9', name='k', doc='The number of topics (clusters) to infer. Must be > 1.'): 8, Param(parent='LDA_3d988932cad9', name='maxIter', doc='max number of iterations (>= 0).'): 10}, Log Likelihood: -403286.535717719, Perplexity: 6.875905025154921




Params: {Param(parent='LDA_3d988932cad9', name='seed', doc='random seed.'): 2503, Param(parent='LDA_3d988932cad9', name='k', doc='The number of topics (clusters) to infer. Must be > 1.'): 8, Param(parent='LDA_3d988932cad9', name='maxIter', doc='max number of iterations (>= 0).'): 20}, Log Likelihood: -397205.56277359754, Perplexity: 6.772226402832697


                                                                                