In [1]:
import sparknlp
from sparknlp.annotator import Tokenizer, PerceptronModel
from sparknlp.base import DocumentAssembler
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.clustering import LDA
from sparknlp.annotator import StopWordsCleaner
from pyspark.sql import types as T
from pyspark.sql import functions as F
from sparknlp.annotator import NGramGenerator
from sparknlp.base import Finisher
from pyspark.ml.tuning import ParamGridBuilder

spark = sparknlp.start()

data = spark.read.csv("../cleaned_moral_scores.csv", header= True).select(["id", "cleaned_text"])

#Preprocessing
documentAssembler = DocumentAssembler()\
     .setInputCol("cleaned_text")\
     .setOutputCol('document')

tokenizer = Tokenizer() \
            .setInputCols(['document'])\
            .setOutputCol('tokenized')

normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') 

english = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", 
    "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "cannot", "could", "did", 
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", 
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", 
    "its", "itself", "let", "me", "more", "most", "must", "my", "myself", "no", "nor", "not", "of", "off", "on", 
    "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "some", "such", 
    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", 
    "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", 
    "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself", "yourselves", "will", "ll", 
    "re", "ve", "d", "s", "m", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", 
    "s", "t", "u", "v", "w", "x", "y", "z", "many", "us", "ok", "hows", "ive", "ill", "im", "cant", "topics", "topic",
    "discuss", "thoughts", "yo", "thats", "whats", "lets", "nothing", "oh", "omg", 
         "things", "stuff", "yall", "haha", "yes", "no", "wo", "like", 'good', 
         'work', 'got', 'going', 'dont', 'really', 'want', 'make', 'think', 
         'know', 'feel', 'people', 'life', "getting", "lot" "great", "i", "me", 
         "my", "myself", "we", "our", "ours", "ourselves", 
        "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", 
        "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
        "they", "them", "their", "theirs","themselves", "what", "which", "who", 
        "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
        "were", "be", "been", "being", "have", "has", "had", "having", "do", 
        "does", "did", "doing", "will", "would", "should", "can", "could", "may",
        "might", "must", "shall", "ought", "about", "above", "across", "after", 
        "against", "along", "amid", "among", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "between", "beyond", "but", "by", 
        "concerning", "considering", "despite", "down", "during", "except", "for",
        "from", "in", "inside", "into", "like", "near", "next", "notwithstanding",
        "of", "off", "on", "onto", "opposite", "out", "outside", "over", "past",
        "regarding", "round", "since", "than", "through", "throughout", "till", 
        "to", "toward", "towards", "under", "underneath", "unlike", "until", "up",
        "upon", "versus", "via", "with", "within", "without", "cant", "cannot", 
        "couldve", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", 
        "havent", "hed", "hell", "hes", "howd", "howll", "hows", "id", "ill", 
        "im", "ive", "isnt", "itd", "itll", "its", "lets", "mightve", "mustve", 
        "mustnt", "shant", "shed", "shell", "shes", "shouldve", "shouldnt", 
        "thatll", "thats", "thered", "therell", "therere", "theres", "theyd", 
        "theyll", "theyre", "theyve", "wed", "well", "were", "weve", "werent", 
        "whatd", "whatll", "whatre", "whats", "whatve", "whend", "whenll", 
        "whens", "whered", "wherell", "wheres", "whichd", "whichll", "whichre", 
        "whichs", "whod", "wholl", "whore", "whos", "whove", "whyd", "whyll", 
        "whys", "wont", "wouldve", "wouldnt", "youd", "youll", "youre", "youve",
        "f", "m", "because", "go", "lot", "get", "still", "way", "something", "much",
        "thing", "someone", "person", "anything", "goes", "ok", "so", "just", "mostly", 
        "put", "also", "lots", "yet", "ha", "etc"]

time = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", 
        "sunday", "morning", "noon", "afternoon", "evening", "night", "midnight",
        "dawn", "dusk", "week", "weekend", "weekends","weekly", "today", 
        "yesterday", "tomorrow", "yesterdays", "todays", "mondays", "tuesdays",
        "wednesdays", "thursdays", "fridays", "saturdays", "sundays", "day",
        "everyday", "daily", "workday", 'time', 'month', 'year', 'pm', 'am', "ago",
        "year"]

reddit = ["welcome", "hi", "hello", "sub", "reddit", "thanks", "thank", "maybe",
          "wo30", "mods", "mod", "moderators", "subreddit", "btw", "aw", "aww", 
          "aww", "hey", "hello", "join", "joined", "post", "rselfimprovement", "op"]

topic_specific = ["self", "improvement", "change", "action",
    'change', 'start', 'goal', 'habit', 'new', 'old', 
    'care', 'world', 'everyone', 'love', 'u', 'right', 'mean', 'matter',
    'best', 'step', 'focus', 'hard', 'small',
    'bad', 'help', 'time', 'problem', 'issue', 'advice',
    'bit', 'experience', 'different',
    'point', 'situation', 'negative', 'control', 'positive',
    'use', 'question', 'idea', 'amp', 'medium', 'hour', 'day', 'minute',
    'aaaaloot', "selfimprovement", "_", "ampxb"]

stopwords = english + time + reddit + topic_specific

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['normalized']) \
     .setOutputCol('unigrams') \
     .setStopWords(stopwords)

pos = PerceptronModel.load("/project/macs40123/spark-jars/pos_anc_en_3.0.0_3.0_1614962126490/")\
      .setInputCols("document", "unigrams")\
      .setOutputCol("pos")

finisher = Finisher().setInputCols(['unigrams', 'pos'])

my_pipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          normalizer,
          stopwords_cleaner,
          #ngrammer,
          pos,
          finisher
      ])

pipelineModel = my_pipeline.fit(data)
processed_data = pipelineModel.transform(data)
processed_data.persist()

#Filter by POS
def filter_unigrams(finished_unigrams, finished_pos):
    '''Filters individual words based on their POS tag'''
    return [word for word, pos in zip(finished_unigrams, finished_pos)
            if pos in ['JJ', 'NN', 'NNS', 'NNPS']]

udf_filter_unigrams = F.udf(filter_unigrams, T.ArrayType(T.StringType()))

processed_data = processed_data.withColumn('filtered_unigrams_by_pos', udf_filter_unigrams(
                                                   F.col('finished_unigrams'),
                                                   F.col('finished_pos')))

#Now that POS was done, lemmatization makes more sense at this point

#Merge tokens as just one string to be able to take it as a document in the new Pipeline
tokens_as_string = F.udf(lambda x: ' '.join(x), T.StringType())
processed_data = processed_data.withColumn('joined_tokens', tokens_as_string(F.col('filtered_unigrams_by_pos')))

last_documentAssembler = DocumentAssembler() \
     .setInputCol('joined_tokens') \
     .setOutputCol('joined_document')

last_tokenizer = Tokenizer() \
     .setInputCols(['joined_document']) \
     .setOutputCol('tokenized')
     
lemmatizer = LemmatizerModel.load("../models/lemma_ewt_en_3.4.3_3.0_1651416655397/")\
      .setInputCols("tokenized")\
      .setOutputCol("lemmatized")

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('final') \
     .setStopWords(stopwords)

last_finisher = Finisher() \
     .setInputCols(['final']) \

last_pipeline = Pipeline() \
     .setStages([last_documentAssembler,                  
                 last_tokenizer,
                 lemmatizer,
                 stopwords_cleaner,
                 last_finisher])

final_data = last_pipeline.fit(processed_data).transform(processed_data)

processed_data.unpersist()
final_data.persist()

## Vectorization
#Apply TF-IDF filtering
tfizer = CountVectorizer(inputCol='finished_final', outputCol='tf_features', minDF=0.01, maxDF=0.80, vocabSize= 2000)
tf_model = tfizer.fit(final_data)
tf_result = tf_model.transform(final_data)

idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

final_data.unpersist()
tfidf_result.persist()

24/11/28 11:29:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

DataFrame[id: string, cleaned_text: string, finished_unigrams: array<string>, finished_pos: array<string>, filtered_unigrams_by_pos: array<string>, joined_tokens: string, finished_final: array<string>, tf_features: vector, tf_idf_features: vector]

Functions to evaluate and interpret model

In [4]:
vocab = tf_model.vocabulary

def evaluate_model(model, data):
    log_likelihood = model.logLikelihood(data)
    perplexity = model.logPerplexity(data)
    return log_likelihood, perplexity

def get_words(token_list):
     return [vocab[token_id] for token_id in token_list]
udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [5]:
## LDA
lda = LDA(k=10, maxIter=50, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', seed=2503)
lda_model = lda.fit(tfidf_result)

                                                                                

24/11/28 11:40:33 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
24/11/28 11:40:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/11/28 11:40:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

In [6]:
evaluate_model(lda_model, tfidf_result)

                                                                                

(-175306522.5911957, 5.891725616579532)

In [7]:
num_top_words = 15
topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                         |
+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, fear, mind, practice, brain, meditation, behavior, power, process, failure, challenge, growth, ability, personal]                     |
|1    |[woman, man, girl, guy, sex, porn, attractive, dude, partner, addiction, car, shit, fuck, apps, male]                                              |
|2    |[relationship, therapy, therapist, happy, happiness, date, toxic, personality, expectation, tough, friend, friendships, honest, mindset, character]|
|3    |[kid, parent, child, family, mom, god, dream, dad, money,

In [8]:
## LDA
lda3 = LDA(k=10, maxIter=50, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', topicConcentration= 0.1, seed=2503)
lda_model3 = lda3.fit(tfidf_result)

24/11/28 11:43:35 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.


                                                                                

In [9]:
evaluate_model(lda_model3, tfidf_result)

                                                                                

(-175306522.5911957, 5.891725616579532)

In [10]:
num_top_words = 15
topics = lda_model3.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                         |
+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, fear, mind, practice, brain, meditation, behavior, power, process, failure, challenge, growth, ability, personal]                     |
|1    |[woman, man, girl, guy, sex, porn, attractive, dude, partner, addiction, car, shit, fuck, apps, male]                                              |
|2    |[relationship, therapy, therapist, happy, happiness, date, toxic, personality, expectation, tough, friend, friendships, honest, mindset, character]|
|3    |[kid, parent, child, family, mom, god, dream, dad, money,

In [11]:
## LDA
lda5 = LDA(k=8, maxIter=100, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', topicConcentration= 0.05, seed=2503)
lda_model5 = lda5.fit(tfidf_result)
print(evaluate_model(lda_model5, tfidf_result))
num_top_words = 15
topics = lda_model5.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

24/11/28 11:46:41 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.




(-175494930.79244724, 5.898057665206587)
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                          |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, mind, practice, fear, belief, purpose, power, success, result, learn, personal, challenge, important, ability]                         |
|1    |[woman, man, girl, guy, sex, porn, attractive, date, addiction, fuck, shit, partner, dude, drug, girlfriend]                                        |
|2    |[relationship, confidence, therapy, value, therapist, happy, happiness, confident, worth, opinion, toxic, personality, respect, esteem, expectation]|
|3    |[kid, pare

                                                                                

Create dataframe with topic distributions

In [13]:
## LDA
lda5 = LDA(k=9, maxIter=100, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', topicConcentration= 0.05, seed=2503)
lda_model5 = lda5.fit(tfidf_result)
print(evaluate_model(lda_model5, tfidf_result))
num_top_words = 15
topics = lda_model5.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

24/11/28 11:53:06 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.




(-175071829.05512756, 5.883838004081948)
+-----+------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                      |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, practice, fear, mind, brain, meditation, power, challenge, learn, process, behavior, failure, ability, success]                    |
|1    |[woman, man, girl, guy, sex, porn, partner, date, attractive, relationship, fuck, shit, dude, girlfriend, male]                                 |
|2    |[confidence, therapy, relationship, happy, happiness, therapist, confident, toxic, worth, opinion, esteem, low, personality, expectation, tough]|
|3    |[kid, parent, family, child, mom, 

                                                                                

In [15]:
lda5 = LDA(k=8, maxIter=100, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', topicConcentration= 0.05, seed=2503)
lda_model5 = lda5.fit(tfidf_result)
print(evaluate_model(lda_model5, tfidf_result))
num_top_words = 15
topics = lda_model5.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

24/11/28 11:56:14 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.




(-175494930.79244724, 5.898057665206587)
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                          |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, mind, practice, fear, belief, purpose, power, success, result, learn, personal, challenge, important, ability]                         |
|1    |[woman, man, girl, guy, sex, porn, attractive, date, addiction, fuck, shit, partner, dude, drug, girlfriend]                                        |
|2    |[relationship, confidence, therapy, value, therapist, happy, happiness, confident, worth, opinion, toxic, personality, respect, esteem, expectation]|
|3    |[kid, pare

                                                                                

In [16]:
lda_13 = LDA(k=13, maxIter=100, learningDecay=0.5, learningOffset = 50, featuresCol='tf_idf_features', topicConcentration= 0.05, seed=2503)
lda_model3 = lda_13.fit(tfidf_result)
print(evaluate_model(lda_model5, tfidf_result))
num_top_words = 15
topics = lda_model5.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

24/11/28 11:59:22 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.




(-175494930.79244724, 5.898057665206586)
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                          |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[book, read, mind, practice, fear, belief, purpose, power, success, result, learn, personal, challenge, important, ability]                         |
|1    |[woman, man, girl, guy, sex, porn, attractive, date, addiction, fuck, shit, partner, dude, drug, girlfriend]                                        |
|2    |[relationship, confidence, therapy, value, therapist, happy, happiness, confident, worth, opinion, toxic, personality, respect, esteem, expectation]|
|3    |[kid, pare

                                                                                

In [18]:
topics = lda_model3.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                                                                   |
+-----+---------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[emotion, fear, behavior, mind, failure, mistakes, power, process, desire, emotional, ability, respect, feeling, challenge, attention]       |
|1    |[sex, porn, partner, car, addiction, drug, smart, drive, dude, number, risk, shit, normal, guy, fuck]                                        |
|2    |[woman, man, girl, relationship, guy, date, personality, attractive, happiness, toxic, beautiful, male, expectation, happy, girlfriend]      |
|3    |[parent, kid, family, child, money, house, dream, mom, adult, god, dad, angry, anger, mother,

Try to get coherence score

In [21]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.ml.feature import CountVectorizer
import numpy as np

def calculate_coherence_score(lda_model, vectorized_data, vocab, top_n=10):
    """
    Calculate coherence scores for a PySpark LDA topic model.

    Parameters:
    -----------
    lda_model : pyspark.ml.clustering.LDAModel
        The trained PySpark LDA model.
    vectorized_data : pyspark.sql.DataFrame
        The vectorized data used for training the LDA model.
    vocab : list
        Vocabulary from the CountVectorizer.
    top_n : int, optional
        Number of top words to consider for each topic (default is 10).

    Returns:
    --------
    list
        A list of coherence scores for each topic.
    """
    # Extract topics and their top words
    topics = lda_model.describeTopics(maxTermsPerTopic=top_n).collect()
    
    # Convert sparse vector to a word-document matrix
    word_doc_matrix = vectorized_data.select("tf_idf_features").rdd \
        .flatMap(lambda x: [(i, 1) for i in x["tf_idf_features"].indices]) \
        .toDF(["word_id", "count"]) \
        .groupBy("word_id").agg(F.sum("count").alias("doc_count"))
    
    # Map word_id back to vocabulary
    vocab_df = spark.createDataFrame([(i, w) for i, w in enumerate(vocab)], ["word_id", "word"])
    word_doc_matrix = word_doc_matrix.join(vocab_df, on="word_id")
    
    # Calculate total number of documents
    total_docs = vectorized_data.count()
    
    coherence_scores = []
    for topic in topics:
        topic_words = topic["termIndices"]
        
        # Calculate word co-occurrence
        pairwise_coherence = []
        for i, word1 in enumerate(topic_words):
            for word2 in topic_words[i + 1:]:
                # Document counts for each word
                doc_count_word1 = word_doc_matrix.filter(F.col("word_id") == word1).select("doc_count").first()[0]
                doc_count_word2 = word_doc_matrix.filter(F.col("word_id") == word2).select("doc_count").first()[0]
                
                # Joint document count for both words
                joint_doc_count = vectorized_data.select("features").rdd \
                    .filter(lambda row: word1 in row["features"].indices and word2 in row["features"].indices) \
                    .count()
                
                # Avoid division by zero
                if joint_doc_count > 0:
                    p_word1_word2 = joint_doc_count / total_docs
                    p_word1 = doc_count_word1 / total_docs
                    pairwise_coherence.append(np.log((p_word1_word2 + 1e-12) / p_word1))
        
        # Average coherence for the topic
        coherence_scores.append(np.mean(pairwise_coherence) if pairwise_coherence else 0)
    
    return coherence_scores

In [22]:
calculate_coherence_score(lda_model3, tfidf_result, vocab, top_n=10)

                                                                                

TypeError: Unable to infer the type of the field word_id.

In [None]:
# Transform data to get topic distributions
df_with_lda = lda_model.transform(tfidf_result)

df_with_lda.select("topicDistribution").show(1, truncate=False)

Create topic labels

In [None]:
from pyspark.ml.linalg import DenseVector

# Define the function to get the topic label
def get_topic_label(vector):
    '''
    Takes the topic distribution for each document and returns a label

    Input (numpy array): topic probabilities distribution
    Output (list of int): list of integers corresponding to topics
    '''

    #Convert numpy array into a DenseVector
    dense_vector = DenseVector(vector)
        
    #Create columns for each topic 