In [44]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import pickle 
import re

In [45]:
# Load and merge dataframes
comments = pd.concat([pd.read_pickle('./pickle_dataframes/comments1.pkl'),
                      pd.read_pickle('./pickle_dataframes/comments2.pkl')]).reset_index(drop=True)

posts = pd.concat([pd.read_pickle('./pickle_dataframes/posts1.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts2.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts3.pkl')]).reset_index(drop=True)

users = pd.read_pickle('./pickle_dataframes/users.pkl')
postlinks = pd.read_pickle('./pickle_dataframes/posts_links.pkl')
tags = pd.read_pickle('./pickle_dataframes/tags.pkl')

In [22]:
#comments = comments.sample(frac=0.1, random_state=0)
#posts = posts.sample(frac=0.1, random_state=0)

From running various tests we found that the topic modelling method that yielded the best highest coherence score and the lowest perplexity score was:   

In [46]:
# Modify preprocess_text function
def preprocess_text(text, remove_stopwords=False, use_lemmatize=True):
    # Decode HTML entities
    text = html.unescape(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    words = text.split()
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]
    if use_lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    text = ' '.join(words)
    
    return text

In [24]:
# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, tags_weighting, run_name, ngram_range=(1, 1), max_features=1000):

    # Initialize dictionaries to store topic distributions
    lda_distributions = {}
    nmf_distributions = {}

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF with the specified max_features
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    # Prepare a structured dictionary to store results with n_topics as part of the key
    all_topics_results = {}
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Extract Topic Distributions for LDA
        lda_topic_distributions = lda.transform(tfidf_matrix)

        # Normalize LDA Topic Distributions
        lda_normalized = np.array(lda_topic_distributions) / np.sum(lda_topic_distributions, axis=1)[:, None]

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Calculate LDA Perplexity
        lda_perplexity = lda.perplexity(tfidf_matrix)

        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)

        # Normalize NMF Topic Distributions (nmf_W is already the topic distribution matrix)
        nmf_normalized = np.array(nmf_W) / np.sum(nmf_W, axis=1)[:, None]

        nmf_H = nmf_model.components_

        # Calculate NMF Reconstruction Error
        nmf_reconstruction_error = np.linalg.norm(tfidf_matrix - nmf_W.dot(nmf_H))

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)


        # Store the results including perplexity and reconstruction error
        all_topics_results[f"{run_name}_n_topics_{n_topics}"] = {
            'lda_normalized': lda_normalized,
            'nmf_normalized': nmf_normalized,
            'lda_coherence': coherence_lda,
            'lda_perplexity': lda_perplexity,
            'nmf_reconstruction_error': nmf_reconstruction_error,
            'lda_top_words': top_words_data,
            'nmf_top_words': nmf_top_words_data
        }
        


    # Return the topic distributions
    return all_topics_results

In [6]:
# Test various combinations
use_lemmatize_options = [True]
tags_weighting_options = [1, 2, 5]
ngram_range_options = [(1, 1), (1, 2), (1, 3)]
max_features_options = [1000]
remove_stopwords = True 

all_results = {}

for use_lemmatize, tags_weighting, ngram_range, max_features in itertools.product(use_lemmatize_options, tags_weighting_options, ngram_range_options, max_features_options):
    run_name = f"Run_remove_{remove_stopwords}_lemmatize_{use_lemmatize}_weight_{tags_weighting}_ngram_{ngram_range}_maxfeat_{max_features}"

    topics_results = apply_topic_modeling_and_log(
        posts,
        remove_stopwords,
        use_lemmatize,
        tags_weighting, 
        run_name, 
        ngram_range, 
        max_features
    )
    print(run_name)
    all_results.update(topics_results)

# save dictionary to person_data.pkl file
#with open('all_results.pkl', 'wb') as fp:
#    pickle.dump(all_results, fp)

Run_remove_True_lemmatize_True_weight_1_ngram_(1, 1)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_1_ngram_(1, 2)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_1_ngram_(1, 3)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_2_ngram_(1, 1)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_2_ngram_(1, 2)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_5_ngram_(1, 1)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_5_ngram_(1, 2)_maxfeat_1000
Run_remove_True_lemmatize_True_weight_5_ngram_(1, 3)_maxfeat_1000


### Find best topic modelling technique + parameters

In [13]:
all_results = pd.read_pickle('./all_results.pkl')

import heapq

# Initialize min-heaps to track the top 5 best scores and parameters for LDA and NMF
top_5_lda = []
top_5_nmf = []

# Iterate through all results
for run_name, results in all_results.items():
    # Extract LDA and NMF scores
    lda_score = (results['lda_coherence'], -results['lda_perplexity'])  # Negative perplexity for min-heap
    nmf_score = -results['nmf_reconstruction_error']  # Negative error for min-heap

    # Update top 5 LDA
    if len(top_5_lda) < 5 or lda_score > top_5_lda[0][0]:
        if len(top_5_lda) == 5:
            heapq.heappop(top_5_lda)
        heapq.heappush(top_5_lda, (lda_score, run_name))

    # Update top 5 NMF
    if len(top_5_nmf) < 5 or nmf_score > top_5_nmf[0][0]:
        if len(top_5_nmf) == 5:
            heapq.heappop(top_5_nmf)
        heapq.heappush(top_5_nmf, (nmf_score, run_name))

# Output top 5 LDA
print("Top 5 LDA:")
for score, params in sorted(top_5_lda, reverse=True):
    print(f"Parameters: {params}, Coherence: {score[0]}, Perplexity: {-score[1]}")

# Output top 5 NMF
print("\nTop 5 NMF:")
for score, params in sorted(top_5_nmf, reverse=True):
    print(f"Parameters: {params}, Reconstruction Error: {-score}")

Top 5 LDA:
Parameters: Run_remove_True_lemmatize_True_weight_1_ngram_(1, 2)_maxfeat_1000_n_topics_20, Coherence: 0.49817471503946614, Perplexity: 1718.5727994660563
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000_n_topics_20, Coherence: 0.4828223717706496, Perplexity: 1705.171976733514
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 2)_maxfeat_1000_n_topics_20, Coherence: 0.4828223717706496, Perplexity: 1705.171976733514
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 1)_maxfeat_1000_n_topics_20, Coherence: 0.4828223717706496, Perplexity: 1722.258531185024
Parameters: Run_remove_True_lemmatize_True_weight_1_ngram_(1, 3)_maxfeat_1000_n_topics_15, Coherence: 0.481533622150989, Perplexity: 1546.6154196909786

Top 5 NMF:
Parameters: Run_remove_True_lemmatize_True_weight_5_ngram_(1, 3)_maxfeat_1000_n_topics_20, Reconstruction Error: 204.03639907100012
Parameters: Run_remove_True_lemmatize_True_weight_5_ngram_(1, 2)_maxfeat_1000_n_top

### Run the best topic model
- Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000_n_topics_20, Coherence: 0.4828223717706496, Perplexity: 1705.171976733514


In [47]:
questions = posts[posts.PostTypeId==1]
answers = posts[posts['PostTypeId'] == 2]

In [48]:
remove_stopwords = True
use_lemmatize = True 
tags_weighting = 1
ngram_range = (1, 2)
max_features = 1000
n_topics = 20

# Apply preprocessing to each column
questions['Title'] = questions['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
questions['Body'] = questions['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
questions['Tags'] = questions['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))

# Combine Title, Body, and Tags
questions['CombinedText'] = questions['Title'] + ' ' + questions['Body'] + ' ' + (questions['Tags'] * tags_weighting)


# Apply TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
tfidf_matrix = tfidf_vectorizer.fit_transform(questions['CombinedText'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Title'] = questions['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Body'] = questions['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [49]:
# Apply TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
tfidf_matrix = tfidf_vectorizer.fit_transform(questions['CombinedText'])

# Apply LDA
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tfidf_matrix)

# Assign topics to questions
topic_assignments = lda.transform(tfidf_matrix)
questions['Topic'] = topic_assignments.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Topic'] = topic_assignments.argmax(axis=1)


In [50]:
topics_df = questions[['Id', 'Topic']]

# Merge to assign topics from questions to their answers
answers_with_topics = answers.merge(topics_df, left_on='ParentId', right_on='Id', how='left')

# Rename the 'Topic' column to something like 'InheritedTopic' to avoid confusion
# answers_with_topics.rename(columns={'Topic': 'InheritedTopic'}, inplace=True)

### Assign Topic to every post

In [61]:
# Save the posts dataframe with topic assignments
#questions.to_pickle('./pickle_dataframes/questions_with_topics.pkl')
#answers_with_topics.to_pickle('./pickle_dataframes/answers_with_topics.pkl')

In [62]:
answers_with_topics

Unnamed: 0,Id_x,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Id_y,Topic
0,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1,1,18
1,5,2,1,-1,2012-12-04 21:58:39.037,47,-1,<p>Simple plurality voting has very little in ...,8,2012-12-04 22:04:42.767,Comment: N/A,Comment: N/A,-1,1,1,18
2,13,2,7,-1,2012-12-04 22:17:48.290,85,-1,<p>The standard terms of left and right politi...,18,2012-12-04 22:17:48.290,Comment: N/A,Comment: N/A,-1,7,7,7
3,18,2,9,-1,2012-12-04 22:26:45.633,12,-1,"<p>In an IRV (<a href=""https://en.wikipedia.or...",18,2017-06-14 10:16:02.733,Comment: N/A,Comment: N/A,-1,2,9,18
4,19,2,2,-1,2012-12-04 22:29:16.460,40,-1,<p>The mathematical phenomenon you're talking ...,26,2012-12-04 22:29:16.460,Comment: N/A,Comment: N/A,-1,10,2,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36085,81116,2,81095,-1,2023-09-02 12:14:00.063,-1,-1,<blockquote>&#xA;<p>since the government is ra...,26824,2023-09-02 12:14:00.063,Comment: N/A,Comment: N/A,-1,0,81095,17
36086,81118,2,81095,-1,2023-09-02 16:28:41.700,0,-1,<p>Does not the article you link to already an...,44212,2023-09-02 16:28:41.700,Comment: N/A,Comment: N/A,-1,1,81095,17
36087,81120,2,81119,-1,2023-09-02 21:05:10.440,4,-1,<p>Generally speaking there's a large incumben...,18373,2023-09-02 22:24:01.417,Comment: N/A,Comment: N/A,-1,2,81119,18
36088,81124,2,81119,-1,2023-09-03 03:32:57.620,-1,-1,<p>Because they know where all the bodies are ...,39779,2023-09-03 03:32:57.620,Comment: N/A,Comment: N/A,-1,0,81119,18
