In [3]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import re
import wandb

In [4]:
# Read the data
df_comments = pd.read_pickle('./pickle_dataframes/comments_typecasted.pkl')
df_posts = pd.read_pickle('./pickle_dataframes/posts_typecasted.pkl')

df_postlinks = pd.read_pickle('./pickle_dataframes/post_links_typecasted.pkl')
df_tags = pd.read_pickle('./pickle_dataframes/tags_typecasted.pkl')
df_users = pd.read_pickle('./pickle_dataframes/users_typecasted.pkl')

In [5]:
df_comments

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
1,12,47428,3,One of the things that make a url user-friendl...,2008-09-06 13:51:47.843,4642
2,14,47481,0,"I agree, both CodeRush and RefactorPro are vis...",2008-09-06 14:15:46.897,4642
3,15,47373,0,Just wanted to mention that this is an excelle...,2008-09-06 14:30:40.217,2495
4,16,47497,1,"Indeed, the only way to do this is get the ser...",2008-09-06 14:42:35.303,4642
7,20,1517,0,"In the interests of tact, this is the kind of ...",2008-09-06 15:44:39.477,199
...,...,...,...,...,...,...
89336446,135796215,77031085,0,"`sub(' R-HSA.*', '', a)`",2023-09-03 06:01:39.930,3962914
89336448,135796217,77031076,0,"At the risk of stating the obvious, are you us...",2023-09-03 06:02:27.873,2288659
89336450,135796219,77030109,0,no problem. could you also upvote and accept m...,2023-09-03 06:02:45.790,22240478
89336451,135796220,77031063,0,remove await and async from getdistricts,2023-09-03 06:03:14.907,22196971


### Filtering Posts

In [None]:
questions_df = df_posts[df_posts['PostTypeId'] == 1]
sub_posts = df_posts[df_posts['PostTypeId'] == 2]

### Topic model: only posts from active users

In [21]:
# Remove entries with -1 in UserId and OwnerUserId columns
df_comments = df_comments[df_comments['UserId'] != -1]
df_posts = df_posts[df_posts['OwnerUserId'] != -1]

# Calculate post and comment counts for each user
user_posts_count = df_posts.groupby('OwnerUserId')['OwnerUserId'].size().rename('PostCount')
user_comments_count = df_comments.groupby('UserId')['UserId'].size().rename('CommentCount')

# Merge counts with user data and fill missing values
user_data = (df_users
             .merge(user_posts_count, left_on='Id', right_index=True, how='left')
             .merge(user_comments_count, left_on='Id', right_index=True, how='left')
             .fillna({'PostCount': 0, 'CommentCount': 0}))

# Add a column for total activity and filter for active users
active_users = user_data.assign(TotalActivity=lambda x: x['PostCount'] + x['CommentCount'])
active_users = active_users[active_users['TotalActivity'] > 20]

# Set of active user IDs
active_user_ids = set(active_users['Id'])

# Filter dataframes for active user activity
filtered_questions_df = questions_df[questions_df['OwnerUserId'].isin(active_user_ids)]
filtered_sub_posts = sub_posts[(sub_posts['ParentId'].isin(active_user_ids)) |
                                (sub_posts['Id'].isin(active_user_ids))]

# Combine filters for comments related to active users
active_user_post_ids = set(df_posts[df_posts['OwnerUserId'].isin(active_user_ids)]['Id'])
filtered_comments = df_comments[(df_comments['UserId'].isin(active_user_ids)) | 
                                (df_comments['PostId'].isin(active_user_post_ids))].drop_duplicates()

In [23]:
print(questions_df.size)
print(filtered_questions_df.size, '\n')

print(sub_posts.size)
print(filtered_sub_posts.size, '\n')

print(df_comments.size)
print(filtered_comments.size, '\n')

224028
139902 

505260
13342 

1054752
998796 



### Preprocess text

In [None]:
# Modify preprocess_text function
def preprocess_text(text, remove_stopwords=False, use_lemmatize=True, use_stemmer=False):
    # Decode HTML entities
    text = html.unescape(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    words = text.split()
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]
    if use_lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    elif use_stemmer:  # Apply stemming only if use_stemmer is True
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    text = ' '.join(words)
    
    return text

In [None]:
# WandB Timeeee
wandb.login()

In [None]:
# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name, ngram_range=(1, 1), max_features=1000):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", entity="s223730", name=run_name)

    # Initialize dictionaries to store topic distributions
    lda_distributions = {}
    nmf_distributions = {}

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF with the specified max_features
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    # Prepare a structured dictionary to store results with n_topics as part of the key
    all_topics_results = {}
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Extract Topic Distributions for LDA
        lda_topic_distributions = lda.transform(tfidf_matrix)

        # Normalize LDA Topic Distributions
        lda_normalized = np.array(lda_topic_distributions) / np.sum(lda_topic_distributions, axis=1)[:, None]

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Calculate LDA Perplexity
        lda_perplexity = lda.perplexity(tfidf_matrix)

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)

        # Normalize NMF Topic Distributions (nmf_W is already the topic distribution matrix)
        nmf_normalized = np.array(nmf_W) / np.sum(nmf_W, axis=1)[:, None]

        nmf_H = nmf_model.components_

        # Calculate NMF Reconstruction Error
        nmf_reconstruction_error = np.linalg.norm(tfidf_matrix - nmf_W.dot(nmf_H))

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)

        nmf_top_words_table = wandb.Table(data=nmf_top_words_data, columns=columns)
        wandb.log({f"nmf_n_topics_{n_topics}": nmf_top_words_table})

        # Store the results including perplexity and reconstruction error
        all_topics_results[f"{run_name}_n_topics_{n_topics}"] = {
            'lda_normalized': lda_normalized,
            'nmf_normalized': nmf_normalized,
            'lda_coherence': coherence_lda,
            'lda_perplexity': lda_perplexity,
            'nmf_reconstruction_error': nmf_reconstruction_error,
            'lda_top_words': top_words_data,
            'nmf_top_words': nmf_top_words_data
        }
        
    # Close WandB run
    wandb.finish()

    # Return the topic distributions
    return all_topics_results

### Running different LDA configurations

In [None]:
# Define your parameter ranges
remove_stopwords = True
use_lemmatize_options = [False, True]
use_stemmer_options = [False, True]
tags_weighting_options = [1, 2, 5]
ngram_range_options = [(1, 1), (1, 2), (1, 3)]
max_features_options = [1000, 2000]

# Store the results for each n_topics uniquely
all_results = {}

# Iterate over the combinations of other options
for use_lemmatize, use_stemmer, tags_weighting, ngram_range, max_features in itertools.product(use_lemmatize_options, use_stemmer_options, tags_weighting_options, ngram_range_options, max_features_options):
    
    # Skip the iteration if both lemmatize and stemmer are set to True
    if use_lemmatize and use_stemmer:
        continue
    
    # Construct a unique run name for this combination
    run_name = f"Run_remove_{remove_stopwords}_lemmatize_{use_lemmatize}_stemmer_{use_stemmer}_weight_{tags_weighting}_ngram_{ngram_range}_maxfeat_{max_features}"

    # Run the function and get the results
    topics_results = apply_topic_modeling_and_log(
        questions_df, 
        remove_stopwords, 
        use_lemmatize, 
        use_stemmer, 
        tags_weighting, 
        run_name, 
        ngram_range, 
        max_features
    )

    # Update all_results to include these structured results
    all_results.update(topics_results)

In [None]:
# Save the results to a file or handle them as needed
# For example, saving to a pickle file
import pickle
with open('topic_modeling_results_n_topics.pkl', 'wb') as f:
    pickle.dump(all_results, f)

### Suggestions for Improvements
- Adjust StopWords?
- **Hyperparameter Tuning**: Tune the parameters of the LDA model,
    - learning decay
    - batch size

### Topic Modelling


- Should only Post-Level have topic assigned to them?
    - Then Sub-Posts are assigned the same topic as Post
    - Comments are assigned the same topic as Post

In [None]:
questions_df.to_pickle('questions_cleaned_text.pkl')

### Sentiment Analysis
Apply Sentiment Analysis on:
- Post Level
- Sub Post Level
- Comment Level

### Community Detection

**User-Post-Topic Matrix**: 
- Create a matrix where rows represent users and columns represent topics. 
- Each cell contains the count of posts/comments a user has made in a particular topic.
    - Post Level: where `PostTypeId` == 1 AND `ParentId` == -1
    - Sub Post Level: where `PostTypeId` == 1 AND `ParentId` != -1
    - Comment Level: where `PostTypeId` == 2
- **Include Post Statistics**
    - AcceptedAnswerId
    - Score
    - ViewCount
    - AnswerCount
    - CommentCount
- **Include Comment Statistics**
    - Score

**Clustering Algorithms**
- K-Means: Use the user-topic matrix to cluster users. Determine the optimal number of clusters (communities) using the Elbow method or Silhouette score.

- Hierarchical Clustering: Useful for understanding the data structure and forming hierarchical communities. Dendrograms can visualize the community structure.

- DBSCAN: Good for datasets with noise and clusters of varying densities.

**Market Basket Analysis**
- Association Rules and Apriori Algorithm: 
    - Treat each user's set of topics as a 'basket'. 
    - Identify strong rules where the presence of one topic implies the presence of another in a user's posts
    - This can highlight topic-based communities.
- Frequent Itemsets: 
    - Identify sets of topics that frequently occur together in users' posts.

**Locality Sensitive Hashing (LSH)**
- LSH for Dimension Reduction: 
    - If the user-topic matrix is very sparse and high-dimensional, LSH can reduce dimensions while preserving the similarity structure. This can make subsequent clustering more effective.

**Advanced Techniques**
- PCY Algorithm: If you're dealing with very large data, this algorithm efficiently finds frequent itemsets, useful in subsequent association rule mining.

### Evaluating Communities

**Davies-Bouldin Index**: Evaluate the quality of clusters. 
- Lower Davies-Bouldin index values signify better clustering.