# Topic Modeling

## Pre-processing helpers

In [3]:
import json
import re
import time
import os
from IPython.display import clear_output

import pandas as pd
import matplotlib.pyplot as plt
import stanza
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel
from markdown import markdown
from wordcloud import WordCloud

In [None]:
# Extra words to ignore on top of gensim's built-in STOPWORDS set. They look
# like corpus-specific noise (tool names, PR/commit boilerplate, generic verbs);
# matching against them is case-sensitive, hence variants such as
# "Cloudformation" / "cloudformation".
_EXTRA_STOPWORDS = (
    "var", "variable", "computed", "costa", "botocore", "version", "step",
    "support", "source", "hashicorp", "service", "branch", "pull", "merge",
    "issue", "pr", "galoy-pay", "bumped", "add", "payload", "boto", "accurics",
    "hana", "bump", "added", "latest", "update", "github", "test", "sourced",
    "instead", "use", "plan", "updates", "diff", "bump-galoy-pay-image",
    "draft", "iam", "i'm", "v1", "apply", "fix", "fixes", "kvo", "needed",
    "tco", "create", "run", "code", "feat", "lambda", "need", "link",
    "project", "new", "change", "they're", "SAM_template", "ABC_Lambda", "AWS",
    "Cloudformation", "removepermission", "cloudformation", "how", "like",
)
STOPWORDS = STOPWORDS.union(_EXTRA_STOPWORDS)

# Universal POS tags a token must carry to be kept by prepare_document_so().
UPOS = ('PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV')

# Shared Stanza pipeline; built once here and reused for every document.
nlp_pipeline = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma',
)

def load_json_files(folder_path):
    """Load every ``*.json`` file located directly inside *folder_path*.

    Files are read in sorted file-name order. ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the document order (and
    anything trained on it) differ between runs and platforms.

    Args:
        folder_path: Directory to scan; sub-directories are not traversed.

    Returns:
        A list holding one parsed JSON value per file, in file-name order.

    Raises:
        OSError: If the folder or one of the files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    data = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            data.append(json.load(file))
    return data

def clean_markup(doc):
    """Reduce a Markdown/HTML document to its plain prose.

    The text is rendered from Markdown to HTML, code (``<code>``, ``<pre>``)
    and quoted (``<blockquote>``) elements are dropped, the remaining markup
    is stripped, and anything that looks like an http(s) URL is removed.

    Args:
        doc: Markdown (possibly containing inline HTML) as a string.

    Returns:
        The cleaned text as a single string.
    """
    # Convert Markdown to HTML markup
    html = markdown(doc, extensions=['fenced_code'])
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(html, 'html.parser')
    # Remove unwanted content. The HTML tag is <blockquote>; the previous
    # selector 'blockquotes' never matched, so quoted text leaked through.
    for tag_name in ('code', 'pre', 'blockquote'):
        for node in soup.select(tag_name):
            node.extract()
    # Remove HTML markup by keeping only the text nodes
    text = ''.join(soup.find_all(string=True))
    # Remove URLs (any whitespace-delimited run containing http: or https:)
    return re.sub(r'\S*https?:\S*', '', text, flags=re.MULTILINE)

def prepare_document_so(doc):
    """Turn one raw post/comment text into a list of lemmas.

    The text is cleaned with ``clean_markup`` and run through the module-level
    Stanza pipeline. A token contributes its lemma when its universal POS tag
    is listed in ``UPOS`` and its surface text is not in ``STOPWORDS``.

    Args:
        doc: Raw Markdown/HTML text.

    Returns:
        The list of kept lemmas, in document order.
    """
    clean_doc = clean_markup(doc)
    tokens = []
    for token in nlp_pipeline(clean_doc).iter_tokens():
        # to_dict() returns a list; only its first entry is inspected.
        # NOTE(review): for multi-word tokens that first entry appears to be
        # the surface token without 'upos' - confirm against the Stanza docs.
        token_dict = token.to_dict()[0]
        if 'upos' not in token_dict:
            # Report only genuinely untagged tokens. Previously this message
            # was also printed for every token dropped by the POS or
            # stop-word filter, which made the diagnostic misleading.
            print(f"Token missing 'upos': {token_dict}")
            continue
        if token_dict['upos'] in UPOS and token_dict['text'] not in STOPWORDS:
            tokens.append(token_dict['lemma'])

    return tokens

def prepare_corpus_so(documents):
    """Tokenise every text field of every Stack Overflow post.

    For each dict in *documents* the 'Body', the 'Title', each comment's
    'Text' and each answer's 'Body' are passed through
    ``prepare_document_so``; every resulting token list becomes its own entry
    in the corpus. Entries that are not dicts are skipped. Progress and the
    produced tokens are printed along the way.

    Args:
        documents: Sequence of post dicts.

    Returns:
        A list of token lists, in the order the fields were processed.
    """
    corpus = []
    for position, post in enumerate(documents):
        print(f"SO post:{position}\n")
        if not isinstance(post, dict):
            continue

        if 'Body' in post:
            body_tokens = prepare_document_so(post['Body'])
            corpus.append(body_tokens)
            print(f"body tokens: {body_tokens}")

        if 'Title' in post:
            title_tokens = prepare_document_so(post['Title'])
            corpus.append(title_tokens)
            print(f"title tokens: {title_tokens}")

        if 'comments' in post:
            for comment in post['comments']:
                comment_tokens = prepare_document_so(comment['Text'])
                corpus.append(comment_tokens)
                print(f"comments tokens: {comment_tokens}")

        if 'answers' in post:
            for answer in post['answers']:
                answer_tokens = prepare_document_so(answer['Body'])
                corpus.append(answer_tokens)
                print(f"answers tokens: {answer_tokens}")
    return corpus

def build_tfidf_model_so(corpus):
    """Build the dictionary, bag-of-words corpus and TF-IDF model.

    Args:
        corpus: Iterable of token lists, one per document.

    Returns:
        A ``(dictionary, bow_corpus, tfidf_model)`` tuple, where *bow_corpus*
        is a tuple with one ``doc2bow`` result per input document.
    """
    dictionary = Dictionary(corpus)
    bow_corpus = tuple(map(dictionary.doc2bow, corpus))
    model = TfidfModel(bow_corpus, normalize=True)
    return dictionary, bow_corpus, model

def get_keywords(model, num_topics=-1, num_words=5):
    """Return the sorted, de-duplicated top words across a model's topics.

    Args:
        model: Object exposing ``show_topics(num_topics, num_words, formatted)``.
        num_topics: Forwarded to ``show_topics`` (default -1).
        num_words: How many top words to take from each topic.

    Returns:
        A sorted list of the distinct words found.
    """
    topics = model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    unique_words = set()
    for topic in topics:
        # topic[1] holds the (word, weight) pairs of that topic.
        for entry in topic[1]:
            unique_words.add(entry[0])
    return sorted(unique_words)

folder_path = '../step-2-output/questions'
data = load_json_files(folder_path)

# A loaded file may hold either a list of posts or a single post;
# normalise everything into one flat list of documents.
documents = []
for sublist in data:
    documents += sublist if isinstance(sublist, list) else [sublist]

# Tokenise all posts, then derive the dictionary, BoW corpus and TF-IDF model.
corpus = prepare_corpus_so(documents)
corpus_dict, corpus_bow, tfidf_model = build_tfidf_model_so(corpus)

Explore hyperparameters (grid search over every combination):
- K (number of topics) = {5,6,...,34,35}
- alpha = {0.01,50/K}
- beta (passed to gensim as `eta`) = {0.01,50/K}
- chunksize = {1,2,4,8,...,1024}

In [None]:
# Exhaustive grid search: train one LDA model per (num_topics, alpha, beta,
# chunksize) combination and record its scores and top words.
results = []
for num_topics in range(5, 36):
    for alpha in (0.01, 50/num_topics):
        for beta in (0.01, 50/num_topics):
            for chunksize in (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024):
                start_time = time.time()
                print(f"Training LDA with num_topics={num_topics}, alpha={alpha}, beta={beta}, chunksize={chunksize}")

                lda_model = LdaModel(
                    corpus=corpus_bow,
                    id2word=corpus_dict,
                    num_topics=num_topics,
                    alpha=alpha,
                    eta=beta,  # gensim calls the topic-word prior "eta"
                    chunksize=chunksize,
                    passes=100
                )

                # NOTE(review): per the gensim docs log_perplexity() returns a
                # per-word likelihood bound rather than the perplexity itself;
                # it is stored under 'perplexity' as-is.
                perplexity = lda_model.log_perplexity(corpus_bow)
                coherence_model_lda = CoherenceModel(
                    model=lda_model,
                    texts=corpus,
                    dictionary=corpus_dict,
                    coherence='c_v'
                )
                coherence_lda = coherence_model_lda.get_coherence()
                # num_topics=-1 asks for ALL topics. The default is 20, which
                # silently truncated the top words of every model with K > 20.
                top_words = lda_model.print_topics(num_topics=-1, num_words=10)
                results.append({
                    'num_topics': num_topics,
                    'alpha': alpha,
                    'beta': beta,
                    'chunksize': chunksize,
                    'perplexity': perplexity,
                    'coherence': coherence_lda,
                    'top_words': top_words
                })

                end_time = time.time()
                print(f"Completed in {end_time - start_time:.2f} seconds")

# One row per trained configuration.
results_df = pd.DataFrame(results)

### Output Results

#### Display results using IPython display


In [None]:
# Show the full grid-search results table. `display` is not imported in this
# file; presumably it is the helper IPython makes available in notebook
# sessions - TODO confirm if this is ever run outside Jupyter.
display(results_df)

#### Display top words for each topic


In [None]:
# Print, for every trained configuration, its hyperparameters followed by the
# bare top words of each recorded topic.
for index, row in results_df.iterrows():
    print(f"Model {index + 1}: num_topics={row['num_topics']}, alpha={row['alpha']}, beta={row['beta']}, chunksize={row['chunksize']}")
    for topic_id, words_str in row['top_words']:
        # Each term looks like  weight*"word" ; keep only the unquoted word.
        words = []
        for term in words_str.split(' + '):
            words.append(term.split('*')[1].strip('"'))
        print(f"Topic {topic_id}: {', '.join(words)}")
    print()

#### Plots

In [None]:

# Plotting Coherence and Perplexity Scores
# Every num_topics value has many rows (one per alpha/beta/chunksize
# combination). Joining the raw rows with a line therefore draws meaningless
# vertical zig-zags, so each run is shown as a point and the per-K mean is
# overlaid as the trend line.
mean_by_k = results_df.groupby('num_topics')[['coherence', 'perplexity']].mean()

plt.figure(figsize=(12, 6))
panels = (
    ('coherence', 'Coherence Score by Number of Topics', 'Coherence Score'),
    ('perplexity', 'Perplexity Score by Number of Topics', 'Perplexity Score'),
)
for position, (column, title, ylabel) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(results_df['num_topics'], results_df[column], s=12, alpha=0.4, label='individual runs')
    plt.plot(mean_by_k.index, mean_by_k[column], marker='o', color='tab:red', label='mean per K')
    plt.title(title)
    plt.xlabel('Number of Topics')
    plt.ylabel(ylabel)
    plt.legend()

plt.tight_layout()
plt.show()

# Function to generate word cloud for each topic
def plot_word_cloud(lda_model, num_topics):
    """Show one word-cloud figure per topic.

    Args:
        lda_model: Model exposing ``show_topic(topic_id, topn)``.
        num_topics: Number of topics to draw, starting from topic 0.
    """
    for topic_idx in range(num_topics):
        plt.figure()
        # The 20 strongest (word, weight) pairs drive the cloud's word sizes.
        word_weights = dict(lda_model.show_topic(topic_idx, 20))
        cloud = WordCloud(background_color='white').fit_words(word_weights)
        plt.imshow(cloud)
        plt.axis('off')
        plt.title(f'Topic #{topic_idx}')
        plt.show()

# Generate word clouds for the best model (highest coherence score)
best_model_index = results_df['coherence'].idxmax()
best_model = results_df.loc[best_model_index]
# Values read from a DataFrame row may be NumPy scalars; cast them to plain
# Python numbers before handing them to range() and gensim.
num_topics = int(best_model['num_topics'])

# At this point `lda_model` is still the LAST model trained by the grid search
# (the grid search does not keep the models it scores), not the best-scoring
# one. Re-fit a model with the winning hyperparameters so the plots describe
# that configuration.
# NOTE(review): no random seed is set, so the re-fitted topics may differ
# from the run that was scored.
lda_model = LdaModel(
    corpus=corpus_bow,
    id2word=corpus_dict,
    num_topics=num_topics,
    alpha=float(best_model['alpha']),
    eta=float(best_model['beta']),
    chunksize=int(best_model['chunksize']),
    passes=100
)

# Plot word clouds for each topic
plot_word_cloud(lda_model, num_topics)

# Bar plots for topics
def plot_top_words(lda_model, num_topics, num_words=10):
    """Show one horizontal bar chart of top-word weights per topic.

    Args:
        lda_model: Model exposing ``show_topic(topic_id, topn)``.
        num_topics: Number of topics to draw, starting from topic 0.
        num_words: How many top words to plot for each topic.
    """
    for topic_idx in range(num_topics):
        plt.figure(figsize=(10, 5))
        word_weight_pairs = lda_model.show_topic(topic_idx, num_words)
        labels, values = zip(*word_weight_pairs)
        plt.barh(labels, values)
        # Flip the axis so the first word returned ends up at the top.
        axes = plt.gca()
        axes.invert_yaxis()
        plt.title(f'Topic {topic_idx}')
        plt.show()

# Plot top words for each topic, using whatever `lda_model` and `num_topics`
# are currently bound by the preceding cells.
plot_top_words(lda_model, num_topics)
