# Reduced BERTopic Training

In [None]:
import pandas as pd
from bertopic import BERTopic
import os
import datetime
import warnings
from gensim import corpora
import logging
from src.utils.topic_diversity import topic_diversity
from copy import deepcopy
from gensim import models
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

# Quieten the session: drop all Python warnings and disable every logging
# call at CRITICAL level and below.
# NOTE(review): this also hides genuine problems reported by the libraries
# used further down.
warnings.filterwarnings("ignore")
logging.disable(logging.CRITICAL)

# Presumably read by the tokenizers backend of the embedding model to turn
# off its internal parallelism - confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Add the project-local NLTK data directory to NLTK's search path so that
# corpora (e.g. the stop-word lists used later) resolve from there.
nltk_data_path = "../data/libs/nltk_data"
nltk.data.path.append(nltk_data_path)

In [None]:
# --- Output configuration -------------------------------------------------
# Label appended to the timestamp in the results directory name.
model_name = 'BERTOPIC_reduced'

# Run timestamp in YYYYMMDD_HHMM form (date plus hour and minute); it is
# reused in every file name written by this notebook.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Directory that receives all artefacts of this run.
results_dir = f"../results/{date_today}_{model_name}"

# Create the directory up front; an already existing one is not an error.
os.makedirs(results_dir, exist_ok=True)

# --- Load processed data --------------------------------------------------
df = pd.read_csv("../data/processed/20250515_1207_minimal_clean_merged_tweets.csv")

# Print a column/dtype/null-count summary of the loaded table.
df.info()

## Load model

In [None]:
# Load the BERTopic model saved by an earlier run; it is the starting point
# for every topic reduction performed below.
# NOTE(review): depending on the serialization used when the model was saved,
# loading may unpickle the file - only load models from trusted locations.
BERTopic_model = BERTopic.load(
    "../results/20250531_0008_BERTOPIC/bertopic_model_20250531_0008_30_words"
)

## Preparation

In [None]:
# Number of top words per topic exported for the diversity analysis.
TOP_DIVERSITY_WORDS_N = 30
# Number of top words per topic fed into the coherence computation.
TOP_COHERENCE_WORDS_N = 10

# Raw documents as plain strings.
# NOTE(review): astype(str) turns missing values, if any, into literal
# strings rather than dropping them - confirm 'final_text' has no nulls.
documents = df['final_text'].astype(str).tolist()

# Whitespace-tokenized copy of every document plus the matching gensim
# dictionary; both are used for coherence scoring later on.
tokenized_texts = [text.split() for text in documents]
dictionary = corpora.Dictionary(tokenized_texts)

# NOTE(review): the message says "Sampled", but no sampling step is visible
# here - every row of the loaded table is used.
print(f"Sampled {len(documents)} documents from full dataset.")

## Previous best vectorizer settings (reused later when updating topics)

In [None]:
# Name of the embedding model selected in an earlier experiment (kept here
# for reference; it is not used in this cell).
best_embedding_model = 'paraphrase-MiniLM-L12-v2'

# NLTK's English stop words, extended below with corpus-specific filler
# words, discourse markers, profanity and platform vocabulary.
stopwords_list = set(stopwords.words("english"))

# NOTE(review): with CountVectorizer's default tokenization, hyphenated
# entries (e.g. "covid-19", "uh-huh") and the single-character "i" probably
# never match a token - confirm whether they are needed.
custom_stopwords = list(stopwords_list | {
    "actually", "ago", "agree", "also", "answer", "anyone", "around",
    "article", "ask", "away", "back", "bad", "bit", "could", "come",
    "covid", "covid-19", "covid_19", "day", "damn", "disagree", "due",
    "else", "ever", "everyone", "example", "finally", "find", "follow",
    "fuck", "get", "give", "go", "good", "hah", "haha", "happen", "hear",
    "hell", "info", "join", "kinda", "kind", "know", "later", "leave",
    "less", "link", "live", "lol", "lolol", "long", "long-covid",
    "long_covid", "longcovid", "look", "lot", "make", "many", "may",
    "maybe", "much", "must", "need", "never", "new", "next", "news", "omg",
    "one", "ones", "people", "ppl", "please", "post", "probably", "pretty",
    "quite", "read", "really", "right", "say", "see", "share", "shit",
    "show", "speak", "sorry", "sort", "sort-of", "still", "suck", "sure",
    "take", "talk", "tell", "thank", "thank-you", "thanks", "think",
    "thing", "time", "today", "try", "tweet", "twitter", "type", "uh",
    "uh-huh", "um", "update", "use", "vid", "via", "want", "way", "well",
    "would", "wrong", "yeah", "yep", "even", "keep", "yet", "thread",
    "story", "watch", "listen", "write", "video", "comment", "piece",
    "start", "stop", "let", "put", "become", "seem", "great", "amazing",
    "interesting", "clear", "big", "huge", "point", "amp", "rt", "the",
    "to", "is", "are", "was", "were", "has", "have", "had", "do", "does",
    "did", "can", "will", "just", "going", "gonna", "you", "we", "your",
    "i", "he", "she", "they", "me", "us", "our", "their", "my", "his",
    "her", "them", "should", "this", "that", "these", "those", "some",
    "any", "each", "other", "another", "most", "something", "anything",
    "everything", "nothing",
})

# Keyword arguments for the CountVectorizer that is rebuilt for every
# reduced model: unigrams and bigrams, with rare and very common terms cut.
best_vectorizer = {
    "ngram_range": (1, 2),
    "min_df": 10,
    "max_df": 0.85,
    "stop_words": custom_stopwords,
}
# Alias under which the training loop reads the settings (same dict object).
BEST_VECT_ARG = best_vectorizer

## Train the reduced model

In [None]:
# Topic counts to reduce to; the range end is exclusive, so this covers
# 373 and 374.
reduction_range = range(373, 375, 1)
# Untouched copy of the loaded model. Each iteration clones it again so that
# every reduction starts from the same state.
original_model = deepcopy(BERTopic_model)

# Cut-offs at which topic diversity is reported; identical for every run.
top_n_values = [5, 10, 20, 30]

# For each target topic count: reduce, recompute topic words, export the
# words / visualization / model, then score coherence and diversity.
# A failure in any step skips the remaining steps for that topic count only.
for nr_topics in reduction_range:
    print(f"\nReducing to {nr_topics} topics...")

    try:
        # Clone the model and reduce topics
        model_reduced = deepcopy(original_model)
        model_reduced.reduce_topics(documents, nr_topics=nr_topics)

        # Reapply the custom vectorizer so the top words are recomputed with
        # the project stop-word list.
        model_reduced.update_topics(
            documents,
            vectorizer_model=CountVectorizer(**BEST_VECT_ARG),
            top_n_words=TOP_DIVERSITY_WORDS_N,
        )

        # Map topic id -> display name for the CSV export.
        topic_info_df = model_reduced.get_topic_info()
        topic_labels = topic_info_df.set_index("Topic")["Name"].to_dict()

        topic_word_data = []
        topic_words_for_coherence = []

        # Loop through topics
        for topic_id, word_list in model_reduced.get_topics().items():
            # Topic -1 is excluded from exports and scoring
            # (presumably BERTopic's outlier topic).
            if topic_id == -1:
                continue

            topic_label = topic_labels.get(topic_id, f"Topic {topic_id}")

            # For the diversity export, keep the top TOP_DIVERSITY_WORDS_N
            # entries together with their rank and weight.
            for rank, (word, weight) in enumerate(word_list[:TOP_DIVERSITY_WORDS_N], start=1):
                topic_word_data.append({
                    "topic": topic_id,
                    "topic_label": topic_label,
                    "word_rank": rank,
                    "word": word,
                    "weight": weight
                })

            # For coherence, split multi-word expressions into single tokens
            # so they can be looked up in the whitespace-token dictionary.
            # NOTE(review): splitting bigrams can repeat a token and can push
            # real top terms past the topn cut-off - confirm this is intended.
            top_words = [
                token
                for word, _ in word_list[:TOP_COHERENCE_WORDS_N]
                for token in word.split()
            ]
            # A topic whose top entries are all empty strings yields no
            # tokens; skip it instead of handing an empty topic to the
            # coherence model.
            if top_words:
                topic_words_for_coherence.append(top_words)

        # Save topic words to CSV
        df_words = pd.DataFrame(topic_word_data)
        df_words.to_csv(os.path.join(results_dir, f"reduced_{nr_topics}_topics_top{TOP_DIVERSITY_WORDS_N}_words_{date_today}.csv"), index=False)
        print(f"Saved topic words for {nr_topics} topics.")

        # Save interactive visualization
        vis_path = os.path.join(results_dir, f"reduced_{nr_topics}_topics_visualization_{date_today}.html")
        model_reduced.visualize_topics().write_html(vis_path)
        print(f"Saved visualization to {vis_path}")

        # Save reduced model
        model_path = os.path.join(results_dir, f"bertopic_model_reduced_{nr_topics}_{date_today}")
        model_reduced.save(model_path)
        print(f"Saved reduced model to {model_path}")

        # Number of non-empty topics that take part in coherence scoring
        print(f"Using {len(topic_words_for_coherence)} topics for scoring")

        # Coherence
        coherence_model = models.CoherenceModel(
            topics=topic_words_for_coherence,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v',
            topn=TOP_COHERENCE_WORDS_N
        )
        coherence_score = coherence_model.get_coherence()
        print(f"Coherence (c_v): {coherence_score:.4f}")

        # Diversity at each configured cut-off
        diversity_scores = {}
        for top_n in top_n_values:
            div_score = topic_diversity(model_reduced, top_n=top_n, model_type='bertopic')
            diversity_scores[f"diversity_score_top{top_n}"] = div_score
            print(f"Diversity top-{top_n}: {div_score:.4f}")

        # Save all scores as a one-row CSV for this topic count
        result = {
            "nr_topics": nr_topics,
            "coherence_c_v": coherence_score,
            **diversity_scores
        }
        df_result = pd.DataFrame([result])
        df_result.to_csv(os.path.join(results_dir, f"scores_{nr_topics}_topics_{date_today}.csv"), index=False)
        print(f"Saved scores for {nr_topics} topics.")

    except Exception as exc:
        # Deliberate best-effort: one failing topic count must not abort the
        # sweep. The exception type is included because logging and warnings
        # are silenced elsewhere in this notebook, so this line is the only
        # diagnostic.
        print(f"Skipping {nr_topics} topics due to error: {type(exc).__name__}: {exc}")

## Visualize The Reduction Results

In [None]:
# Load the reduction scores of an earlier run for plotting.
# NOTE: this rebinds `df`, which held the tweets table until now; re-running
# the preparation cell afterwards requires reloading the tweets first.
# NOTE(review): this file is not written by the loop in this notebook -
# presumably it was assembled from the per-run score CSVs elsewhere; confirm.
df = pd.read_csv(
    "../results/20250531_0008_BERTOPIC_reduced/_overall_reduced_stats_20250531_0008.csv"
)
df.info()

### Coherence Score Viz

In [None]:
# Line chart of the c_v coherence score against the number of topics.
plt.figure(figsize=(10, 6))
# seaborn draws on the current axes and returns it; labels and grid are set
# on that same axes object.
coherence_ax = sns.lineplot(data=df, x='nr_topics', y='coherence_c_v')
coherence_ax.set_title('Coherence Score vs Number of Topics')
coherence_ax.set_xlabel('Number of Topics')
coherence_ax.set_ylabel('Coherence Score')
coherence_ax.grid(True)
plt.tight_layout()
plt.show()

### Diversity Score Viz

In [None]:
# Line chart of topic diversity against the number of topics, one line per
# top-N cut-off stored in the stats table.
plt.figure(figsize=(10, 6))
for cutoff in (5, 10, 20, 30):
    sns.lineplot(
        data=df,
        x='nr_topics',
        y=f'diversity_score_top{cutoff}',
        label=f'Top {cutoff}',
    )
plt.title('Topic Diversity vs Number of Topics')
plt.xlabel('Number of Topics')
plt.ylabel('Diversity Score')
plt.legend(title='Top-N Words')
plt.grid(True)
plt.tight_layout()
plt.show()