# Reduced Top2Vec Training

## Dependencies

In [None]:
import pandas as pd
import os
import datetime
import warnings
from top2vec import Top2Vec
from gensim import corpora, models
import logging
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the notebook output readable: suppress every log record up to and
# including CRITICAL, and hide all Python warnings.
# NOTE(review): this also hides messages that could explain a failed or
# degraded run; comment these two lines out while debugging.
logging.disable(logging.CRITICAL)
warnings.filterwarnings("ignore")

## Load Dataset

In [None]:
# Read the processed tweet CSV into a DataFrame and print its column summary.
processed_csv_path = "../data/processed/20250515_1207_minimal_clean_merged_tweets.csv"
df = pd.read_csv(processed_csv_path)
df.info()

## Preparation

In [None]:
# --- Run configuration ---
# Label that ends up in the name of this run's results folder.
model_name = 'TOP2VEC_reduced'

# Timestamp taken at execution time, formatted as YYYYMMDD_HHMM. It tags the
# results folder and every file written during this run.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Results folder for this run; exist_ok=True makes re-running the cell safe.
results_dir = f"../results/{date_today}_{model_name}"
os.makedirs(results_dir, exist_ok=True)

# How many leading words per topic are exported / fed to the coherence model,
# and how many of those the coherence model actually scores (its topn).
TOP_DIVERSITY_WORDS_N = 30
TOP_COHERENCE_WORDS_N = 10

# --- Load the trained Top2Vec model ---
# NOTE(review): this path is built from the *current* timestamp, so it only
# resolves if the base model was saved within the same minute this cell runs.
# Presumably it should point at the timestamp of the original training run;
# confirm and replace with that fixed value.
model_path = f"../results/{date_today}_TOP2VEC/top2vec_model_{date_today}.model"
top2vec_model = Top2Vec.load(model_path)

# --- Tokenized corpus for gensim ---
# Each document is split on whitespace; the resulting token lists feed both
# the gensim dictionary and, later, the coherence computation.
documents = df['final_text'].astype(str).tolist()
tokenized_texts = [text.split() for text in documents]
dictionary = corpora.Dictionary(tokenized_texts)

### Custom Diversity Score Function

In [None]:
def compute_topic_diversity_from_topics(topics_words, top_n: int = 10) -> float:
    """Return the share of unique words among the top words of all topics.

    The first ``top_n`` words of every topic are pooled together; the score is
    the number of distinct words in that pool divided by the pool size. A
    score of 1.0 means no word is shared between topics, while values close
    to 0 mean the topics largely repeat the same words.

    Args:
        topics_words: Iterable of topics, each a sliceable sequence of words
            (for example a list of lists or a 2-D array of strings).
        top_n: Number of leading words taken from each topic.

    Returns:
        The diversity score, or 0.0 when no words were collected (no topics,
        only empty topics, or ``top_n`` of 0) instead of dividing by zero.
    """
    pooled_words = [word for topic in topics_words for word in topic[:top_n]]
    if not pooled_words:
        # Nothing to score; avoid ZeroDivisionError on an empty pool.
        return 0.0
    return len(set(pooled_words)) / len(pooled_words)

## Main Topic Reduction Loop

In [None]:
# Sweep over progressively smaller topic counts (100, 75, 50, 25). For each
# target the loaded model is reduced, the reduced topics are scored
# (c_v coherence and topic diversity), and the scores, topic words and the
# model itself are written into results_dir.
for num_topics_target in range(100, 24, -25):
    print(f"Reduced to {num_topics_target} topics")

    # Merge topics hierarchically until num_topics_target remain.
    top2vec_model.hierarchical_topic_reduction(num_topics=num_topics_target)

    # reduced=True is required to read the merged topics. Without it
    # get_topics() returns the original, unreduced topics, and slicing the
    # first N of those would only keep the N largest original topics.
    topics_words, _, _ = top2vec_model.get_topics(reduced=True)

    # Leading words of every topic; reused for coherence and for both exports.
    topic_words_list = [topic[:TOP_DIVERSITY_WORDS_N] for topic in topics_words]

    # Compute Coherence Score (only the first TOP_COHERENCE_WORDS_N words of
    # each topic are scored, via topn).
    coherence_model = models.CoherenceModel(
        topics=topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    print(f"Final Top2Vec model coherence (c_v): {coherence_score:.4f}")

    # Compute Topic Diversity Scores at several cut-offs
    top_n_values = [5, 10, 20, 30]
    diversity_score_results = []
    for top_n in top_n_values:
        diversity_score = compute_topic_diversity_from_topics(topics_words, top_n=top_n)
        diversity_score_results.append({"top_n": top_n, "topic_diversity": diversity_score})
        print(f"top_n: {top_n} topic_diversity: {diversity_score}")

    # Save diversity scores
    df_diversity = pd.DataFrame(diversity_score_results)
    df_diversity.to_csv(os.path.join(results_dir, f"diversity_scores_reduced_{num_topics_target}_top2vec_topic_{date_today}.csv"), index=False)

    # Save topic-word distributions: one (topic, word) row per leading word
    topic_word_list = [
        {"topic": topic_num, "word": word}
        for topic_num, word_list in enumerate(topic_words_list)
        for word in word_list
    ]
    df_topics = pd.DataFrame(topic_word_list)
    df_topics.to_csv(os.path.join(results_dir, f"topic_word_distributions_reduced_{num_topics_target}_top2vec_{date_today}.csv"), index=False)

    # Save model
    top2vec_model.save(os.path.join(results_dir, f"model_reduced_{num_topics_target}_top2vec_{date_today}.model"))

    # Save Summary: one row with the coherence score, corpus sizes and every
    # diversity score as its own column.
    summary = {
        "coherence_score": coherence_score,
        "dictionary_size": len(dictionary),
        "num_documents": len(documents),
        "num_topics": len(topics_words),
    }
    for row in diversity_score_results:
        summary[f"diversity_score_top{row['top_n']}"] = row["topic_diversity"]

    pd.DataFrame([summary]).to_csv(os.path.join(results_dir, f"scores_model_summary_reduced_{num_topics_target}_top2vec_{date_today}.csv"), index=False)

    print(f"Final Reduced Top2Vec to {num_topics_target} topic model, topics, and summaries saved to: {results_dir}")

    # Ranked word table, limited to the top TOP_DIVERSITY_WORDS_N words so
    # the content matches the file name and the log message below.
    topic_word_data = [
        {"topic": topic_num, "word_rank": rank, "word": word}
        for topic_num, word_list in enumerate(topic_words_list)
        for rank, word in enumerate(word_list, start=1)
    ]

    # Convert to DataFrame
    df_topic_words = pd.DataFrame(topic_word_data)

    # Save to CSV
    topic_words_filename = os.path.join(results_dir, f"top{TOP_DIVERSITY_WORDS_N}_reduced_{num_topics_target}_top2vec_words_per_topic_{date_today}.csv")
    df_topic_words.to_csv(topic_words_filename, index=False)
    print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

## Visualize The Reduction Results

In [None]:
# Load the aggregated per-run statistics used by the plots below and print
# the column summary. Note that this rebinds `df`, replacing the tweet data.
overall_stats_path = "../results/20250518_1841_TOP2VEC_reduced/__overall_stats_20250518_1841_TOP2VEC_reduced.csv"
df = pd.read_csv(overall_stats_path)
df.info()

### Coherence Score Viz

In [None]:
# Line chart of the coherence score against the number of topics.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='num_topics', y='coherence_score', ax=ax)
ax.set_title('Coherence Score vs Number of Topics')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score')
ax.grid(True)
fig.tight_layout()
plt.show()

### Diversity Score Viz

In [None]:
# One line per top-N cut-off, all drawn on the same axes, showing how topic
# diversity changes with the number of topics.
plt.figure(figsize=(10, 6))
for cutoff in (5, 10, 20, 30):
    sns.lineplot(
        x='num_topics',
        y=f'diversity_score_top{cutoff}',
        data=df,
        label=f'Top {cutoff}',
    )
plt.title('Topic Diversity vs Number of Topics')
plt.xlabel('Number of Topics')
plt.ylabel('Diversity Score')
plt.legend(title='Top-N Words')
plt.grid(True)
plt.tight_layout()
plt.show()