# Top2Vec Training

## Dependencies

In [None]:
import pandas as pd
import os
import datetime
import warnings
from top2vec import Top2Vec
from gensim import corpora, models
from src.utils.topic_diversity import topic_diversity
import itertools
import logging

# Suppress every Python warning and disable logging calls of severity CRITICAL
# and below for the rest of the session.
warnings.filterwarnings(action="ignore")
logging.disable(level=logging.CRITICAL)

## Load Dataset

In [None]:
# Read the input CSV into a DataFrame and print its column/dtype/memory summary.
DATASET_PATH = "../data/processed/20250515_1207_minimal_clean_merged_tweets.csv"
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.info()

## Preparation / Config

In [None]:
# --- Output location ---------------------------------------------------------
# Label embedded in the results directory name.
model_name = 'TOP2VEC'

# Run timestamp formatted as YYYYMMDD_HHMM; reused in every output file name.
date_today = datetime.datetime.now().strftime("%Y%m%d_%H%M")

# Per-run output directory; created now, silently reused if it already exists.
results_dir = f"../results/{date_today}_{model_name}"
os.makedirs(results_dir, exist_ok=True)

# --- Evaluation settings -----------------------------------------------------
# Number of leading words kept from each topic before scoring.
TOP_DIVERSITY_WORDS_N = 30
# `topn` passed to the coherence model.
TOP_COHERENCE_WORDS_N = 10

# --- Corpus ------------------------------------------------------------------
# NOTE(review): astype(str) may turn missing values into the literal "nan";
# confirm that `final_text` has no missing entries.
documents = list(df['final_text'].astype(str))
print(f"{len(documents)} rows from full dataset.")

# Whitespace tokenisation of the documents; the dictionary is built from these
# tokens and both are handed to the coherence model.
tokenized_texts = [str(text).split() for text in documents]
dictionary = corpora.Dictionary(tokenized_texts)

## Hyperparameter Embedding Models

In [None]:
# Embedding back-ends to compare; only `documents`, `embedding_model` and
# `workers` are passed to Top2Vec in this search.
embedding_models = [
    'doc2vec',
    'universal-sentence-encoder',
    'universal-sentence-encoder-large',
    'all-MiniLM-L6-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
]

# One metrics row per candidate, plus the running best by c_v coherence.
embedding_model_hyperparameter = []
best_coherence = float("-inf")
best_embedding_model = None

# Run grid search over the embedding models
for embedding_model in embedding_models:
    # Train Top2Vec model
    # NOTE(review): the later search cells use workers=12; 10 is kept here unchanged.
    top2vec_model = Top2Vec(
        documents=documents,
        embedding_model=embedding_model,
        workers=10,
    )

    # Topic words: keep the leading TOP_DIVERSITY_WORDS_N words of every topic
    topics_words, _, _ = top2vec_model.get_topics()
    topic_words_list = [topic[:TOP_DIVERSITY_WORDS_N] for topic in topics_words]

    # Compute Coherence (c_v) against the whitespace-tokenised corpus
    coherence_model = models.CoherenceModel(
        topics=topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Topic stats
    topic_sizes, _ = top2vec_model.get_topic_sizes()
    avg_topic_size = sum(topic_sizes) / len(topic_sizes)
    num_topics = len(topics_words)

    # Store result
    embedding_model_hyperparameter.append({
        "embedding_model": embedding_model,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score,
    })

    print(f"embedding_model={embedding_model}, num_topics={num_topics}, avg_topic_size={avg_topic_size} coherence={coherence_score:.4f}")

    # Track best. A NaN score compares False here and therefore never wins.
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_embedding_model = embedding_model

# Save Results first, so the per-model metrics are on disk even when no
# winner can be selected below.
df_embedding_model_hyperparameter = pd.DataFrame(embedding_model_hyperparameter)
df_embedding_model_hyperparameter.to_csv(os.path.join(results_dir, f"embedding_model_hyperparameter_{date_today}.csv"), index=False)
print(f"Filter Embedding Model saved in: {results_dir} ")

# Fail here, with a clear message, instead of handing `None` to later cells.
if best_embedding_model is None:
    raise RuntimeError(
        "No embedding model produced a comparable coherence score; "
        "inspect the saved embedding_model_hyperparameter CSV."
    )

# Summary
print("\nBest Filtering Parameters:")
print(f"Best Embedding Model: {best_embedding_model}")

## Hyperparameter UMAP & HDBSCAN

In [None]:
# Embedding model selected by the previous search cell.
BEST_EMBEDDING = best_embedding_model

# Define search ranges: every UMAP setting is paired with every HDBSCAN setting.
umap_args = [
    {"n_neighbors": 15, "min_dist": 0.1, "n_components": 5, "metric": "cosine"},
    {"n_neighbors": 30, "min_dist": 0.0, "n_components": 10, "metric": "cosine"},
    {"n_neighbors": 10, "min_dist": 0.25, "n_components": 5, "metric": "cosine"},
]

hdbscan_args = [
    {"min_cluster_size": 30, "min_samples": 10, "cluster_selection_method": "eom"},
    {"min_cluster_size": 15, "min_samples": 5, "cluster_selection_method": "eom"},
    {"min_cluster_size": 50, "min_samples": 15, "cluster_selection_method": "leaf"},
]

# One metrics row per combination, plus the running best by c_v coherence.
umap_hdbscan_hyperparameter = []
best_coherence = float("-inf")
best_umap_hdbscan = None

# Run grid search over the UMAP x HDBSCAN combinations
for umap_arg, hdbscan_arg in itertools.product(umap_args, hdbscan_args):
    # Train Top2Vec model
    top2vec_model = Top2Vec(
        documents=documents,
        embedding_model=BEST_EMBEDDING,
        workers=12,
        umap_args=umap_arg,
        hdbscan_args=hdbscan_arg,
    )

    # Topic words: keep the leading TOP_DIVERSITY_WORDS_N words of every topic
    topics_words, _, _ = top2vec_model.get_topics()
    topic_words_list = [topic[:TOP_DIVERSITY_WORDS_N] for topic in topics_words]

    # Compute Coherence (c_v) against the whitespace-tokenised corpus
    coherence_model = models.CoherenceModel(
        topics=topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Topic stats
    topic_sizes, _ = top2vec_model.get_topic_sizes()
    avg_topic_size = sum(topic_sizes) / len(topic_sizes)
    num_topics = len(topics_words)

    # Store result
    umap_hdbscan_hyperparameter.append({
        "embedding_model": BEST_EMBEDDING,
        "umap_arg": umap_arg,
        "hdbscan_arg": hdbscan_arg,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score,
    })

    print(f"umap_arg={umap_arg}, hdbscan_arg={hdbscan_arg}, num_topics={num_topics}, avg_topic_size={avg_topic_size} coherence={coherence_score:.4f}")

    # Track best. A NaN score compares False here and therefore never wins.
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_umap_hdbscan = (umap_arg, hdbscan_arg)

# Save Results first: the summary below indexes into `best_umap_hdbscan`, and
# the grid's metrics should be on disk even if no winner could be selected.
df_umap_hdbscan_hyperparameter = pd.DataFrame(umap_hdbscan_hyperparameter)
df_umap_hdbscan_hyperparameter.to_csv(os.path.join(results_dir, f"umap_hdbscan_hyperparameter_{date_today}.csv"), index=False)
print(f"Filter Umap HDBScan results saved in: {results_dir} ")

# Fail with a clear message instead of a TypeError from subscripting None.
if best_umap_hdbscan is None:
    raise RuntimeError(
        "No UMAP/HDBSCAN combination produced a comparable coherence score; "
        "inspect the saved umap_hdbscan_hyperparameter CSV."
    )

# Summary
print("\nBest Umap HDBScan Parameters:")
print(f"Best Umap Args: {best_umap_hdbscan[0]}, Best Hdbscan Args: {best_umap_hdbscan[1]}")

## Hyperparameter min_counts & topic_merge_deltas

In [None]:
# Settings selected by the previous search cells.
BEST_EMBEDDING = best_embedding_model
BEST_UMAP_ARGS = best_umap_hdbscan[0]
BEST_HDBSCAN_ARGS = best_umap_hdbscan[1]

# Define search ranges
min_counts = [30, 50, 100]
topic_merge_deltas = [0.05, 0.1, 0.2]

# One metrics row per combination, plus the running best by c_v coherence.
count_delta_hyperparameter = []
best_coherence = float("-inf")
best_count_delta = None

# Run grid search over the min_count x topic_merge_delta combinations
for min_count, topic_merge_delta in itertools.product(min_counts, topic_merge_deltas):
    # Train Top2Vec model
    top2vec_model = Top2Vec(
        documents=documents,
        embedding_model=BEST_EMBEDDING,
        workers=12,
        umap_args=BEST_UMAP_ARGS,
        hdbscan_args=BEST_HDBSCAN_ARGS,
        min_count=min_count,
        topic_merge_delta=topic_merge_delta,
    )

    # Topic words: keep the leading TOP_DIVERSITY_WORDS_N words of every topic
    topics_words, _, _ = top2vec_model.get_topics()
    topic_words_list = [topic[:TOP_DIVERSITY_WORDS_N] for topic in topics_words]

    # Compute Coherence (c_v) against the whitespace-tokenised corpus
    coherence_model = models.CoherenceModel(
        topics=topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Topic stats
    topic_sizes, _ = top2vec_model.get_topic_sizes()
    avg_topic_size = sum(topic_sizes) / len(topic_sizes)
    num_topics = len(topics_words)

    # Store result
    count_delta_hyperparameter.append({
        "embedding_model": BEST_EMBEDDING,
        "umap_arg": BEST_UMAP_ARGS,
        "hdbscan_arg": BEST_HDBSCAN_ARGS,
        "min_count": min_count,
        "topic_merge_delta": topic_merge_delta,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score,
    })

    print(f"min_count={min_count}, topic_merge_delta={topic_merge_delta}, num_topics={num_topics}, avg_topic_size={avg_topic_size} coherence={coherence_score:.4f}")

    # Track best. A NaN score compares False here and therefore never wins.
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_count_delta = (min_count, topic_merge_delta)

# Save Results first: the summary below indexes into `best_count_delta`, and
# the grid's metrics should be on disk even if no winner could be selected.
df_count_delta_hyperparameter = pd.DataFrame(count_delta_hyperparameter)
df_count_delta_hyperparameter.to_csv(os.path.join(results_dir, f"count_delta_hyperparameter_{date_today}.csv"), index=False)
print(f"Filter Count Delta results saved in: {results_dir} ")

# Fail with a clear message instead of a TypeError from subscripting None.
if best_count_delta is None:
    raise RuntimeError(
        "No min_count/topic_merge_delta combination produced a comparable "
        "coherence score; inspect the saved count_delta_hyperparameter CSV."
    )

# Summary
print("\nBest Count Delta:")
print(f"Best Min Count: {best_count_delta[0]}, Best Merge Delta: {best_count_delta[1]}")

## Final Model

In [None]:
# Settings selected by the three search cells.
BEST_EMBEDDING = best_embedding_model
BEST_UMAP_ARGS = best_umap_hdbscan[0]
BEST_HDBSCAN_ARGS = best_umap_hdbscan[1]
BEST_MIN_COUNT = best_count_delta[0]
BEST_MERGE_DELTA = best_count_delta[1]

# Train final Top2Vec model
top2vec_model = Top2Vec(
    documents=documents,
    embedding_model=BEST_EMBEDDING,
    workers=12,
    umap_args=BEST_UMAP_ARGS,
    hdbscan_args=BEST_HDBSCAN_ARGS,
    min_count=BEST_MIN_COUNT,
    topic_merge_delta=BEST_MERGE_DELTA,
)

# Get topic words: keep the leading TOP_DIVERSITY_WORDS_N words of every topic
topics_words, _, _ = top2vec_model.get_topics()
topic_words_list = [topic[:TOP_DIVERSITY_WORDS_N] for topic in topics_words]

# Compute Coherence Score
coherence_model = models.CoherenceModel(
    topics=topic_words_list,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_COHERENCE_WORDS_N,
)
coherence_score = coherence_model.get_coherence()
print(f"Final Top2Vec model coherence (c_v): {coherence_score:.4f}")

# Compute Topic Diversity Scores for several top-n cut-offs
top_n_values = [5, 10, 20, 30]
diversity_score_results = []
for top_n in top_n_values:
    diversity_score = topic_diversity(top2vec_model, top_n=top_n, model_type='top2vec')
    diversity_score_results.append({"top_n": top_n, "topic_diversity": diversity_score})
    print(f"top_n: {top_n} topic_diversity: {diversity_score}")

# Save diversity scores
df_diversity = pd.DataFrame(diversity_score_results)
df_diversity.to_csv(os.path.join(results_dir, f"top2vec_topic_diversity_scores_{date_today}.csv"), index=False)

# Save topic-word distributions (one row per topic/word pair)
topic_word_list = []
for i, topic_words in enumerate(topic_words_list):
    for word in topic_words:
        topic_word_list.append({"topic": i, "word": word})
df_topics = pd.DataFrame(topic_word_list)
df_topics.to_csv(os.path.join(results_dir, f"top2vec_topic_word_distributions_{date_today}.csv"), index=False)

# Save document-topic distribution.
# get_documents_topics returns a 4-tuple (topic numbers, topic scores, topic
# words, word scores), each aligned with `doc_ids`. Iterating over the tuple
# itself would give four rows of whole arrays, so unpack it and emit one row
# per document instead.
doc_topics_list = top2vec_model.get_documents_topics(doc_ids=list(range(len(documents))))
doc_topic_nums, doc_topic_scores, _, _ = doc_topics_list

doc_topics = []
for i, (topic, score) in enumerate(zip(doc_topic_nums, doc_topic_scores)):
    doc_topics.append({
        "doc_id": i,
        "topic": int(topic),
        "topic_score": float(score),
    })

df_doc_topics = pd.DataFrame(doc_topics)
df_doc_topics.to_csv(os.path.join(results_dir, f"top2vec_document_topic_distributions_{date_today}.csv"), index=False)

# Save model
top2vec_model.save(os.path.join(results_dir, f"top2vec_model_{date_today}.model"))

# Save Summary: chosen hyperparameters, corpus stats, and all quality metrics
summary = {
    "embedding_model": BEST_EMBEDDING,
    "min_count": BEST_MIN_COUNT,
    "topic_merge_delta": BEST_MERGE_DELTA,
    "umap_args": BEST_UMAP_ARGS,
    "hdbscan_args": BEST_HDBSCAN_ARGS,
    "coherence_score": coherence_score,
    "dictionary_size": len(dictionary),
    "num_documents": len(documents),
    "num_topics": len(topics_words),
}
for row in diversity_score_results:
    summary[f"diversity_score_top{row['top_n']}"] = row["topic_diversity"]

pd.DataFrame([summary]).to_csv(os.path.join(results_dir, f"top2vec_model_summary_{date_today}.csv"), index=False)
print(f"Final Top2Vec model, topics, and summaries saved to: {results_dir}")

### View Top-30 Words per Topic

In [None]:
# get_topics() returns (topic words, word scores, topic numbers); the word
# scores are aligned element-wise with the topic words.
topics_words, word_scores, _ = top2vec_model.get_topics()

# Build structured list, truncated to the TOP_DIVERSITY_WORDS_N words that the
# file name and the printed headings promise.
topic_word_data = []
for topic_num, (word_list, score_list) in enumerate(zip(topics_words, word_scores)):
    top_words = word_list[:TOP_DIVERSITY_WORDS_N]
    top_scores = score_list[:TOP_DIVERSITY_WORDS_N]
    for rank, (word, score) in enumerate(zip(top_words, top_scores), start=1):
        topic_word_data.append({
            "topic": topic_num,
            "word_rank": rank,
            "word": word,
            # Per the Top2Vec docs this is the word's cosine similarity to the
            # topic vector (not a probability).
            "weight": float(score),
        })

# Convert to DataFrame
df_topic_words = pd.DataFrame(topic_word_data)

# Save to CSV
topic_words_filename = os.path.join(results_dir, f"top2vec_top{TOP_DIVERSITY_WORDS_N}_words_per_topic_{date_today}.csv")
df_topic_words.to_csv(topic_words_filename, index=False)

print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

# Print Sample Preview
print(f"\nTop {TOP_DIVERSITY_WORDS_N} Words per Topic:")
for topic_num, word_list in enumerate(topics_words):
    words_only = word_list[:TOP_DIVERSITY_WORDS_N]
    print(f"Topic {topic_num}: {', '.join(words_only)}")