In [1]:
# Install required packages
%pip install bertopic sentence-transformers umap-learn hdbscan gensim plotly scikit-learn pyyaml gdown -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# ============================================================
# PROJECT SETUP - Config-based with Train/Infer Modes
# ============================================================
import os
import yaml
from pathlib import Path

# Clone repo if running fresh on Colab
if 'google.colab' in str(get_ipython()) and not os.path.exists('/content/BERTopic-arXiv-Analysis'):
    !git clone https://github.com/pavannn16/BERTopic-arXiv-Analysis.git /content/BERTopic-arXiv-Analysis

# Load configuration
def load_config():
    config_paths = ['config.yaml', '../config.yaml', '/content/BERTopic-arXiv-Analysis/config.yaml']
    for path in config_paths:
        if os.path.exists(path):
            with open(path, 'r') as f:
                return yaml.safe_load(f), path
    return None, None

config, config_path = load_config()
if config:
    print(f"Loaded config from {config_path}")
else:
    config = {'mode': 'infer'}

MODE = config.get('mode', 'infer')
print(f"Mode: {MODE.upper()}")

# Note: Hyperparameter tuning is a TRAIN-only operation
# In INFER mode, we load and display existing results
if MODE == 'infer':
    print("INFER mode: Will load existing hyperparameter tuning results")

# Setup paths
if 'google.colab' in str(get_ipython()):
    if MODE == 'train':
        from google.colab import drive
        drive.mount('/content/drive')
        PROJECT_PATH = '/content/drive/MyDrive/BERTopic-arXiv-Analysis'
        print("TRAIN mode: Personal Drive mounted")
    else:
        PROJECT_PATH = '/content/BERTopic-arXiv-Analysis'
        print("INFER mode: Using data from cloned repo")
else:
    PROJECT_PATH = str(Path(os.getcwd()).parent) if 'notebooks' in os.getcwd() else os.getcwd()
    print("Running locally")

for folder in ['data/raw', 'data/processed', 'data/embeddings', 'models', 'results']:
    os.makedirs(f'{PROJECT_PATH}/{folder}', exist_ok=True)

print(f"Project path: {PROJECT_PATH}")

Cloning into '/content/BERTopic-arXiv-Analysis'...
remote: Enumerating objects: 241, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 241 (delta 59), reused 67 (delta 38), pack-reused 143 (from 1)[K
Receiving objects: 100% (241/241), 181.40 MiB | 16.60 MiB/s, done.
Resolving deltas: 100% (127/127), done.
Updating files: 100% (66/66), done.
Loaded config from /content/BERTopic-arXiv-Analysis/config.yaml
Mode: TRAIN
Mounted at /content/drive
TRAIN mode: Personal Drive mounted
Project path: /content/drive/MyDrive/BERTopic-arXiv-Analysis


In [3]:
# Import libraries
import pandas as pd
import numpy as np
import json
import time
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from collections import Counter

# BERTopic components
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Evaluation
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("Libraries imported!")

Libraries imported!


In [4]:
# Check GPU
import torch
if torch.cuda.is_available():
    device = 'cuda'
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():
    device = 'mps'
    print("Apple Silicon GPU available")
else:
    device = 'cpu'
    print("No GPU - using CPU (will be slower)")

GPU available: NVIDIA L4


## 1. Load Processed Data

In [5]:
# Load processed data
df = pd.read_csv(f"{PROJECT_PATH}/data/processed/arxiv_cs_ai_processed.csv")
documents = df['text'].tolist()

print(f"Loaded {len(documents)} documents")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")

# Prepare tokenized docs for coherence computation
tokenized_docs = [doc.lower().split() for doc in documents]
dictionary = Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.95)
print(f"Dictionary size: {len(dictionary)}")

Loaded 19998 documents
Date range: 2025-07-03 to 2025-12-02
Dictionary size: 24009


## 2. Embedding Model Comparison

We compare two popular Sentence-BERT models:

| Model | Parameters | Speed | Quality |
|-------|------------|-------|--------|
| **all-mpnet-base-v2** | 110M | Slower | Higher |
| **all-MiniLM-L6-v2** | 22M | 5x faster | Good |

In [6]:
# Define embedding models to compare
EMBEDDING_MODELS = {
    'mpnet': 'all-mpnet-base-v2',
    'minilm': 'all-MiniLM-L6-v2'
}

embeddings_dict = {}

for name, model_name in EMBEDDING_MODELS.items():
    embeddings_path = f"{PROJECT_PATH}/data/embeddings/embeddings_{name}.npy"

    if os.path.exists(embeddings_path):
        print(f"Loading pre-computed {name} embeddings...")
        embeddings_dict[name] = np.load(embeddings_path)
    else:
        print(f"\nComputing embeddings with {model_name}...")
        start_time = time.time()

        model = SentenceTransformer(model_name)
        embeddings = model.encode(
            documents,
            batch_size=64,
            show_progress_bar=True,
            convert_to_numpy=True,
            device=device
        )

        elapsed = time.time() - start_time
        print(f"  Time: {elapsed:.1f}s ({len(documents)/elapsed:.1f} docs/sec)")

        # Save embeddings
        np.save(embeddings_path, embeddings)
        embeddings_dict[name] = embeddings
        print(f"  Saved to {embeddings_path}")

print(f"\nEmbeddings ready:")
for name, emb in embeddings_dict.items():
    print(f"  {name}: {emb.shape}")

Loading pre-computed mpnet embeddings...
Loading pre-computed minilm embeddings...

Embeddings ready:
  mpnet: (19998, 768)
  minilm: (19998, 384)


## 3. Evaluation Functions

In [7]:
def compute_metrics(topic_model, documents, embeddings, topics, tokenized_docs, dictionary):
    """
    Compute all evaluation metrics for a BERTopic model.

    Returns:
        dict with coherence, diversity, silhouette, n_topics, outlier_pct
    """
    # Get topic words
    topics_dict = topic_model.get_topics()
    if -1 in topics_dict:
        del topics_dict[-1]

    n_topics = len(topics_dict)

    # Outlier percentage
    n_outliers = sum(1 for t in topics if t == -1)
    outlier_pct = 100 * n_outliers / len(topics)

    # Topic Diversity
    all_words = []
    topic_words_list = []
    for topic_id in sorted(topics_dict.keys()):
        words = [word for word, _ in topics_dict[topic_id][:10]]
        topic_words_list.append(words)
        all_words.extend(words)

    diversity = len(set(all_words)) / len(all_words) if all_words else 0

    # Coherence (NPMI)
    try:
        coherence_model = CoherenceModel(
            topics=topic_words_list,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence='c_npmi'
        )
        coherence = coherence_model.get_coherence()
    except:
        coherence = 0.0

    # Silhouette Score
    try:
        mask = np.array(topics) != -1
        if mask.sum() > 100 and len(set(np.array(topics)[mask])) > 1:
            silhouette = silhouette_score(embeddings[mask], np.array(topics)[mask])
        else:
            silhouette = 0.0
    except:
        silhouette = 0.0

    return {
        'n_topics': n_topics,
        'outlier_pct': outlier_pct,
        'coherence': coherence,
        'diversity': diversity,
        'silhouette': silhouette,
        'combined_score': coherence * 0.5 + diversity * 0.3 + (1 - outlier_pct/100) * 0.2
    }

print("Evaluation functions defined!")

Evaluation functions defined!


## 4. Hyperparameter Grid Search

We'll search over:
- **Embedding model**: MPNet vs MiniLM
- **min_cluster_size**: [10, 15, 20, 30, 50]
- **n_neighbors**: [10, 15, 25]
- **n_components**: [5, 10]

In [8]:
# Define hyperparameter grid
PARAM_GRID = {
    'embedding_model': ['mpnet', 'minilm'],
    'min_cluster_size': [10, 15, 20, 30, 50],
    'n_neighbors': [10, 15, 25],
    'n_components': [5, 10]
}

# Calculate total combinations
n_combinations = 1
for values in PARAM_GRID.values():
    n_combinations *= len(values)

print(f"Total hyperparameter combinations: {n_combinations}")

if MODE == 'infer':
    print("INFER mode: Will load existing search results instead of running grid search")
else:
    print(f"Estimated time: {n_combinations * 0.5:.0f} minutes")

Total hyperparameter combinations: 60
Estimated time: 30 minutes


In [9]:
# ============================================================
# Run grid search (TRAIN mode) or load existing results (INFER mode)
# ============================================================
if MODE == 'infer':
    print("INFER mode: Skipping grid search (loading existing results)")
    print("   Grid search takes ~30 minutes - results are pre-computed")
else:
    # TRAIN MODE - Run full grid search
    results = []
    best_score = -float('inf')
    best_params = None
    best_model = None

    print("Starting hyperparameter search...\n")

    for emb_name in PARAM_GRID['embedding_model']:
        embeddings = embeddings_dict[emb_name]

        for min_cluster_size in PARAM_GRID['min_cluster_size']:
            for n_neighbors in PARAM_GRID['n_neighbors']:
                for n_components in PARAM_GRID['n_components']:

                    params = {
                        'embedding_model': emb_name,
                        'min_cluster_size': min_cluster_size,
                        'n_neighbors': n_neighbors,
                        'n_components': n_components
                    }

                    try:
                        # Build model
                        umap_model = UMAP(
                            n_neighbors=n_neighbors,
                            n_components=n_components,
                            min_dist=0.0,
                            metric='cosine',
                            random_state=42
                        )

                        hdbscan_model = HDBSCAN(
                            min_cluster_size=min_cluster_size,
                            min_samples=10,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True
                        )

                        vectorizer_model = CountVectorizer(
                            ngram_range=(1, 2),
                            stop_words='english',
                            min_df=5,
                            max_df=0.95
                        )

                        topic_model = BERTopic(
                            umap_model=umap_model,
                            hdbscan_model=hdbscan_model,
                            vectorizer_model=vectorizer_model,
                            top_n_words=10,
                            calculate_probabilities=False,
                            verbose=False
                        )

                        # Fit model
                        topics, _ = topic_model.fit_transform(documents, embeddings=embeddings)

                        # Compute metrics
                        metrics = compute_metrics(
                            topic_model, documents, embeddings, topics,
                            tokenized_docs, dictionary
                        )
                        metrics.update(params)
                        results.append(metrics)

                        # Check if best
                        if metrics['combined_score'] > best_score:
                            best_score = metrics['combined_score']
                            best_params = params.copy()
                            best_model = topic_model
                            best_topics = topics
                            best_embeddings = embeddings

                        print(f"✓ {emb_name}, mcs={min_cluster_size}, nn={n_neighbors}, nc={n_components} → "
                              f"Topics: {metrics['n_topics']}, Coh: {metrics['coherence']:.3f}, "
                              f"Div: {metrics['diversity']:.3f}, Out: {metrics['outlier_pct']:.1f}%")

                    except Exception as e:
                        print(f"✗ {emb_name}, mcs={min_cluster_size}, nn={n_neighbors}, nc={n_components} → Error: {str(e)[:50]}")

    print(f"\n{'='*60}")
    print(f"GRID SEARCH COMPLETE!")
    print(f"{'='*60}")

Starting hyperparameter search...

✓ mpnet, mcs=10, nn=10, nc=5 → Topics: 325, Coh: -0.064, Div: 0.729, Out: 33.1%
✓ mpnet, mcs=10, nn=10, nc=10 → Topics: 307, Coh: -0.046, Div: 0.723, Out: 30.9%
✓ mpnet, mcs=10, nn=15, nc=5 → Topics: 263, Coh: -0.035, Div: 0.730, Out: 32.4%
✓ mpnet, mcs=10, nn=15, nc=10 → Topics: 261, Coh: -0.043, Div: 0.746, Out: 31.1%
✓ mpnet, mcs=10, nn=25, nc=5 → Topics: 235, Coh: -0.029, Div: 0.734, Out: 32.9%
✓ mpnet, mcs=10, nn=25, nc=10 → Topics: 227, Coh: -0.029, Div: 0.752, Out: 33.1%
✓ mpnet, mcs=15, nn=10, nc=5 → Topics: 239, Coh: -0.023, Div: 0.734, Out: 29.3%
✓ mpnet, mcs=15, nn=10, nc=10 → Topics: 236, Coh: -0.015, Div: 0.732, Out: 28.4%
✓ mpnet, mcs=15, nn=15, nc=5 → Topics: 212, Coh: -0.006, Div: 0.740, Out: 29.2%
✓ mpnet, mcs=15, nn=15, nc=10 → Topics: 205, Coh: -0.012, Div: 0.752, Out: 28.9%
✓ mpnet, mcs=15, nn=25, nc=5 → Topics: 192, Coh: -0.000, Div: 0.753, Out: 32.9%
✓ mpnet, mcs=15, nn=25, nc=10 → Topics: 182, Coh: 0.003, Div: 0.768, Out: 31.2%


In [10]:
# ============================================================
# Load or create results DataFrame
# ============================================================
results_path = f"{PROJECT_PATH}/results/hyperparameter_search_results.csv"

if MODE == 'infer':
    print("INFER mode: Loading existing hyperparameter search results...")
    results_df = pd.read_csv(results_path)
    print(f"Loaded {len(results_df)} configurations from previous search")
else:
    # TRAIN MODE - Create from grid search results
    results_df = pd.DataFrame(results)
    results_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")

results_df = results_df.sort_values('combined_score', ascending=False)

print("\nTop 10 Configurations:")
display_cols = ['embedding_model', 'min_cluster_size', 'n_neighbors', 'n_components',
                'n_topics', 'coherence', 'diversity', 'outlier_pct', 'combined_score']
print(results_df[display_cols].head(10).to_string(index=False))

Results saved to /content/drive/MyDrive/BERTopic-arXiv-Analysis/results/hyperparameter_search_results.csv

Top 10 Configurations:
embedding_model  min_cluster_size  n_neighbors  n_components  n_topics  coherence  diversity  outlier_pct  combined_score
          mpnet                50           25            10        69   0.036245   0.865217    23.732373        0.430223
          mpnet                50           15            10        79   0.046542   0.832911    22.532253        0.428080
          mpnet                50           10             5        89   0.048002   0.822472    22.752275        0.425238
          mpnet                50           25             5        80   0.050174   0.850000    27.442744        0.425201
          mpnet                50           10            10        87   0.039587   0.829885    22.492249        0.423775
          mpnet                50           15             5        89   0.052408   0.822472    25.017502        0.422911
         minilm 

In [11]:
# Visualize hyperparameter effects
fig = make_subplots(rows=2, cols=2, subplot_titles=[
    'Coherence by min_cluster_size',
    'Coherence by Embedding Model',
    'Topics vs Coherence',
    'Outlier % vs Coherence'
])

# 1. min_cluster_size effect
for emb in results_df['embedding_model'].unique():
    subset = results_df[results_df['embedding_model'] == emb]
    grouped = subset.groupby('min_cluster_size')['coherence'].mean()
    fig.add_trace(
        go.Scatter(x=grouped.index, y=grouped.values, name=emb, mode='lines+markers'),
        row=1, col=1
    )

# 2. Embedding model comparison
emb_comparison = results_df.groupby('embedding_model')['coherence'].agg(['mean', 'std'])
fig.add_trace(
    go.Bar(x=emb_comparison.index, y=emb_comparison['mean'],
           error_y=dict(type='data', array=emb_comparison['std']),
           name='Coherence', showlegend=False),
    row=1, col=2
)

# 3. Topics vs Coherence
fig.add_trace(
    go.Scatter(x=results_df['n_topics'], y=results_df['coherence'],
               mode='markers', name='Configs', showlegend=False,
               marker=dict(color=results_df['combined_score'], colorscale='Viridis')),
    row=2, col=1
)

# 4. Outlier % vs Coherence
fig.add_trace(
    go.Scatter(x=results_df['outlier_pct'], y=results_df['coherence'],
               mode='markers', name='Configs', showlegend=False,
               marker=dict(color=results_df['combined_score'], colorscale='Viridis')),
    row=2, col=2
)

fig.update_layout(height=700, title_text="Hyperparameter Analysis")

# Save and display
fig.write_html(f"{PROJECT_PATH}/results/hyperparameter_analysis.html")
print(f"Saved: {PROJECT_PATH}/results/hyperparameter_analysis.html")

try:
    fig.show()
except Exception as e:
    print(f"Interactive display unavailable. Open the HTML file to view.")

Saved: /content/drive/MyDrive/BERTopic-arXiv-Analysis/results/hyperparameter_analysis.html


## 5. Best Model Analysis

In [12]:
# ============================================================
# Best Model Configuration
# ============================================================
print("="*60)
print("BEST MODEL CONFIGURATION")
print("="*60)

if MODE == 'infer':
    # Load best config from saved file
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    best_params = best_config['params']
    best_score = best_config['metrics'].get('coherence_npmi', 0) * 0.5 + \
                 best_config['metrics'].get('diversity', 0) * 0.3 + \
                 (1 - best_config['metrics'].get('outlier_pct', 0)/100) * 0.2

print(f"\nEmbedding Model: {best_params['embedding_model']}")
print(f"min_cluster_size: {best_params['min_cluster_size']}")
print(f"n_neighbors: {best_params['n_neighbors']}")
print(f"n_components: {best_params['n_components']}")

# Get best model metrics from results_df
best_metrics = results_df.iloc[0]
print(f"\nMetrics:")
print(f"  Topics: {int(best_metrics['n_topics'])}")
print(f"  Coherence (NPMI): {best_metrics['coherence']:.4f}")
print(f"  Diversity: {best_metrics['diversity']:.4f}")
print(f"  Outlier %: {best_metrics['outlier_pct']:.1f}%")
print(f"  Silhouette: {best_metrics['silhouette']:.4f}")

BEST MODEL CONFIGURATION

Embedding Model: mpnet
min_cluster_size: 50
n_neighbors: 25
n_components: 10

Metrics:
  Topics: 69
  Coherence (NPMI): 0.0362
  Diversity: 0.8652
  Outlier %: 23.7%
  Silhouette: 0.0331


## 6. Outlier Reduction

BERTopic can reassign outliers to their nearest topics using various strategies.

In [13]:
# ============================================================
# Outlier Reduction (TRAIN mode) or load results (INFER mode)
# ============================================================
if MODE == 'infer':
    print("INFER mode: Loading outlier reduction results from saved config...")
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    n_outliers_before = best_config['outlier_reduction']['before']
    n_outliers_after = best_config['outlier_reduction']['after']
    print(f"Outliers before reduction: {n_outliers_before}")
    print(f"Outliers after reduction: {n_outliers_after}")
    print(f"Outliers reduced by: {n_outliers_before - n_outliers_after} documents")
else:
    # TRAIN MODE - Run outlier reduction
    n_outliers_before = sum(1 for t in best_topics if t == -1)
    print(f"Outliers before reduction: {n_outliers_before} ({100*n_outliers_before/len(best_topics):.1f}%)")

    # Reduce outliers using c-TF-IDF strategy
    print("\nReducing outliers using c-TF-IDF similarity...")
    new_topics = best_model.reduce_outliers(
        documents,
        best_topics,
        strategy="c-tf-idf",
        threshold=0.1
    )

    # Update model with new topics
    best_model.update_topics(documents, topics=new_topics)

    # Count outliers after
    n_outliers_after = sum(1 for t in new_topics if t == -1)
    print(f"Outliers after reduction: {n_outliers_after} ({100*n_outliers_after/len(new_topics):.1f}%)")
    print(f"Outliers reduced by: {n_outliers_before - n_outliers_after} documents")

Outliers before reduction: 4746 (23.7%)

Reducing outliers using c-TF-IDF similarity...




Outliers after reduction: 2694 (13.5%)
Outliers reduced by: 2052 documents


In [14]:
# ============================================================
# Final Metrics after Outlier Reduction
# ============================================================
if MODE == 'infer':
    print("INFER mode: Loading final metrics from saved config...")
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    final_metrics = best_config['metrics']
    print("\nMetrics After Outlier Reduction:")
    print(f"  Topics: {final_metrics['n_topics']}")
    print(f"  Coherence (NPMI): {final_metrics['coherence_npmi']:.4f}")
    print(f"  Diversity: {final_metrics['diversity']:.4f}")
    print(f"  Outlier %: {final_metrics['outlier_pct']:.1f}%")
    print(f"  Silhouette: {final_metrics['silhouette']:.4f}")
else:
    # TRAIN MODE - Recompute metrics
    final_metrics = compute_metrics(
        best_model, documents, best_embeddings, new_topics,
        tokenized_docs, dictionary
    )

    print("\nMetrics After Outlier Reduction:")
    print(f"  Topics: {final_metrics['n_topics']}")
    print(f"  Coherence (NPMI): {final_metrics['coherence']:.4f}")
    print(f"  Diversity: {final_metrics['diversity']:.4f}")
    print(f"  Outlier %: {final_metrics['outlier_pct']:.1f}%")
    print(f"  Silhouette: {final_metrics['silhouette']:.4f}")


Metrics After Outlier Reduction:
  Topics: 69
  Coherence (NPMI): 0.0475
  Diversity: 0.3565
  Outlier %: 13.5%
  Silhouette: 0.0149


## 7. Embedding Model Comparison Summary

In [15]:
# Compare embedding models
emb_summary = results_df.groupby('embedding_model').agg({
    'coherence': ['mean', 'std', 'max'],
    'diversity': ['mean', 'max'],
    'n_topics': ['mean', 'min', 'max'],
    'outlier_pct': ['mean', 'min']
}).round(4)

print("\n" + "="*60)
print("EMBEDDING MODEL COMPARISON")
print("="*60)
print(emb_summary.to_string())

# Winner determination
mpnet_best = results_df[results_df['embedding_model'] == 'mpnet']['coherence'].max()
minilm_best = results_df[results_df['embedding_model'] == 'minilm']['coherence'].max()

print(f"\nBest MPNet coherence: {mpnet_best:.4f}")
print(f"Best MiniLM coherence: {minilm_best:.4f}")
print(f"\n{'MPNet' if mpnet_best > minilm_best else 'MiniLM'} wins by {abs(mpnet_best - minilm_best):.4f}!")


EMBEDDING MODEL COMPARISON
                coherence                 diversity         n_topics          outlier_pct         
                     mean     std     max      mean     max     mean min  max        mean      min
embedding_model                                                                                   
minilm             0.0137  0.0284  0.0521    0.7839  0.8817    147.3  71  255     35.7249  25.8226
mpnet              0.0078  0.0322  0.0524    0.7755  0.8652    168.5  69  325     27.6988  22.3122

Best MPNet coherence: 0.0524
Best MiniLM coherence: 0.0521

MPNet wins by 0.0003!


In [16]:
# Visualization: MPNet vs MiniLM
fig = go.Figure()

for emb in ['mpnet', 'minilm']:
    subset = results_df[results_df['embedding_model'] == emb]
    fig.add_trace(go.Box(
        y=subset['coherence'],
        name=emb.upper(),
        boxpoints='all',
        jitter=0.3
    ))

fig.update_layout(
    title='Coherence Score Distribution: MPNet vs MiniLM',
    yaxis_title='Coherence (NPMI)',
    showlegend=False
)

fig.write_html(f"{PROJECT_PATH}/results/embedding_model_comparison.html")
print(f"Saved: {PROJECT_PATH}/results/embedding_model_comparison.html")

try:
    fig.show()
except Exception as e:
    print(f"Interactive display unavailable. Open the HTML file to view.")

Saved: /content/drive/MyDrive/BERTopic-arXiv-Analysis/results/embedding_model_comparison.html


## 8. Save Best Model

In [17]:
# Save best model
if MODE == "infer":
    best_model_path = f"{PROJECT_PATH}/models/bertopic_best_model"
    print(f"INFER MODE: Model already exists at {best_model_path}")
    print(f"Configuration already saved to best_config.json")
else:
    best_model_path = f"{PROJECT_PATH}/models/bertopic_best_model"
    os.makedirs(best_model_path, exist_ok=True)

    best_model.save(
        best_model_path,
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=False
    )

    print(f"Best model saved to {best_model_path}")

    # Save best configuration
    best_config = {
        'params': best_params,
        'metrics': {
            'n_topics': int(final_metrics['n_topics']),
            'coherence_npmi': float(final_metrics['coherence']),
            'diversity': float(final_metrics['diversity']),
            'outlier_pct': float(final_metrics['outlier_pct']),
            'silhouette': float(final_metrics['silhouette'])
        },
        'outlier_reduction': {
            'before': n_outliers_before,
            'after': n_outliers_after,
            'strategy': 'c-tf-idf'
        }
    }

    with open(f"{PROJECT_PATH}/results/best_config.json", 'w') as f:
        json.dump(best_config, f, indent=2)

    print("Configuration saved to best_config.json")



Best model saved to /content/drive/MyDrive/BERTopic-arXiv-Analysis/models/bertopic_best_model
Configuration saved to best_config.json


In [18]:
# Save topic assignments with best model
if MODE == "infer":
    print("INFER MODE: Topic assignments already saved.")
    assignments_path = f"{PROJECT_PATH}/results/topic_assignments_best.csv"
    if os.path.exists(assignments_path):
        results_df_final = pd.read_csv(assignments_path)
        print(f"Loaded existing assignments: {len(results_df_final)} documents")
else:
    results_df_final = df.copy()
    results_df_final['topic'] = new_topics

    # Add topic labels
    topic_info = best_model.get_topic_info()
    topic_labels = {row['Topic']: row['Name'] for _, row in topic_info.iterrows()}
    results_df_final['topic_name'] = results_df_final['topic'].map(topic_labels)

    results_df_final.to_csv(f"{PROJECT_PATH}/results/topic_assignments_best.csv", index=False)
    topic_info.to_csv(f"{PROJECT_PATH}/results/topic_info_best.csv", index=False)

    print("Topic assignments saved!")

Topic assignments saved!


In [19]:
# Compute and save 2D embeddings for best model
if MODE == "infer":
    embeddings_2d_path = f"{PROJECT_PATH}/data/embeddings/embeddings_2d_best.npy"
    print("INFER MODE: 2D embeddings already computed.")
    if os.path.exists(embeddings_2d_path):
        embeddings_2d = np.load(embeddings_2d_path)
        print(f"Loaded 2D embeddings: shape {embeddings_2d.shape}")
else:
    print("Computing 2D UMAP projection...")
    umap_2d = UMAP(n_components=2, min_dist=0.1, metric='cosine', random_state=42)
    embeddings_2d = umap_2d.fit_transform(best_embeddings)

    np.save(f"{PROJECT_PATH}/data/embeddings/embeddings_2d_best.npy", embeddings_2d)
    print("2D embeddings saved!")

Computing 2D UMAP projection...
2D embeddings saved!


## 9. Summary Report

In [20]:
# Generate comprehensive report
# In INFER mode, load from config if needed
if MODE == "infer":
    config_path = f"{PROJECT_PATH}/results/best_config.json"
    if os.path.exists(config_path):
        with open(config_path, 'r') as f:
            best_config = json.load(f)
        best_params = best_config.get('params', {})
        final_metrics = best_config.get('metrics', {})
        outlier_info = best_config.get('outlier_reduction', {})
        n_outliers_before = outlier_info.get('before', 0)
        n_outliers_after = outlier_info.get('after', 0)

report = f"""
{'='*70}
HYPERPARAMETER TUNING & MODEL COMPARISON REPORT
Project: Topic Modeling arXiv cs.AI with BERTopic
Team: Pavan Chauhan, Vedanta Nayak
{'='*70}

HYPERPARAMETER SEARCH
{'-'*50}
Total configurations tested: {len(results_df)}
Embedding models: MPNet, MiniLM
min_cluster_size: [10, 15, 20, 30, 50]
n_neighbors: [10, 15, 25]
n_components: [5, 10]

EMBEDDING MODEL COMPARISON
{'-'*50}
MPNet (all-mpnet-base-v2):
  - Best coherence: {results_df[results_df['embedding_model']=='mpnet']['coherence'].max():.4f}
  - Mean coherence: {results_df[results_df['embedding_model']=='mpnet']['coherence'].mean():.4f}
  - Parameters: 110M

MiniLM (all-MiniLM-L6-v2):
  - Best coherence: {results_df[results_df['embedding_model']=='minilm']['coherence'].max():.4f}
  - Mean coherence: {results_df[results_df['embedding_model']=='minilm']['coherence'].mean():.4f}
  - Parameters: 22M (5x smaller)

BEST MODEL
{'-'*50}
Embedding: {best_params.get('embedding_model', 'mpnet').upper()}
min_cluster_size: {best_params.get('min_cluster_size', 'N/A')}
n_neighbors: {best_params.get('n_neighbors', 'N/A')}
n_components: {best_params.get('n_components', 'N/A')}

OUTLIER REDUCTION
{'-'*50}
Strategy: c-TF-IDF similarity
Before: {n_outliers_before} outliers ({100*n_outliers_before/len(documents):.1f}%)
After: {n_outliers_after} outliers ({100*n_outliers_after/len(documents):.1f}%)
Improvement: {n_outliers_before - n_outliers_after} documents reassigned

FINAL METRICS
{'-'*50}
Topics: {final_metrics.get('n_topics', 'N/A')}
Coherence (NPMI): {final_metrics.get('coherence_npmi', final_metrics.get('coherence', 0)):.4f}
Topic Diversity: {final_metrics.get('diversity', 0):.4f} ({final_metrics.get('diversity', 0)*100:.1f}%)
Silhouette Score: {final_metrics.get('silhouette', 0):.4f}
Outlier %: {final_metrics.get('outlier_pct', 0):.1f}%

{'='*70}
"""

print(report)

with open(f"{PROJECT_PATH}/results/hyperparameter_tuning_report.txt", 'w') as f:
    f.write(report)

print(f"Report saved!")


HYPERPARAMETER TUNING & MODEL COMPARISON REPORT
Project: Topic Modeling arXiv cs.AI with BERTopic
Team: Pavan Chauhan, Vedanta Nayak

HYPERPARAMETER SEARCH
--------------------------------------------------
Total configurations tested: 60
Embedding models: MPNet, MiniLM
min_cluster_size: [10, 15, 20, 30, 50]
n_neighbors: [10, 15, 25]
n_components: [5, 10]

EMBEDDING MODEL COMPARISON
--------------------------------------------------
MPNet (all-mpnet-base-v2):
  - Best coherence: 0.0524
  - Mean coherence: 0.0078
  - Parameters: 110M

MiniLM (all-MiniLM-L6-v2):
  - Best coherence: 0.0521
  - Mean coherence: 0.0137
  - Parameters: 22M (5x smaller)

BEST MODEL
--------------------------------------------------
Embedding: MPNET
min_cluster_size: 50
n_neighbors: 25
n_components: 10

OUTLIER REDUCTION
--------------------------------------------------
Strategy: c-TF-IDF similarity
Before: 4746 outliers (23.7%)
After: 2694 outliers (13.5%)
Improvement: 2052 documents reassigned

FINAL METRIC