In [1]:
# Install required packages
# NOTE(review): no versions are pinned here, so a fresh install may resolve to
# different releases than the ones that produced the saved results - consider pinning.
%pip install bertopic sentence-transformers umap-learn hdbscan gensim plotly scikit-learn pyyaml gdown -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
# ============================================================
# PROJECT SETUP - Config-based with Train/Infer Modes
# ============================================================
import os
import yaml
from pathlib import Path

# Clone repo if running fresh on Colab
# Colab is detected by looking for 'google.colab' in the string form of the
# IPython shell object; the clone is skipped when the target directory exists.
if 'google.colab' in str(get_ipython()) and not os.path.exists('/content/BERTopic-arXiv-Analysis'):
    !git clone https://github.com/pavannn16/BERTopic-arXiv-Analysis.git /content/BERTopic-arXiv-Analysis

# Load configuration
def load_config():
    """Find and parse the project's YAML configuration file.

    Candidate locations are probed in order (current directory, parent
    directory, then the Colab clone location); the first one that exists wins.

    Returns:
        (config, path): the parsed YAML content and the path it was read from,
        or (None, None) when none of the candidate paths exists.
    """
    candidates = (
        'config.yaml',
        '../config.yaml',
        '/content/BERTopic-arXiv-Analysis/config.yaml',
    )
    found = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
    if found is None:
        return None, None

    with open(found, 'r') as handle:
        return yaml.safe_load(handle), found

config, config_path = load_config()
if config:
    print(f"‚úÖ Loaded config from {config_path}")
else:
    # No config file was found (or it parsed to something falsy):
    # fall back to the read-only INFER mode.
    config = {'mode': 'infer'}

# Normalise the mode so that values such as "Train" or " INFER " behave like
# their lowercase forms; every later cell compares against 'infer' / 'train'.
MODE = str(config.get('mode', 'infer')).strip().lower()
if MODE not in ('train', 'infer'):
    raise ValueError(f"Unsupported mode {MODE!r} in config - expected 'train' or 'infer'")
print(f"üîß Mode: {MODE.upper()}")

# Note: Hyperparameter tuning is a TRAIN-only operation
# In INFER mode, we load and display existing results
if MODE == 'infer':
    print("‚ÑπÔ∏è INFER mode: Will load existing hyperparameter tuning results")

# Setup paths
if 'google.colab' in str(get_ipython()):
    if MODE == 'train':
        # TRAIN on Colab writes to the user's Google Drive.
        from google.colab import drive
        drive.mount('/content/drive')
        PROJECT_PATH = '/content/drive/MyDrive/BERTopic-arXiv-Analysis'
        print("‚úÖ TRAIN mode: Personal Drive mounted")
    else:
        PROJECT_PATH = '/content/BERTopic-arXiv-Analysis'
        print("‚úÖ INFER mode: Using data from cloned repo")
else:
    # Only step up one level when the working directory itself is named
    # 'notebooks'. A substring test on the whole path would also match
    # unrelated ancestors (e.g. ~/notebooks-archive/project) and select the
    # wrong project root.
    working_dir = Path(os.getcwd())
    PROJECT_PATH = str(working_dir.parent) if working_dir.name == 'notebooks' else str(working_dir)
    print("‚úÖ Running locally")

# Make sure every output location used by later cells exists.
for folder in ['data/raw', 'data/processed', 'data/embeddings', 'models', 'results']:
    os.makedirs(f'{PROJECT_PATH}/{folder}', exist_ok=True)

print(f"üìÅ Project path: {PROJECT_PATH}")

‚úÖ Loaded config from ../config.yaml
üîß Mode: INFER
‚ÑπÔ∏è INFER mode: Will load existing hyperparameter tuning results
‚úÖ Running locally
üìÅ Project path: /Users/pavan/Downloads/CSULA SEM1/AI/Code Assignments/BERTopic-arXiv-Analysis


In [3]:
# Import libraries
import pandas as pd
import numpy as np
import json
import time
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from collections import Counter

# BERTopic components
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Evaluation
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("Libraries imported!")

Libraries imported!


In [4]:
# Check GPU
import torch

# Probe accelerators in priority order: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device, device_message = 'cuda', f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}"
elif torch.backends.mps.is_available():
    device, device_message = 'mps', "‚úÖ Apple Silicon GPU available"
else:
    device, device_message = 'cpu', "‚ö†Ô∏è No GPU - using CPU (will be slower)"
print(device_message)

‚úÖ Apple Silicon GPU available


## 1. Load Processed Data

In [5]:
# Load processed data
processed_csv_path = f"{PROJECT_PATH}/data/processed/arxiv_cs_ai_processed.csv"
df = pd.read_csv(processed_csv_path)
documents = df['text'].tolist()

print(f"Loaded {len(documents)} documents")
first_date, last_date = df['date'].min(), df['date'].max()
print(f"Date range: {first_date} to {last_date}")

# Prepare tokenized docs for coherence computation:
# lowercase + whitespace split, then prune very rare / near-universal tokens.
tokenized_docs = [text.lower().split() for text in documents]
dictionary = Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.95)
print(f"Dictionary size: {len(dictionary)}")

Loaded 19998 documents
Date range: 2025-07-03 to 2025-12-02
Dictionary size: 24014


## 2. Embedding Model Comparison

We compare two popular Sentence-BERT models:

| Model | Parameters | Speed | Quality |
|-------|------------|-------|--------|
| **all-mpnet-base-v2** | 110M | Slower | Higher |
| **all-MiniLM-L6-v2** | 22M | 5x faster | Good |

In [6]:
# Define embedding models to compare
EMBEDDING_MODELS = {
    'mpnet': 'all-mpnet-base-v2',
    'minilm': 'all-MiniLM-L6-v2'
}

# Maps the short model key to its document-embedding matrix (one row per document).
embeddings_dict = {}

for name, model_name in EMBEDDING_MODELS.items():
    embeddings_path = f"{PROJECT_PATH}/data/embeddings/embeddings_{name}.npy"

    # Try the on-disk cache first, but only trust it when it has exactly one
    # row per document. A cache left over from a different dataset would
    # otherwise be paired silently with the wrong documents.
    cached_embeddings = None
    if os.path.exists(embeddings_path):
        print(f"Loading pre-computed {name} embeddings...")
        cached_embeddings = np.load(embeddings_path)
        if cached_embeddings.shape[0] != len(documents):
            print(f"  WARNING: cached {name} embeddings have {cached_embeddings.shape[0]} rows "
                  f"but there are {len(documents)} documents - recomputing")
            cached_embeddings = None

    if cached_embeddings is not None:
        embeddings_dict[name] = cached_embeddings
        continue

    print(f"\nComputing embeddings with {model_name}...")
    start_time = time.time()

    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        documents,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=device
    )

    elapsed = time.time() - start_time
    print(f"  Time: {elapsed:.1f}s ({len(documents)/elapsed:.1f} docs/sec)")

    # Save embeddings so later runs can skip the encoding step
    np.save(embeddings_path, embeddings)
    embeddings_dict[name] = embeddings
    print(f"  Saved to {embeddings_path}")

print(f"\n‚úÖ Embeddings ready:")
for name, emb in embeddings_dict.items():
    print(f"  {name}: {emb.shape}")

Loading pre-computed mpnet embeddings...
Loading pre-computed minilm embeddings...

‚úÖ Embeddings ready:
  mpnet: (19998, 768)
  minilm: (19998, 384)


## 3. Evaluation Functions

In [7]:
def compute_metrics(topic_model, documents, embeddings, topics, tokenized_docs, dictionary):
    """
    Compute all evaluation metrics for a BERTopic model.

    Args:
        topic_model: fitted model exposing ``get_topics()``, which returns a
            mapping of topic id -> list of (word, weight) pairs.
        documents: the raw documents (accepted for interface symmetry; not
            read by this function).
        embeddings: array of document embeddings, indexed in the same order
            as ``topics``; used for the silhouette score.
        topics: per-document topic ids, where -1 marks an outlier.
        tokenized_docs: tokenized documents passed to the coherence model.
        dictionary: gensim ``Dictionary`` passed to the coherence model.

    Returns:
        dict with n_topics, outlier_pct, coherence, diversity, silhouette and
        combined_score (0.5 * coherence + 0.3 * diversity + 0.2 * non-outlier
        fraction; silhouette is reported but not part of the combined score).
        Coherence and silhouette fall back to 0.0 when they cannot be computed.
    """
    # Build a filtered copy instead of deleting the -1 entry in place:
    # get_topics() may hand back the model's own mapping, and removing the
    # outlier topic from it would alter the fitted model for later calls.
    topics_dict = {
        topic_id: words
        for topic_id, words in topic_model.get_topics().items()
        if topic_id != -1
    }
    n_topics = len(topics_dict)

    # Outlier percentage (guarded so an empty assignment list does not divide by zero)
    n_docs = len(topics)
    n_outliers = sum(1 for t in topics if t == -1)
    outlier_pct = 100 * n_outliers / n_docs if n_docs else 0.0

    # Topic Diversity: fraction of unique words among the top-10 words of all topics
    all_words = []
    topic_words_list = []
    for topic_id in sorted(topics_dict.keys()):
        words = [word for word, _ in topics_dict[topic_id][:10]]
        topic_words_list.append(words)
        all_words.extend(words)

    diversity = len(set(all_words)) / len(all_words) if all_words else 0

    # Coherence (NPMI). Catch Exception rather than using a bare except so
    # that Ctrl-C during a long grid search still interrupts the run, and
    # report the reason instead of hiding it behind a silent 0.0.
    try:
        coherence_model = CoherenceModel(
            topics=topic_words_list,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence='c_npmi'
        )
        coherence = coherence_model.get_coherence()
    except Exception as exc:
        print(f"  [compute_metrics] coherence unavailable ({type(exc).__name__}: {exc}); using 0.0")
        coherence = 0.0

    # Silhouette Score over non-outlier documents only; requires more than
    # 100 such documents spread across at least two topics.
    try:
        topic_ids = np.array(topics)
        mask = topic_ids != -1
        if mask.sum() > 100 and len(set(topic_ids[mask])) > 1:
            silhouette = silhouette_score(embeddings[mask], topic_ids[mask])
        else:
            silhouette = 0.0
    except Exception as exc:
        print(f"  [compute_metrics] silhouette unavailable ({type(exc).__name__}: {exc}); using 0.0")
        silhouette = 0.0

    return {
        'n_topics': n_topics,
        'outlier_pct': outlier_pct,
        'coherence': coherence,
        'diversity': diversity,
        'silhouette': silhouette,
        'combined_score': coherence * 0.5 + diversity * 0.3 + (1 - outlier_pct/100) * 0.2
    }

print("Evaluation functions defined!")

Evaluation functions defined!


## 4. Hyperparameter Grid Search

We'll search over:
- **Embedding model**: MPNet vs MiniLM
- **min_cluster_size**: [10, 15, 20, 30, 50]
- **n_neighbors**: [10, 15, 25]
- **n_components**: [5, 10]

In [8]:
# Define hyperparameter grid
PARAM_GRID = dict(
    embedding_model=['mpnet', 'minilm'],
    min_cluster_size=[10, 15, 20, 30, 50],
    n_neighbors=[10, 15, 25],
    n_components=[5, 10],
)

# Calculate total combinations: product of the number of options per parameter
n_combinations = int(np.prod([len(options) for options in PARAM_GRID.values()]))

print(f"Total hyperparameter combinations: {n_combinations}")

if MODE != 'infer':
    # The estimate assumes half a minute per configuration.
    print(f"Estimated time: {n_combinations * 0.5:.0f} minutes")
else:
    print("‚è≠Ô∏è INFER mode: Will load existing search results instead of running grid search")

Total hyperparameter combinations: 60
‚è≠Ô∏è INFER mode: Will load existing search results instead of running grid search


In [9]:
# ============================================================
# Run grid search (TRAIN mode) or load existing results (INFER mode)
# ============================================================
if MODE == 'infer':
    print("‚è≠Ô∏è INFER mode: Skipping grid search (loading existing results)")
    print("   Grid search takes ~30 minutes - results are pre-computed")
else:
    # TRAIN MODE - Run full grid search
    results = []                   # one metrics+params dict per successful configuration
    best_score = -float('inf')
    best_params = None
    best_model = None
    # Initialise every "best_*" tracker up front so later cells never hit an
    # undefined name when no configuration succeeds.
    best_topics = None
    best_embeddings = None

    print("Starting hyperparameter search...\n")

    for emb_name in PARAM_GRID['embedding_model']:
        embeddings = embeddings_dict[emb_name]

        for min_cluster_size in PARAM_GRID['min_cluster_size']:
            for n_neighbors in PARAM_GRID['n_neighbors']:
                for n_components in PARAM_GRID['n_components']:

                    params = {
                        'embedding_model': emb_name,
                        'min_cluster_size': min_cluster_size,
                        'n_neighbors': n_neighbors,
                        'n_components': n_components
                    }

                    try:
                        # Build model. A fresh UMAP / HDBSCAN / vectorizer is
                        # created for every configuration so no fitted state
                        # is shared between runs.
                        umap_model = UMAP(
                            n_neighbors=n_neighbors,
                            n_components=n_components,
                            min_dist=0.0,
                            metric='cosine',
                            random_state=42
                        )

                        hdbscan_model = HDBSCAN(
                            min_cluster_size=min_cluster_size,
                            min_samples=10,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True
                        )

                        vectorizer_model = CountVectorizer(
                            ngram_range=(1, 2),
                            stop_words='english',
                            min_df=5,
                            max_df=0.95
                        )

                        topic_model = BERTopic(
                            umap_model=umap_model,
                            hdbscan_model=hdbscan_model,
                            vectorizer_model=vectorizer_model,
                            top_n_words=10,
                            calculate_probabilities=False,
                            verbose=False
                        )

                        # Fit model on the pre-computed embeddings
                        topics, _ = topic_model.fit_transform(documents, embeddings=embeddings)

                        # Compute metrics and record them together with the parameters
                        metrics = compute_metrics(
                            topic_model, documents, embeddings, topics,
                            tokenized_docs, dictionary
                        )
                        metrics.update(params)
                        results.append(metrics)

                        # Check if best (strictly greater, so the first of any tie is kept)
                        if metrics['combined_score'] > best_score:
                            best_score = metrics['combined_score']
                            best_params = params.copy()
                            best_model = topic_model
                            best_topics = topics
                            best_embeddings = embeddings

                        print(f"‚úì {emb_name}, mcs={min_cluster_size}, nn={n_neighbors}, nc={n_components} ‚Üí "
                              f"Topics: {metrics['n_topics']}, Coh: {metrics['coherence']:.3f}, "
                              f"Div: {metrics['diversity']:.3f}, Out: {metrics['outlier_pct']:.1f}%")

                    except Exception as e:
                        # Best effort: one failing configuration must not abort the search.
                        print(f"‚úó {emb_name}, mcs={min_cluster_size}, nn={n_neighbors}, nc={n_components} ‚Üí Error: {str(e)[:50]}")

    # If nothing succeeded there is no best model; fail here with a clear
    # message rather than announcing completion and breaking in a later cell.
    if not results:
        raise RuntimeError("Grid search produced no results: every configuration failed (see errors above)")

    print(f"\n{'='*60}")
    print(f"GRID SEARCH COMPLETE!")
    print(f"{'='*60}")

‚è≠Ô∏è INFER mode: Skipping grid search (loading existing results)
   Grid search takes ~30 minutes - results are pre-computed


In [10]:
# ============================================================
# Load or create results DataFrame
# ============================================================
results_path = f"{PROJECT_PATH}/results/hyperparameter_search_results.csv"

if MODE != 'infer':
    # TRAIN MODE - persist the freshly computed grid search results
    results_df = pd.DataFrame(results)
    results_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")
else:
    print("‚è≠Ô∏è INFER mode: Loading existing hyperparameter search results...")
    results_df = pd.read_csv(results_path)
    print(f"‚úÖ Loaded {len(results_df)} configurations from previous search")

# Rank configurations from best to worst combined score.
results_df = results_df.sort_values('combined_score', ascending=False)

display_cols = [
    'embedding_model', 'min_cluster_size', 'n_neighbors', 'n_components',
    'n_topics', 'coherence', 'diversity', 'outlier_pct', 'combined_score',
]
top_configurations = results_df[display_cols].head(10)

print("\nTop 10 Configurations:")
print(top_configurations.to_string(index=False))

‚è≠Ô∏è INFER mode: Loading existing hyperparameter search results...
‚úÖ Loaded 60 configurations from previous search

Top 10 Configurations:
embedding_model  min_cluster_size  n_neighbors  n_components  n_topics  coherence  diversity  outlier_pct  combined_score
          mpnet                50           10            10        91   0.099916   0.759341    20.937094        0.435886
          mpnet                50           15             5        83   0.095807   0.785542    23.987399        0.435591
          mpnet                50           15            10        87   0.098014   0.768966    23.612361        0.432472
          mpnet                50           10             5        85   0.085545   0.765882    21.102110        0.430333
         minilm                50           10             5        83   0.089585   0.790361    28.442844        0.425015
          mpnet                50           25            10        82   0.098385   0.774390    28.257826        0.424994
   

In [11]:
# Visualize hyperparameter effects
panel_titles = [
    'Coherence by min_cluster_size',
    'Coherence by Embedding Model',
    'Topics vs Coherence',
    'Outlier % vs Coherence',
]
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

# 1. min_cluster_size effect: one line per embedding model (mean coherence per size)
for emb_name in results_df['embedding_model'].unique():
    coherence_by_size = (
        results_df[results_df['embedding_model'] == emb_name]
        .groupby('min_cluster_size')['coherence']
        .mean()
    )
    size_trace = go.Scatter(
        x=coherence_by_size.index,
        y=coherence_by_size.values,
        name=emb_name,
        mode='lines+markers',
    )
    fig.add_trace(size_trace, row=1, col=1)

# 2. Embedding model comparison: mean coherence with std-dev error bars
emb_comparison = results_df.groupby('embedding_model')['coherence'].agg(['mean', 'std'])
model_bar = go.Bar(
    x=emb_comparison.index,
    y=emb_comparison['mean'],
    error_y=dict(type='data', array=emb_comparison['std']),
    name='Coherence',
    showlegend=False,
)
fig.add_trace(model_bar, row=1, col=2)

# 3. Topics vs Coherence and 4. Outlier % vs Coherence:
# identical scatter styling, only the x column and the subplot column differ.
for panel_col, x_column in enumerate(['n_topics', 'outlier_pct'], start=1):
    config_scatter = go.Scatter(
        x=results_df[x_column],
        y=results_df['coherence'],
        mode='markers',
        name='Configs',
        showlegend=False,
        marker=dict(color=results_df['combined_score'], colorscale='Viridis'),
    )
    fig.add_trace(config_scatter, row=2, col=panel_col)

fig.update_layout(height=700, title_text="Hyperparameter Analysis")

# Save and display
analysis_html_path = f"{PROJECT_PATH}/results/hyperparameter_analysis.html"
fig.write_html(analysis_html_path)
print(f"‚úÖ Saved: {analysis_html_path}")

try:
    fig.show()
except Exception:
    # Rendering can fail outside an interactive front end; the HTML file is still written.
    print(f"‚ÑπÔ∏è Interactive display unavailable. Open the HTML file to view.")

‚úÖ Saved: /Users/pavan/Downloads/CSULA SEM1/AI/Code Assignments/BERTopic-arXiv-Analysis/results/hyperparameter_analysis.html


## 5. Best Model Analysis

In [12]:
# ============================================================
# Best Model Configuration
# ============================================================
print("="*60)
print("BEST MODEL CONFIGURATION")
print("="*60)

if MODE == 'infer':
    # Load best config from saved file (in TRAIN mode best_params / best_score
    # were already set by the grid search cell).
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    best_params = best_config['params']
    saved_metrics = best_config['metrics']
    # Same weighting as compute_metrics' combined_score, applied to the saved metrics.
    best_score = saved_metrics.get('coherence_npmi', 0) * 0.5 + \
                 saved_metrics.get('diversity', 0) * 0.3 + \
                 (1 - saved_metrics.get('outlier_pct', 0)/100) * 0.2

print(f"\nEmbedding Model: {best_params['embedding_model']}")
print(f"min_cluster_size: {best_params['min_cluster_size']}")
print(f"n_neighbors: {best_params['n_neighbors']}")
print(f"n_components: {best_params['n_components']}")

# Get best model metrics from results_df. Select the row whose parameters match
# best_params rather than assuming the top-ranked row: in INFER mode the
# parameters come from best_config.json, which need not agree with row 0 of the CSV.
param_keys = ['embedding_model', 'min_cluster_size', 'n_neighbors', 'n_components']
row_matches = pd.Series(True, index=results_df.index)
for key in param_keys:
    row_matches &= results_df[key] == best_params[key]

if row_matches.any():
    best_metrics = results_df[row_matches].iloc[0]
else:
    print("\nWARNING: best_params not found in search results; showing the top-ranked row instead")
    best_metrics = results_df.iloc[0]

print(f"\nMetrics:")
print(f"  Topics: {int(best_metrics['n_topics'])}")
print(f"  Coherence (NPMI): {best_metrics['coherence']:.4f}")
print(f"  Diversity: {best_metrics['diversity']:.4f}")
print(f"  Outlier %: {best_metrics['outlier_pct']:.1f}%")
print(f"  Silhouette: {best_metrics['silhouette']:.4f}")

BEST MODEL CONFIGURATION

Embedding Model: mpnet
min_cluster_size: 50
n_neighbors: 10
n_components: 10

Metrics:
  Topics: 91
  Coherence (NPMI): 0.0999
  Diversity: 0.7593
  Outlier %: 20.9%
  Silhouette: 0.0339


## 6. Outlier Reduction

BERTopic can reassign outliers to their nearest topics using various strategies.

In [13]:
# ============================================================
# Outlier Reduction (TRAIN mode) or load results (INFER mode)
# ============================================================
if MODE != 'infer':
    # TRAIN MODE - Run outlier reduction on the best model from the grid search
    n_outliers_before = list(best_topics).count(-1)
    pct_before = 100*n_outliers_before/len(best_topics)
    print(f"Outliers before reduction: {n_outliers_before} ({pct_before:.1f}%)")

    # Reassign outlier documents using the c-TF-IDF strategy
    print("\nReducing outliers using c-TF-IDF similarity...")
    new_topics = best_model.reduce_outliers(
        documents,
        best_topics,
        strategy="c-tf-idf",
        threshold=0.1
    )

    # Update the model so its topics reflect the new assignments
    best_model.update_topics(documents, topics=new_topics)

    # Count the documents still labelled -1
    n_outliers_after = list(new_topics).count(-1)
    pct_after = 100*n_outliers_after/len(new_topics)
    print(f"Outliers after reduction: {n_outliers_after} ({pct_after:.1f}%)")
else:
    print("‚è≠Ô∏è INFER mode: Loading outlier reduction results from saved config...")
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    reduction_summary = best_config['outlier_reduction']
    n_outliers_before = reduction_summary['before']
    n_outliers_after = reduction_summary['after']
    print(f"Outliers before reduction: {n_outliers_before}")
    print(f"Outliers after reduction: {n_outliers_after}")

# Both modes finish by reporting the net change.
print(f"Outliers reduced by: {n_outliers_before - n_outliers_after} documents")

‚è≠Ô∏è INFER mode: Loading outlier reduction results from saved config...
Outliers before reduction: 4187
Outliers after reduction: 1304
Outliers reduced by: 2883 documents


In [14]:
# ============================================================
# Final Metrics after Outlier Reduction
# ============================================================
if MODE == 'infer':
    print("‚è≠Ô∏è INFER mode: Loading final metrics from saved config...")
    with open(f"{PROJECT_PATH}/results/best_config.json", 'r') as f:
        best_config = json.load(f)
    final_metrics = best_config['metrics']
    # The saved config stores coherence under 'coherence_npmi'.
    _coherence_key = 'coherence_npmi'
else:
    # TRAIN MODE - Recompute metrics with the post-reduction assignments
    final_metrics = compute_metrics(
        best_model, documents, best_embeddings, new_topics,
        tokenized_docs, dictionary
    )
    # compute_metrics reports coherence under 'coherence'.
    _coherence_key = 'coherence'

print("\nMetrics After Outlier Reduction:")
print(f"  Topics: {final_metrics['n_topics']}")
print(f"  Coherence (NPMI): {final_metrics[_coherence_key]:.4f}")
print(f"  Diversity: {final_metrics['diversity']:.4f}")
print(f"  Outlier %: {final_metrics['outlier_pct']:.1f}%")
print(f"  Silhouette: {final_metrics['silhouette']:.4f}")

‚è≠Ô∏è INFER mode: Loading final metrics from saved config...

Metrics After Outlier Reduction:
  Topics: 91
  Coherence (NPMI): 0.0949
  Diversity: 0.5813
  Outlier %: 6.5%
  Silhouette: 0.0259


## 7. Embedding Model Comparison Summary

In [15]:
# Compare embedding models
summary_spec = {
    'coherence': ['mean', 'std', 'max'],
    'diversity': ['mean', 'max'],
    'n_topics': ['mean', 'min', 'max'],
    'outlier_pct': ['mean', 'min'],
}
emb_summary = results_df.groupby('embedding_model').agg(summary_spec).round(4)

banner = "=" * 60
print("\n" + banner)
print("EMBEDDING MODEL COMPARISON")
print(banner)
print(emb_summary.to_string())

# Winner determination: compare the single best coherence each model reached
is_mpnet = results_df['embedding_model'] == 'mpnet'
is_minilm = results_df['embedding_model'] == 'minilm'
mpnet_best = results_df.loc[is_mpnet, 'coherence'].max()
minilm_best = results_df.loc[is_minilm, 'coherence'].max()

print(f"\nüèÜ Best MPNet coherence: {mpnet_best:.4f}")
print(f"üèÜ Best MiniLM coherence: {minilm_best:.4f}")

# MiniLM is named the winner unless MPNet is strictly better.
if mpnet_best > minilm_best:
    winner_label = 'MPNet'
else:
    winner_label = 'MiniLM'
coherence_margin = abs(mpnet_best - minilm_best)
print(f"\n{winner_label} wins by {coherence_margin:.4f}!")


EMBEDDING MODEL COMPARISON
                coherence                 diversity          n_topics          outlier_pct         
                     mean     std     max      mean     max      mean min  max        mean      min
embedding_model                                                                                    
minilm             0.0833  0.0147  0.1051    0.7197  0.8060  148.6000  67  261     35.6077  28.4428
mpnet              0.0756  0.0161  0.0999    0.7095  0.7855  170.8333  81  320     28.0420  20.9371

üèÜ Best MPNet coherence: 0.0999
üèÜ Best MiniLM coherence: 0.1051

MiniLM wins by 0.0052!


In [16]:
# Visualization: MPNet vs MiniLM
# One box per embedding model, showing every configuration's coherence as a point.
coherence_boxes = [
    go.Box(
        y=results_df[results_df['embedding_model'] == model_key]['coherence'],
        name=model_key.upper(),
        boxpoints='all',
        jitter=0.3,
    )
    for model_key in ('mpnet', 'minilm')
]
fig = go.Figure(data=coherence_boxes)

fig.update_layout(
    title='Coherence Score Distribution: MPNet vs MiniLM',
    yaxis_title='Coherence (NPMI)',
    showlegend=False,
)

comparison_html_path = f"{PROJECT_PATH}/results/embedding_model_comparison.html"
fig.write_html(comparison_html_path)
print(f"‚úÖ Saved: {comparison_html_path}")

try:
    fig.show()
except Exception:
    # Rendering can fail outside an interactive front end; the HTML file is still written.
    print(f"‚ÑπÔ∏è Interactive display unavailable. Open the HTML file to view.")

‚úÖ Saved: /Users/pavan/Downloads/CSULA SEM1/AI/Code Assignments/BERTopic-arXiv-Analysis/results/embedding_model_comparison.html


## 8. Save Best Model

In [21]:
# Save best model
best_model_path = f"{PROJECT_PATH}/models/bertopic_best_model"

if MODE != "infer":
    os.makedirs(best_model_path, exist_ok=True)

    # Persist topics and c-TF-IDF; the embedding model itself is not stored.
    best_model.save(
        best_model_path,
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=False
    )

    print(f"Best model saved to {best_model_path}")

    # Save best configuration. Metrics are cast to built-in int/float before
    # serialisation; coherence is stored under the key 'coherence_npmi'.
    serialised_metrics = {
        'n_topics': int(final_metrics['n_topics']),
        'coherence_npmi': float(final_metrics['coherence']),
    }
    for metric_name in ('diversity', 'outlier_pct', 'silhouette'):
        serialised_metrics[metric_name] = float(final_metrics[metric_name])

    best_config = {
        'params': best_params,
        'metrics': serialised_metrics,
        'outlier_reduction': {
            'before': n_outliers_before,
            'after': n_outliers_after,
            'strategy': 'c-tf-idf'
        }
    }

    with open(f"{PROJECT_PATH}/results/best_config.json", 'w') as f:
        json.dump(best_config, f, indent=2)

    print("Configuration saved to best_config.json")
else:
    print(f"‚ÑπÔ∏è INFER MODE: Model already exists at {best_model_path}")
    print(f"‚ÑπÔ∏è Configuration already saved to best_config.json")

‚ÑπÔ∏è INFER MODE: Model already exists at /Users/pavan/Downloads/CSULA SEM1/AI/Code Assignments/BERTopic-arXiv-Analysis/models/bertopic_best_model
‚ÑπÔ∏è Configuration already saved to best_config.json


In [22]:
# Save topic assignments with best model
assignments_path = f"{PROJECT_PATH}/results/topic_assignments_best.csv"

if MODE == "infer":
    # Only claim the assignments exist after checking, and say so explicitly
    # when they do not - otherwise results_df_final is silently left undefined.
    if os.path.exists(assignments_path):
        print("‚ÑπÔ∏è INFER MODE: Topic assignments already saved.")
        results_df_final = pd.read_csv(assignments_path)
        print(f"‚úÖ Loaded existing assignments: {len(results_df_final)} documents")
    else:
        print(f"WARNING: INFER MODE: no saved topic assignments at {assignments_path}; "
              f"run TRAIN mode to create them.")
else:
    # TRAIN MODE - attach the post-reduction topic id to every document
    results_df_final = df.copy()
    results_df_final['topic'] = new_topics

    # Add topic labels (topic id -> generated topic name)
    topic_info = best_model.get_topic_info()
    topic_labels = {row['Topic']: row['Name'] for _, row in topic_info.iterrows()}
    results_df_final['topic_name'] = results_df_final['topic'].map(topic_labels)

    results_df_final.to_csv(assignments_path, index=False)
    topic_info.to_csv(f"{PROJECT_PATH}/results/topic_info_best.csv", index=False)

    print("Topic assignments saved!")

‚ÑπÔ∏è INFER MODE: Topic assignments already saved.
‚úÖ Loaded existing assignments: 19998 documents


In [23]:
# Compute and save 2D embeddings for best model
embeddings_2d_path = f"{PROJECT_PATH}/data/embeddings/embeddings_2d_best.npy"

if MODE == "infer":
    # Only claim the projection exists after checking, and say so explicitly
    # when it does not - otherwise embeddings_2d is silently left undefined.
    if os.path.exists(embeddings_2d_path):
        print("‚ÑπÔ∏è INFER MODE: 2D embeddings already computed.")
        embeddings_2d = np.load(embeddings_2d_path)
        print(f"‚úÖ Loaded 2D embeddings: shape {embeddings_2d.shape}")
    else:
        print(f"WARNING: INFER MODE: no saved 2D embeddings at {embeddings_2d_path}; "
              f"run TRAIN mode to create them.")
else:
    # TRAIN MODE - project the best model's embeddings to 2D with a fixed seed
    print("Computing 2D UMAP projection...")
    umap_2d = UMAP(n_components=2, min_dist=0.1, metric='cosine', random_state=42)
    embeddings_2d = umap_2d.fit_transform(best_embeddings)

    np.save(embeddings_2d_path, embeddings_2d)
    print("2D embeddings saved!")

‚ÑπÔ∏è INFER MODE: 2D embeddings already computed.
‚úÖ Loaded 2D embeddings: shape (19998, 2)


## 9. Summary Report

In [24]:
# Generate comprehensive report
# In INFER mode, load from config if needed
if MODE == "infer":
    # Use a dedicated name here: reusing `config_path` would overwrite the
    # YAML config path recorded by the setup cell.
    best_config_path = f"{PROJECT_PATH}/results/best_config.json"
    if os.path.exists(best_config_path):
        with open(best_config_path, 'r') as f:
            best_config = json.load(f)
        best_params = best_config.get('params', {})
        final_metrics = best_config.get('metrics', {})
        outlier_info = best_config.get('outlier_reduction', {})
        n_outliers_before = outlier_info.get('before', 0)
        n_outliers_after = outlier_info.get('after', 0)

# The searched values are read from PARAM_GRID so the report always describes
# the grid that was actually defined. Coherence is looked up under
# 'coherence_npmi' (saved config) with a fallback to 'coherence' (TRAIN mode).
report = f"""
{'='*70}
HYPERPARAMETER TUNING & MODEL COMPARISON REPORT
Project: Topic Modeling arXiv cs.AI with BERTopic
Team: Pavan Chauhan, Vedanta Nayak
{'='*70}

HYPERPARAMETER SEARCH
{'-'*50}
Total configurations tested: {len(results_df)}
Embedding models: MPNet, MiniLM
min_cluster_size: {PARAM_GRID['min_cluster_size']}
n_neighbors: {PARAM_GRID['n_neighbors']}
n_components: {PARAM_GRID['n_components']}

EMBEDDING MODEL COMPARISON
{'-'*50}
MPNet (all-mpnet-base-v2):
  - Best coherence: {results_df[results_df['embedding_model']=='mpnet']['coherence'].max():.4f}
  - Mean coherence: {results_df[results_df['embedding_model']=='mpnet']['coherence'].mean():.4f}
  - Parameters: 110M

MiniLM (all-MiniLM-L6-v2):
  - Best coherence: {results_df[results_df['embedding_model']=='minilm']['coherence'].max():.4f}
  - Mean coherence: {results_df[results_df['embedding_model']=='minilm']['coherence'].mean():.4f}
  - Parameters: 22M (5x smaller)

BEST MODEL
{'-'*50}
Embedding: {best_params.get('embedding_model', 'mpnet').upper()}
min_cluster_size: {best_params.get('min_cluster_size', 'N/A')}
n_neighbors: {best_params.get('n_neighbors', 'N/A')}
n_components: {best_params.get('n_components', 'N/A')}

OUTLIER REDUCTION
{'-'*50}
Strategy: c-TF-IDF similarity
Before: {n_outliers_before} outliers ({100*n_outliers_before/len(documents):.1f}%)
After: {n_outliers_after} outliers ({100*n_outliers_after/len(documents):.1f}%)
Improvement: {n_outliers_before - n_outliers_after} documents reassigned

FINAL METRICS
{'-'*50}
Topics: {final_metrics.get('n_topics', 'N/A')}
Coherence (NPMI): {final_metrics.get('coherence_npmi', final_metrics.get('coherence', 0)):.4f}
Topic Diversity: {final_metrics.get('diversity', 0):.4f} ({final_metrics.get('diversity', 0)*100:.1f}%)
Silhouette Score: {final_metrics.get('silhouette', 0):.4f}
Outlier %: {final_metrics.get('outlier_pct', 0):.1f}%

{'='*70}
"""

print(report)

# Write with an explicit encoding so the file does not depend on the platform default.
with open(f"{PROJECT_PATH}/results/hyperparameter_tuning_report.txt", 'w', encoding='utf-8') as f:
    f.write(report)

print(f"Report saved!")


HYPERPARAMETER TUNING & MODEL COMPARISON REPORT
Project: Topic Modeling arXiv cs.AI with BERTopic
Team: Pavan Chauhan, Vedanta Nayak

HYPERPARAMETER SEARCH
--------------------------------------------------
Total configurations tested: 60
Embedding models: MPNet, MiniLM
min_cluster_size: [10, 15, 20, 30, 50]
n_neighbors: [10, 15, 25]
n_components: [5, 10]

EMBEDDING MODEL COMPARISON
--------------------------------------------------
MPNet (all-mpnet-base-v2):
  - Best coherence: 0.0999
  - Mean coherence: 0.0756
  - Parameters: 110M

MiniLM (all-MiniLM-L6-v2):
  - Best coherence: 0.1051
  - Mean coherence: 0.0833
  - Parameters: 22M (5x smaller)

BEST MODEL
--------------------------------------------------
Embedding: MPNET
min_cluster_size: 50
n_neighbors: 10
n_components: 10

OUTLIER REDUCTION
--------------------------------------------------
Strategy: c-TF-IDF similarity
Before: 4187 outliers (20.9%)
After: 1304 outliers (6.5%)
Improvement: 2883 documents reassigned

FINAL METRICS

## Summary

This notebook has:
1. ✅ Compared MPNet vs MiniLM embeddings
2. ✅ Performed grid search over 60 hyperparameter combinations
3. ✅ Identified optimal configuration
4. ✅ Applied outlier reduction
5. ✅ Saved best model and results

**Next step:** Run `04_evaluation.ipynb` for full evaluation with LDA baseline comparison.