# Imports

In [1]:
import pandas as pd
import numpy as np
import regex as re
import itertools
import os


from bs4 import BeautifulSoup
from markdown import markdown
from swifter import swifter
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from umap import UMAP

from modules.evaluate_bertopic import evaluate_topics
from modules.constants import *
from modules.utilities import read_aidev

seed = 42

# Load Data

In [2]:
# df = pd.read_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered.csv")

# df_pr_type = read_aidev(FileName.POP_PR_TASK_TYPE)
# df_pr_type = df_pr_type[df_pr_type["type"] == "perf"]

# df_pop_all = read_aidev(FileName.POP_PULL_REQUEST)

# pr_type_ids = df_pr_type["id"].tolist()
# perf_pr_ids = df["id"].tolist()

# cnt = 0
# ids = []
# for id in pr_type_ids:
#     if id not in perf_pr_ids:
#         cnt += 1
#         perf_pr_ids.append(id)

# print(f"{cnt} PRs not found in our list")

# df = df_pop_all[df_pop_all["id"].isin(perf_pr_ids)]

# df.to_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered_final.csv", index = False)

In [3]:
df = pd.read_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered_final.csv")

In [4]:
data_title = df["title"].fillna("")
data_body = df["body"].fillna("")

data_title = df["title"].tolist()
data_body = df["body"].tolist()

docs = [str(i) + "\n" + str(j) for i, j in zip(data_title, data_body)]

# Embedding Generation

In [None]:
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device="cuda:0")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
embeddings = model.encode(docs, batch_size = 4, show_progress_bar = True)

np.save("Qwen8PlainEmbeddings.npy",embeddings)

# UMAP

In [11]:
embeddings = np.load("./Outputs/Embeddings/Qwen8Embeddings.npy")

In [28]:
n_component = 20
n_neighbors = 3

umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_component, min_dist=0.1, metric='cosine', random_state=seed)
embeddings_reduced = umap_model.fit_transform(embeddings)


# BERTopic

In [29]:
os.makedirs("Outputs/BERTopic", exist_ok=True)

In [32]:
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=1, cluster_selection_epsilon=0.1, metric="euclidean", prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words= "english", ngram_range=(1, 2), min_df=1)
representation_model = [MaximalMarginalRelevance(diversity=0.3)]

In [33]:
topic_model = BERTopic(
    embedding_model=None,
    umap_model=BaseDimensionalityReduction(),
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    top_n_words=10,
    verbose=True,
)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings_reduced)
topic_info_df = topic_model.get_topic_info()
topic_info_df

2025-12-16 14:08:44,392 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-16 14:08:44,392 - BERTopic - Dimensionality - Completed ‚úì
2025-12-16 14:08:44,393 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-16 14:08:44,568 - BERTopic - Cluster - Completed ‚úì
2025-12-16 14:08:44,570 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-16 14:08:45,067 - BERTopic - Representation - Completed ‚úì


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,101,-1_uv_pip_calendar_unified,"[uv, pip, calendar, unified, booking, slot, im...",[feat: implement ISR for booking pages with Go...
1,0,96,0_workflow_jobs_job_ci,"[workflow, jobs, job, ci, cache, docker, turbo...",[ci: Add conditional testing for wef and webvi...
2,1,56,1_rosetta_run rosetta_transpiler_benchmark,"[rosetta, run rosetta, transpiler, benchmark, ...",[Update Dart transpiler benchmarking\n## Summa...
3,2,51,2_hydration_species_risedev_component,"[hydration, species, risedev, component, rised...",[Implement error node caching for improved Tre...
4,3,43,3_audio_hfmodel_tts_cpu,"[audio, hfmodel, tts, cpu, cpus, talis, cli, w...",[refactor(connector): split connector implemen...
5,4,40,4_moon_buffer_microsoftazurecosmostestsmicroso...,"[moon, buffer, microsoftazurecosmostestsmicros...",[[WIP] [tracking] Improve Sequence operator ha...
6,5,35,5_join_joins_compilerxgo_left,"[join, joins, compilerxgo, left, test compiler...",[Improve Clojure join compilation\n## Summary\...
7,6,32,6_psutil_e2e_asn_github,"[psutil, e2e, asn, github, validation, phy, te...",[stm32/eth: Improve Ethernet driver with link ...
8,7,32,7_ci dns_npm ci_command npm_npm,"[ci dns, npm ci, command npm, npm, ci, dns blo...",[Implement byPrototype filter with improved ty...
9,8,30,8_napi_azure_guidelines_benchmarks,"[napi, azure, guidelines, benchmarks, sampling...",[[gh-flow] Add minimal Azure provisioning for ...


In [34]:
topic_df = topic_model.get_document_info(docs)
df["Topic"] = topic_df["Topic"]
df["Probability"] = topic_df["Probability"]
df["Representative_document"] = topic_df["Representative_document"]
df

Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,merged_at,repo_id,repo_url,html_url,Topic,Probability,Representative_document
0,3164503419,40,Fix Claude animation flickering with vt10x-ins...,## üéØ Problem: Claude's Thinking Animation Caus...,Claude_Code,2891702,hjanuschka,closed,2025-06-20T22:47:18Z,2025-06-21T11:51:22Z,,1002552148,https://api.github.com/repos/amantus-ai/vibetu...,https://github.com/amantus-ai/vibetunnel/pull/40,41,1.000000,True
1,3273233066,1037,feat: implement comprehensive species tracking...,## Summary\nThis PR implements a comprehensive...,Claude_Code,7030001,tphakala,closed,2025-07-29T11:21:11Z,2025-07-29T13:49:45Z,2025-07-29T13:49:45Z,707764474,https://api.github.com/repos/tphakala/birdnet-go,https://github.com/tphakala/birdnet-go/pull/1037,2,1.000000,True
2,3219880512,10340,feat(backend): Integrate GCS file storage with...,## Summary\n\nThis PR introduces a complete cl...,Claude_Code,76959103,majdyz,closed,2025-07-10T15:52:56Z,2025-07-18T03:20:54Z,2025-07-18T03:20:54Z,614765452,https://api.github.com/repos/Significant-Gravi...,https://github.com/Significant-Gravitas/AutoGP...,16,0.085150,True
3,2876006908,3375,Improve list and collection materializers perf...,# Optimized Collection Materializers with Batc...,Claude_Code,3348134,strickvl,closed,2025-02-24T19:52:57Z,2025-04-20T19:47:42Z,,314197645,https://api.github.com/repos/zenml-io/zenml,https://github.com/zenml-io/zenml/pull/3375,-1,0.723261,False
4,3142181649,19,Replace CLI subprocess approach with Claude Co...,## Description\n\nReplace the current CLI subp...,Claude_Code,80381,sugyan,closed,2025-06-13T04:05:15Z,2025-06-13T14:14:33Z,2025-06-13T14:14:33Z,999285986,https://api.github.com/repos/sugyan/claude-cod...,https://github.com/sugyan/claude-code-webui/pu...,11,1.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,3152003781,2037,Optimize Chat API/Job schema transfer by remov...,# Optimize Chat API/Job schema transfer by rem...,Devin,158243242,devin-ai-integration[bot],closed,2025-06-17T04:17:12Z,2025-06-17T07:08:49Z,2025-06-17T07:08:49Z,839216423,https://api.github.com/repos/liam-hq/liam,https://github.com/liam-hq/liam/pull/2037,48,1.000000,True
1217,2920951577,1064,feat: improve search functionality with pagina...,Closes #1063\n\nThis PR improves the search fu...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:07:04Z,2025-03-15T05:36:51Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1064,28,0.100553,False
1218,2920955200,1065,feat(dashboard): add improved app filtering wi...,# Add search and filtering functionality to th...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:08:42Z,2025-03-15T05:37:21Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1065,28,0.085347,False
1219,2920983723,1066,perf: optimize MAU loading mechanism for bette...,Closes #1063\n\nThis PR optimizes the MAU load...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:19:38Z,2025-03-15T05:38:03Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1066,28,0.075680,False


In [35]:
topic_info_df.to_csv("./Outputs/BERTopic/Topic_Info.csv", index = False)
df.to_csv("./Outputs/BERTopic/All_PR_Topics.csv", index = False)

In [36]:
os.makedirs("./Outputs/BERTopic/Topics", exist_ok=True)

for topic in topic_info_df["Topic"].tolist():
    df_topic = df[df["Topic"] == topic]
    df_topic = df_topic.sort_values("Probability", ascending=False)
    df_topic.to_csv(f"./Outputs/BERTopic/Topics/topic_{topic}.csv", index = False)
    print(f"{topic} : {df_topic.shape[0]}")

-1 : 101
0 : 96
1 : 56
2 : 51
3 : 43
4 : 40
5 : 35
6 : 32
7 : 32
8 : 30
9 : 28
10 : 25
11 : 24
12 : 23
13 : 23
14 : 23
15 : 22
16 : 22
17 : 21
18 : 21
19 : 20
20 : 20
21 : 19
22 : 19
23 : 19
24 : 18
25 : 18
26 : 17
27 : 17
28 : 16
29 : 16
30 : 16
31 : 16
32 : 15
33 : 14
34 : 14
35 : 14
36 : 13
37 : 13
38 : 13
39 : 13
40 : 13
41 : 12
42 : 12
43 : 12
44 : 11
45 : 11
46 : 11
47 : 11
48 : 10
49 : 10
50 : 10
51 : 10


# Parameter Tuning

In [5]:
embeddings_combined = np.load("./Outputs/Embeddings/Qwen8Embeddings.npy")

In [7]:
def bertopic_grid_search(
    docs,
    embeddings,
    umap_params_grid,
    hdbscan_params_grid,
    vectorizer_params_grid=None
):
    """
    Perform grid search over UMAP, HDBSCAN, and vectorizer parameters
    using BERTopic + evaluation_metrics from evaluate_topics module.
    """

    all_results = []
    best_score = -999
    best_model = None
    best_config = None

    total_combinations = (
            len(umap_params_grid) *
            len(hdbscan_params_grid) *
            len(vectorizer_params_grid)
        )

    for umap_params, hdb_params, vect_params in tqdm(itertools.product(umap_params_grid, hdbscan_params_grid, vectorizer_params_grid),
        total=total_combinations,
        desc="Parameter search"
        ):

        # print("\n==============================")
        # print("Testing configuration:")
        # print("UMAP:", umap_params)
        # print("HDBSCAN:", hdb_params)
        # print("Vectorizer:", vect_params)
        # print("==============================")
        try:
        # 1. Build UMAP
            umap_model = UMAP(
                n_neighbors=umap_params.get("n_neighbors", 15),
                n_components=umap_params.get("n_components", 10),
                min_dist=umap_params.get("min_dist", 0.1),
                metric=umap_params.get("metric", "cosine"),
                random_state=seed
            )

            reduced_embeddings = umap_model.fit_transform(embeddings)

            # 2. Build HDBSCAN
            hdbscan_model = HDBSCAN(
                min_cluster_size=hdb_params.get("min_cluster_size", 10),
                min_samples=hdb_params.get("min_samples", 1),
                cluster_selection_epsilon=hdb_params.get("cluster_selection_epsilon", 0.1),
                metric=hdb_params.get("metric", "euclidean"),
                prediction_data=True
            )

            # 3. Build vectorizer
            vectorizer_model = CountVectorizer(
                stop_words="english",
                ngram_range=vect_params.get("ngram_range", (1, 2)),
                min_df=vect_params.get("min_df", 1)
            )

            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            representation_model = [MaximalMarginalRelevance(diversity=0.3)]

            # 4. Train BERTopic
            topic_model = BERTopic(

                embedding_model=None,    # using precomputed embeddings
                umap_model=BaseDimensionalityReduction(),
                hdbscan_model=hdbscan_model,
                vectorizer_model=vectorizer_model,
                ctfidf_model=ctfidf_model,
                representation_model=representation_model,
                calculate_probabilities=True,
                verbose=False
            )

            topics, probs = topic_model.fit_transform(docs, embeddings=reduced_embeddings)

            # 5. Evaluate
            metrics = evaluate_topics(topic_model, docs, reduced_embeddings)

            coherence = metrics["coherence"]
            diversity = metrics["diversity"]
            silhouette = metrics["silhouette"]

            try:
                score = coherence + silhouette # Weighted score (adjust as needed)
            except:
                score = 0

            all_results.append({
                "umap": umap_params,
                "hdbscan": hdb_params,
                "vectorizer": vect_params,
                "coherence": coherence,
                "diversity": diversity,
                "silhouette": silhouette,
                "num_clusters": metrics["cluster_metrics"]["num_clusters"],
                "outliers_pct": metrics["cluster_metrics"]["outlier_percentage"],
                "score": score,
                "model": topic_model
            })

            if score > best_score:
                best_score = score
                best_model = topic_model
                best_config = (umap_params, hdb_params, vect_params)
        except:
            pass

    # Convert results to DataFrame (excluding model objects)
    df_results = pd.DataFrame([
        {k: v for k, v in r.items() if k != "model"} 
        for r in all_results
    ])

    return best_model, best_config, df_results


In [None]:
umap_grid = [
    {"n_components": 20, "n_neighbors": 3},
    {"n_components": 20, "n_neighbors": 5},

    {"n_components": 50, "n_neighbors": 3},
    {"n_components": 50, "n_neighbors": 5},

    {"n_components": 100, "n_neighbors": 3},
    {"n_components": 100, "n_neighbors": 5},
]

hdbscan_grid = [
    {"min_cluster_size": 5, "min_samples": 1},
    {"min_cluster_size": 10, "min_samples": 1},
    {"min_cluster_size": 15, "min_samples": 1},
]

vectorizer_grid = [
    {"ngram_range": (1, 2)},
]

best_model, best_config, results_df = bertopic_grid_search(docs, embeddings_combined, umap_grid, hdbscan_grid, vectorizer_grid)

print("\nBEST CONFIGURATION:")
print(best_config)


In [9]:
results_df

Unnamed: 0,umap,hdbscan,vectorizer,coherence,diversity,silhouette,num_clusters,outliers_pct,score
0,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.968182,0.582833,110,6.88,0.0
1,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.467653,0.95,0.57337,52,8.27,1.041023
2,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.455706,0.927273,0.521334,33,13.43,0.977039
3,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.963366,0.551298,101,12.04,0.0
4,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.456741,0.941667,0.489854,48,10.32,0.946595
5,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.403386,0.928571,0.445714,35,14.91,0.8491
6,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.97193,0.590982,114,9.5,0.0
7,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.944231,0.5602,52,5.41,0.0
8,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.429246,0.917647,0.524942,34,9.99,0.954189
9,"{'n_components': 50, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.97767,0.503365,103,8.11,0.0


In [26]:
umap_grid = [
    {"n_components": 20, "n_neighbors": 3},
    {"n_components": 20, "n_neighbors": 5},

    {"n_components": 15, "n_neighbors": 3},
    {"n_components": 15, "n_neighbors": 5},

    {"n_components": 12, "n_neighbors": 3},
    {"n_components": 12, "n_neighbors": 5},

    {"n_components": 10, "n_neighbors": 3},
    {"n_components": 10, "n_neighbors": 5},

    {"n_components": 5, "n_neighbors": 3},
    {"n_components": 5, "n_neighbors": 5},
]

hdbscan_grid = [
    {"min_cluster_size": 5, "min_samples": 1},
    {"min_cluster_size": 10, "min_samples": 1},
    {"min_cluster_size": 15, "min_samples": 1},
]

vectorizer_grid = [
    {"ngram_range": (1, 2)},
]

best_model, best_config, results_df = bertopic_grid_search(docs, embeddings_combined, umap_grid, hdbscan_grid, vectorizer_grid)

print("\nBEST CONFIGURATION:")
print(best_config)


Parameter search:   0%|          | 0/30 [00:00<?, ?it/s]Gensim CoherenceModel failed.
Traceback (most recent call last):
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/modules/evaluate_bertopic.py", line 100, in compute_topic_coherence
    coherence_model = CoherenceModel(
                      ^^^^^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 214, in __init__
    self.topics = topics
    ^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 429, in topics
    topic_token_ids = self._ensure_elements_are_ids(topic)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 453, in _ensure_elements_are_ids
    raise ValueError('unable to interpret topi


BEST CONFIGURATION:
({'n_components': 20, 'n_neighbors': 3}, {'min_cluster_size': 10, 'min_samples': 1}, {'ngram_range': (1, 2)})


In [27]:
results_df

Unnamed: 0,umap,hdbscan,vectorizer,coherence,diversity,silhouette,num_clusters,outliers_pct,score
0,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.968182,0.582833,110,6.88,0.0
1,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.467653,0.95,0.57337,52,8.27,1.041023
2,"{'n_components': 20, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.455706,0.927273,0.521334,33,13.43,0.977039
3,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.963366,0.551298,101,12.04,0.0
4,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.456741,0.941667,0.489854,48,10.32,0.946595
5,"{'n_components': 20, 'n_neighbors': 5}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.403386,0.928571,0.445714,35,14.91,0.8491
6,"{'n_components': 15, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.97265,0.588669,117,10.81,0.0
7,"{'n_components': 15, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.446328,0.948,0.566037,50,7.62,1.012365
8,"{'n_components': 15, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.437287,0.93125,0.525525,32,14.74,0.962812
9,"{'n_components': 15, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.968627,0.54769,102,8.11,0.0
