# Imports

In [20]:
import pandas as pd
import numpy as np
import regex as re
import itertools
import os


from bs4 import BeautifulSoup
from markdown import markdown
from swifter import swifter
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from umap import UMAP

from modules.evaluate_bertopic import evaluate_topics
from modules.constants import *
from modules.utilities import read_aidev

# Single random seed for the notebook; it is passed as `random_state` to every
# UMAP model built below so the reductions use a fixed random state.
seed = 42

# Load Data

In [None]:
# df = pd.read_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered.csv")

# df_pr_type = read_aidev(FileName.POP_PR_TASK_TYPE)
# df_pr_type = df_pr_type[df_pr_type["type"] == "perf"]

# df_pop_all = read_aidev(FileName.POP_PULL_REQUEST)

# pr_type_ids = df_pr_type["id"].tolist()
# perf_pr_ids = df["id"].tolist()

# cnt = 0
# ids = []
# for id in pr_type_ids:
#     if id not in perf_pr_ids:
#         cnt += 1
#         perf_pr_ids.append(id)

# print(f"{cnt} PRs not found in our list")

# df = df_pop_all[df_pop_all["id"].isin(perf_pr_ids)]

# df.to_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered_final.csv", index = False)

61 PRs not found in our list


In [28]:
# Load the final PR table. This is the CSV written by the commented-out cell
# above (LLM-filtered PRs plus the "perf"-typed PRs missing from that list).
df = pd.read_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered_final.csv")


In [29]:
# Replace missing titles/bodies with empty strings *before* converting to
# lists. `fillna` returns a new Series, so its result has to be the one that is
# turned into a list; otherwise NaN values reach `docs` as the text "nan".
data_title = df["title"].fillna("").tolist()
data_body = df["body"].fillna("").tolist()

# One document per PR: title on the first line, body underneath.
docs = [str(i) + "\n" + str(j) for i, j in zip(data_title, data_body)]

# Embedding Generation

In [None]:
# Sentence-embedding model used to encode `docs` (and handed to BERTopic as
# `embedding_model` further down); loaded onto the first CUDA device.
# NOTE(review): judging by the checkpoint name this is an 8B-parameter model,
# so it presumably needs a large-memory GPU — confirm before re-running.
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device="cuda:0")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Encode every document with the model loaded above, four documents per batch.
embeddings = model.encode(docs, batch_size = 4, show_progress_bar = True)

# Save the vectors to disk.
# NOTE(review): this writes "Qwen8PlainEmbeddings.npy" into the working
# directory, while the UMAP and parameter-tuning cells load
# "./Outputs/Embeddings/Qwen8Embeddings.npy" — confirm which file is intended.
np.save("Qwen8PlainEmbeddings.npy",embeddings)

# UMAP

In [3]:
# Load previously saved document embeddings from disk (this rebinds
# `embeddings`, replacing any array produced by the encoding cell).
# NOTE(review): the path differs from the file name the encoding cell saves
# to ("Qwen8PlainEmbeddings.npy") — confirm this is the intended file.
embeddings = np.load("./Outputs/Embeddings/Qwen8Embeddings.npy")

In [4]:
# UMAP settings for the reduction that feeds the clustering step.
n_component = 50
n_neighbors = 3

# Reduce the loaded embeddings to `n_component` dimensions; the reduced array
# is what BERTopic receives later on.
umap_model = UMAP(
    n_components=n_component,
    n_neighbors=n_neighbors,
    metric="cosine",
    min_dist=0.1,
    random_state=seed,
)
embeddings_reduced = umap_model.fit_transform(embeddings)



[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.[0m



# BERTopic

In [5]:
# Make sure the folder that receives the topic-model CSVs exists
# (no error if it is already there).
os.makedirs("Outputs/BERTopic", exist_ok=True)

In [None]:
# Clustering model applied to the UMAP-reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=1,
    cluster_selection_epsilon=0.1,
    metric="euclidean",
    prediction_data=True,
)

# Topic-word extraction: uni-/bi-gram counts (English stop words removed)
# weighted by class-based TF-IDF, then refined by the representation models
# in the order listed.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = [
    KeyBERTInspired(),
    MaximalMarginalRelevance(diversity=0.3),
]

In [None]:
# Assemble the topic model from the components configured above.
topic_model = BERTopic(
    embedding_model=model,
    # The embeddings given to fit_transform were already reduced with UMAP,
    # so BERTopic's own reduction step is swapped for the base (no-op) class.
    umap_model=BaseDimensionalityReduction(),
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    top_n_words=10,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the documents using the precomputed, reduced embeddings.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings_reduced)

# Per-topic summary table; displayed as the cell output.
topic_info_df = topic_model.get_topic_info()
topic_info_df

2025-12-11 18:30:13,210 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-11 18:30:13,211 - BERTopic - Dimensionality - Completed ✓
2025-12-11 18:30:13,212 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-11 18:30:13,406 - BERTopic - Cluster - Completed ✓
2025-12-11 18:30:13,408 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-11 18:30:13,829 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,95,-1_calendar_unified_booking_tickets,"[calendar, unified, booking, tickets, history,...",[Optimize invoices page to load only invoices ...
1,0,101,0_workflow_jobs_job_cache,"[workflow, jobs, job, cache, ci, docker, minut...",[ci: Add conditional testing for wef and webvi...
2,1,78,1_nbsp_nbsp nbsp_tool_llm,"[nbsp, nbsp nbsp, tool, llm, maxage, token, up...",[feat: enhance text generation panel with exec...
3,2,56,2_moon_buffer_test microsoftazurecosmostestsmi...,"[moon, buffer, test microsoftazurecosmostestsm...",[Prevent over-counting Blob memory usage\n### ...
4,3,52,3_audio_image_psutil_thinking,"[audio, image, psutil, thinking, thinking fiel...",[stm32/eth: Improve Ethernet driver with link ...
5,4,48,4_testsvm_vm_ir_testsvm run,"[testsvm, vm, ir, testsvm run, slow testsvm, t...",[refactor(twap): implement strategy pattern fo...
6,5,47,5_command npm_ci dns_npm ci_npm,"[command npm, ci dns, npm ci, npm, ci, dns blo...",[fix(api): Update mocha configuration to use S...
7,6,43,6_hydration_species_risedev_component,"[hydration, species, risedev, component, psql,...",[Implement error node caching for improved Tre...
8,7,35,7_rosetta_run rosetta_benchmark_transpiler,"[rosetta, run rosetta, benchmark, transpiler, ...",[Add benchmark support to Fortran transpiler\n...
9,8,31,8_telemetry_phase_benchmarks_napi,"[telemetry, phase, benchmarks, napi, azure, be...",[Add performance benchmarking and AOT file siz...


In [10]:
# Per-document topic information from the fitted model.
topic_df = topic_model.get_document_info(docs)

# Copy the topic columns onto the PR table (pandas aligns them on the index).
for column_name in ["Topic", "Probability", "Representative_document"]:
    df[column_name] = topic_df[column_name]

df

Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,merged_at,repo_id,repo_url,html_url,llm_output,Topic,Probability,Representative_document
0,3164503419,40,Fix Claude animation flickering with vt10x-ins...,## üéØ Problem: Claude's Thinking Animation Caus...,Claude_Code,2891702,hjanuschka,closed,2025-06-20T22:47:18Z,2025-06-21T11:51:22Z,,1002552148,https://api.github.com/repos/amantus-ai/vibetu...,https://github.com/amantus-ai/vibetunnel/pull/40,analysisWe need to classify as 'performance' o...,24,0.045818,True
1,3273233066,1037,feat: implement comprehensive species tracking...,## Summary\nThis PR implements a comprehensive...,Claude_Code,7030001,tphakala,closed,2025-07-29T11:21:11Z,2025-07-29T13:49:45Z,2025-07-29T13:49:45Z,707764474,https://api.github.com/repos/tphakala/birdnet-go,https://github.com/tphakala/birdnet-go/pull/1037,analysisWe need to classify as performance or ...,6,1.000000,True
2,3219880512,10340,feat(backend): Integrate GCS file storage with...,## Summary\n\nThis PR introduces a complete cl...,Claude_Code,76959103,majdyz,closed,2025-07-10T15:52:56Z,2025-07-18T03:20:54Z,2025-07-18T03:20:54Z,614765452,https://api.github.com/repos/Significant-Gravi...,https://github.com/Significant-Gravitas/AutoGP...,analysisWe need to classify as 'performance' o...,1,0.198623,True
3,2876006908,3375,Improve list and collection materializers perf...,# Optimized Collection Materializers with Batc...,Claude_Code,3348134,strickvl,closed,2025-02-24T19:52:57Z,2025-04-20T19:47:42Z,,314197645,https://api.github.com/repos/zenml-io/zenml,https://github.com/zenml-io/zenml/pull/3375,analysisWe need to classify as 'performance' o...,26,1.000000,True
4,3142181649,19,Replace CLI subprocess approach with Claude Co...,## Description\n\nReplace the current CLI subp...,Claude_Code,80381,sugyan,closed,2025-06-13T04:05:15Z,2025-06-13T14:14:33Z,2025-06-13T14:14:33Z,999285986,https://api.github.com/repos/sugyan/claude-cod...,https://github.com/sugyan/claude-code-webui/pu...,analysisWe need to classify as 'performance' o...,19,1.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,3152003781,2037,Optimize Chat API/Job schema transfer by remov...,# Optimize Chat API/Job schema transfer by rem...,Devin,158243242,devin-ai-integration[bot],closed,2025-06-17T04:17:12Z,2025-06-17T07:08:49Z,2025-06-17T07:08:49Z,839216423,https://api.github.com/repos/liam-hq/liam,https://github.com/liam-hq/liam/pull/2037,analysisWe need to classify as 'performance' o...,24,1.000000,False
1156,2920951577,1064,feat: improve search functionality with pagina...,Closes #1063\n\nThis PR improves the search fu...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:07:04Z,2025-03-15T05:36:51Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1064,analysisWe need to classify as performance or ...,17,1.000000,False
1157,2920955200,1065,feat(dashboard): add improved app filtering wi...,# Add search and filtering functionality to th...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:08:42Z,2025-03-15T05:37:21Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1065,analysisWe need to classify as performance or ...,17,1.000000,False
1158,2920983723,1066,perf: optimize MAU loading mechanism for bette...,Closes #1063\n\nThis PR optimizes the MAU load...,Devin,158243242,devin-ai-integration[bot],closed,2025-03-14T18:19:38Z,2025-03-15T05:38:03Z,,442321089,https://api.github.com/repos/Cap-go/capgo,https://github.com/Cap-go/capgo/pull/1066,analysisWe need to classify as 'performance' o...,17,1.000000,False


In [11]:
# Write the per-topic summary and the PR table (now carrying the topic
# columns) to the BERTopic output folder, without the DataFrame index.
topic_info_df.to_csv("./Outputs/BERTopic/Topic_Info.csv", index = False)
df.to_csv("./Outputs/BERTopic/All_PR_Topics.csv", index = False)

In [14]:
os.makedirs("./Outputs/BERTopic/Topics", exist_ok=True)

# One CSV per topic id (including -1), rows ordered from the highest to the
# lowest assignment probability; the topic size is printed as a quick check.
for topic in topic_info_df["Topic"].tolist():
    df_topic = (
        df.loc[df["Topic"] == topic]
        .sort_values("Probability", ascending=False)
    )
    df_topic.to_csv(f"./Outputs/BERTopic/Topics/topic_{topic}.csv", index=False)
    print(f"{topic} : {len(df_topic)}")

-1 : 95
0 : 101
1 : 78
2 : 56
3 : 52
4 : 48
5 : 47
6 : 43
7 : 35
8 : 31
9 : 31
10 : 31
11 : 29
12 : 28
13 : 26
14 : 26
15 : 24
16 : 24
17 : 24
18 : 24
19 : 23
20 : 21
21 : 19
22 : 18
23 : 18
24 : 17
25 : 16
26 : 15
27 : 14
28 : 14
29 : 13
30 : 13
31 : 12
32 : 12
33 : 11
34 : 11
35 : 10
36 : 10
37 : 10
38 : 10
39 : 10
40 : 10


# Parameter Tuning

In [23]:
# Load the saved document embeddings (same file as in the UMAP section); they
# are the unreduced input handed to the grid search below.
embeddings_combined = np.load("./Outputs/Embeddings/Qwen8Embeddings.npy")

In [None]:
def bertopic_grid_search(
    docs,
    embeddings,
    umap_params_grid,
    hdbscan_params_grid,
    vectorizer_params_grid=None
):
    """
    Grid-search UMAP, HDBSCAN and CountVectorizer settings for BERTopic.

    For every combination of the three grids the embeddings are reduced with
    UMAP, a BERTopic model is fitted on the reduced embeddings and the result
    is scored with ``evaluate_topics``. The combination with the highest
    ``coherence + silhouette`` wins.

    Parameters
    ----------
    docs : list of str
        Documents to model.
    embeddings : array-like
        Precomputed document embeddings that are fed to ``UMAP.fit_transform``.
    umap_params_grid : list of dict
        UMAP settings to try. Keys read (default when absent): ``n_neighbors``
        (15), ``n_components`` (10), ``min_dist`` (0.1), ``metric``
        ("cosine").
    hdbscan_params_grid : list of dict
        HDBSCAN settings to try. Keys read (default when absent):
        ``min_cluster_size`` (10), ``min_samples`` (1),
        ``cluster_selection_epsilon`` (0.1), ``metric`` ("euclidean").
    vectorizer_params_grid : list of dict, optional
        CountVectorizer settings to try. Keys read (default when absent):
        ``ngram_range`` ((1, 2)), ``min_df`` (1). ``None`` means a single run
        with those defaults.

    Returns
    -------
    best_model : BERTopic or None
        Fitted model with the highest score, ``None`` if no configuration
        completed.
    best_config : tuple or None
        ``(umap_params, hdbscan_params, vectorizer_params)`` of ``best_model``.
    df_results : pandas.DataFrame
        One row per configuration that completed, holding the parameters, the
        metrics and the combined score.

    Notes
    -----
    A configuration that raises is reported and skipped so the rest of the
    search can continue; ``KeyboardInterrupt`` is not intercepted.
    """
    # Passing None on to len()/itertools.product would raise a TypeError, so
    # fall back to one empty dict, which makes every vectorizer default apply.
    if vectorizer_params_grid is None:
        vectorizer_params_grid = [{}]

    all_results = []
    best_score = float("-inf")
    best_model = None
    best_config = None

    total_combinations = (
        len(umap_params_grid) *
        len(hdbscan_params_grid) *
        len(vectorizer_params_grid)
    )

    search_space = itertools.product(
        umap_params_grid, hdbscan_params_grid, vectorizer_params_grid
    )

    for umap_params, hdb_params, vect_params in tqdm(
        search_space,
        total=total_combinations,
        desc="Parameter search"
    ):
        try:
            # 1. Build UMAP and reduce the embeddings
            umap_model = UMAP(
                n_neighbors=umap_params.get("n_neighbors", 15),
                n_components=umap_params.get("n_components", 10),
                min_dist=umap_params.get("min_dist", 0.1),
                metric=umap_params.get("metric", "cosine"),
                random_state=seed
            )

            reduced_embeddings = umap_model.fit_transform(embeddings)

            # 2. Build HDBSCAN
            hdbscan_model = HDBSCAN(
                min_cluster_size=hdb_params.get("min_cluster_size", 10),
                min_samples=hdb_params.get("min_samples", 1),
                cluster_selection_epsilon=hdb_params.get("cluster_selection_epsilon", 0.1),
                metric=hdb_params.get("metric", "euclidean"),
                prediction_data=True
            )

            # 3. Build vectorizer
            vectorizer_model = CountVectorizer(
                stop_words="english",
                ngram_range=vect_params.get("ngram_range", (1, 2)),
                min_df=vect_params.get("min_df", 1)
            )

            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            representation_model = [MaximalMarginalRelevance(diversity=0.3)]

            # 4. Train BERTopic
            topic_model = BERTopic(
                embedding_model=None,    # using precomputed embeddings
                # Reduction already happened above, so BERTopic's own
                # reduction step is replaced by the base (no-op) class.
                umap_model=BaseDimensionalityReduction(),
                hdbscan_model=hdbscan_model,
                vectorizer_model=vectorizer_model,
                ctfidf_model=ctfidf_model,
                representation_model=representation_model,
                calculate_probabilities=True,
                verbose=False
            )

            topics, probs = topic_model.fit_transform(docs, embeddings=reduced_embeddings)

            # 5. Evaluate
            metrics = evaluate_topics(topic_model, docs, reduced_embeddings)

            coherence = metrics["coherence"]
            diversity = metrics["diversity"]
            silhouette = metrics["silhouette"]

            # A metric that is not a number (e.g. None when it could not be
            # computed) cannot be added; such runs are kept with a score of 0.
            try:
                score = coherence + silhouette # Weighted score (adjust as needed)
            except TypeError:
                score = 0

            # Only plain values are stored: keeping every fitted model here
            # would hold all of them in memory for the whole search.
            all_results.append({
                "umap": umap_params,
                "hdbscan": hdb_params,
                "vectorizer": vect_params,
                "coherence": coherence,
                "diversity": diversity,
                "silhouette": silhouette,
                "num_clusters": metrics["cluster_metrics"]["num_clusters"],
                "outliers_pct": metrics["cluster_metrics"]["outlier_percentage"],
                "score": score
            })

            if score > best_score:
                best_score = score
                best_model = topic_model
                best_config = (umap_params, hdb_params, vect_params)
        except Exception as exc:
            # Best-effort search: report the failing configuration instead of
            # dropping it silently, then move on to the next one.
            tqdm.write(
                f"Skipping configuration umap={umap_params}, "
                f"hdbscan={hdb_params}, vectorizer={vect_params}: "
                f"{type(exc).__name__}: {exc}"
            )

    # Convert results to DataFrame
    df_results = pd.DataFrame(all_results)

    return best_model, best_config, df_results


In [25]:
# UMAP settings to try: every pairing of target dimensionality and
# neighbourhood size.
umap_grid = [
    {"n_components": n_components, "n_neighbors": n_neighbors}
    for n_components in (50, 100)
    for n_neighbors in (3, 5)
]

# HDBSCAN settings to try: only the minimum cluster size varies.
hdbscan_grid = [
    {"min_cluster_size": min_cluster_size, "min_samples": 1}
    for min_cluster_size in (5, 10, 15)
]

# A single vectorizer setting (uni- and bi-grams).
vectorizer_grid = [{"ngram_range": (1, 2)}]

best_model, best_config, results_df = bertopic_grid_search(
    docs=docs,
    embeddings=embeddings_combined,
    umap_params_grid=umap_grid,
    hdbscan_params_grid=hdbscan_grid,
    vectorizer_params_grid=vectorizer_grid,
)

print("\nBEST CONFIGURATION:")
print(best_config)


Parameter search:   0%|          | 0/12 [00:00<?, ?it/s]Gensim CoherenceModel failed.
Traceback (most recent call last):
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/modules/evaluate_bertopic.py", line 100, in compute_topic_coherence
    coherence_model = CoherenceModel(
                      ^^^^^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 214, in __init__
    self.topics = topics
    ^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 429, in topics
    topic_token_ids = self._ensure_elements_are_ids(topic)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cs/grad/opumni/Research/MSR-MiningChallenge26/.conda/lib/python3.12/site-packages/gensim/models/coherencemodel.py", line 453, in _ensure_elements_are_ids
    raise ValueError('unable to interpret topi


BEST CONFIGURATION:
({'n_components': 50, 'n_neighbors': 3}, {'min_cluster_size': 10, 'min_samples': 1}, {'ngram_range': (1, 2)})





In [26]:
# Display the metrics table returned by the grid search (one row per
# configuration it recorded).
results_df

Unnamed: 0,umap,hdbscan,vectorizer,coherence,diversity,silhouette,num_clusters,outliers_pct,score
0,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.976147,0.576621,109,8.02,0.0
1,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.431993,0.95122,0.546285,41,8.19,0.978278
2,"{'n_components': 50, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.422715,0.933333,0.516327,27,12.41,0.939042
3,"{'n_components': 50, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.973196,0.531784,97,12.41,0.0
4,"{'n_components': 50, 'n_neighbors': 5}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.44769,0.948837,0.440089,43,12.41,0.887778
5,"{'n_components': 50, 'n_neighbors': 5}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.436211,0.9,0.391353,26,16.38,0.827564
6,"{'n_components': 100, 'n_neighbors': 3}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.971053,0.580793,114,9.91,0.0
7,"{'n_components': 100, 'n_neighbors': 3}","{'min_cluster_size': 10, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.435197,0.953333,0.526906,45,12.76,0.962103
8,"{'n_components': 100, 'n_neighbors': 3}","{'min_cluster_size': 15, 'min_samples': 1}","{'ngram_range': (1, 2)}",0.404357,0.911538,0.502089,26,14.14,0.906445
9,"{'n_components': 100, 'n_neighbors': 5}","{'min_cluster_size': 5, 'min_samples': 1}","{'ngram_range': (1, 2)}",,0.970408,0.539261,98,11.47,0.0
