In [1]:
import pandas as pd
import numpy as np
import regex as re

import os

from bs4 import BeautifulSoup
from markdown import markdown
from swifter import swifter
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from umap import UMAP

from modules.constants import *

seed = 42

In [2]:
def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(text=True))

    return text

def clean_text(text):
    text = markdown_to_text(text).strip()

    text = text.lower()

    for agent in AGENTS:
        text = text.replace(agent.lower(), "")

    text = re.sub(r"http\S+", " <URL> ",text) #Removing URLs 
    
    # html=re.compile(r'<.*?>') 
    
    # text = html.sub(r'',text) #Removing html tags
    
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Symbols & pictographs
                               "\U0001F680-\U0001F6FF"  # Transport & map symbols
                               "\U0001F700-\U0001F77F"  # Alchemical symbols
                               "\U0001F780-\U0001F7FF"  # Geometric shapes
                               "\U0001F800-\U0001F8FF"  # Supplemental arrows
                               "\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
                               "\U0001FA00-\U0001FA6F"  # Symbols and pictographs extended-A
                               "\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-B
                               "\U00002702-\U000027B0"  # Dingbats
                               "\U000024C2-\U0001F251"  # Enclosed characters
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    #common_punctuation = r'.,?!:;"\'-\\/'
    #text = re.sub(rf'[^\w\s{re.escape(common_punctuation)}]', '', text)
    #text = re.sub(r'\s+', ' ', text).strip()
    return text

In [3]:
model = SentenceTransformer("google/embeddinggemma-300m", device="cuda:0" )
tokenizer = model.tokenizer

def get_embeddings(
    text: str,
    token_limit: int = 2000,
    stride: int = 1024
) -> np.ndarray:

    tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(tokens)

    embeddings = []
    start = 0
    while start < total_tokens:
        end = min(start + token_limit, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunk_text = "task: clustering | query: " + chunk_text

        # Encode chunk with task type
        emb = model.encode(
            chunk_text,
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        embeddings.append(emb)
        start += stride

    if not embeddings:
        print(f"No embeddings generated — check your input text or model setup. text: {text}")

    pooled_embedding = np.mean(np.vstack(embeddings), axis=0)

    return pooled_embedding

In [4]:
def prepare_embeddings(df):
    embeddings = []
    embeddings_title = []
    data = []
    ids = []
    for _, row in tqdm(df.iterrows()):
        title_emb = get_embeddings(row["title"]) if row["title"].strip() else None
        body_emb  = get_embeddings(row["body"])  if row["body"].strip() else None

        if title_emb is None and body_emb is None:
            continue

        if title_emb is None:
            print("title is none")
            title_emb = np.zeros(body_emb.shape)
        if body_emb is None:
            print("body is none")
            body_emb = np.zeros(title_emb.shape)
            

        ids.append(row["id"])
        embeddings.append(np.concatenate((title_emb, body_emb), axis=0))
        embeddings_title.append(title_emb)
        data.append(row["title"] + row["body"])
        
    return np.array(embeddings_title), np.array(embeddings)

def get_data(df):
    data = []
    data_title = []
    for _, row in tqdm(df.iterrows()):
        data.append(row["title"] + row["body"])
        data_title.append(row["title"])

    return np.array(data_title), np.array(data)

def load_embeddings(type):
    emb_title = None
    emb = None
    
    if type == "ai":
        emb_title = np.load("./Outputs/Embeddings/AIEmbeddings_title.npy")
        emb = np.load("./Outputs/Embeddings/AIEmbeddings.npy")
    elif type == "human":
        emb_title = np.load("./Outputs/Embeddings/HumanEmbeddings_title.npy")
        emb = np.load("./Outputs/Embeddings/HumanEmbeddings.npy")

    return emb_title, emb

In [5]:
df_ai = pd.read_csv("Outputs/PerformancePRs/POP_PULL_Requests_LLM_filtered.csv")
df_ai["title"] = df_ai["title"].fillna("").apply(lambda x: clean_text(x))
df_ai["body"] = df_ai["body"].fillna("").apply(lambda x: clean_text(x))

df_human = pd.read_csv("Outputs/PerformancePRs/HUMAN_PULL_Requests_llm_filtered.csv")
df_human["title"] = df_human["title"].fillna("").apply(lambda x: clean_text(x))
df_human["body"] = df_human["body"].fillna("").apply(lambda x: clean_text(x))


Call to deprecated method findAll. (Replaced by find_all) -- Deprecated since version 4.0.0.


Call to deprecated method findAll. (Replaced by find_all) -- Deprecated since version 4.0.0.



In [None]:
os.makedirs("Outputs/Embeddings", exist_ok=True)

embeddings_ai_title, embeddings_ai = prepare_embeddings(df_ai) 
np.save("./Outputs/Embeddings/AIEmbeddings.npy", embeddings_ai)
np.save("./Outputs/Embeddings/AIEmbeddings_title.npy", embeddings_ai_title)

data_ai_title, data_ai = get_data(df_ai)

embeddings_human_title, embeddings_human = prepare_embeddings(df_human) 
np.save("./Outputs/Embeddings/HumanEmbeddings.npy", embeddings_human)
np.save("./Outputs/Embeddings/HumanEmbeddings_title.npy", embeddings_human_title)

data_human_title, data_human = get_data(df_human)


In [36]:
embeddings_ai_title, embeddings_ai = load_embeddings("ai") 
data_ai_title, data_ai = get_data(df_ai)

embeddings_human_title, embeddings_human = load_embeddings("human") 
data_human_title, data_human = get_data(df_human)

1160it [00:00, 18495.09it/s]
273it [00:00, 18575.73it/s]


In [67]:
os.makedirs("Outputs/BERTopic", exist_ok=True)

hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
umap_model = UMAP(n_neighbors=5, n_components=10, min_dist=0.0, metric='cosine', random_state=seed)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words= "english", ngram_range=(1, 1))
representation_model = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)]

In [69]:
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    top_n_words=10,
    verbose=True,
    nr_topics="auto"
)

docs = data_ai
embs = embeddings_ai

topics, probs = topic_model.fit_transform(docs, embeddings=embs)

2025-12-07 13:03:42,383 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 13:03:45,235 - BERTopic - Dimensionality - Completed ✓
2025-12-07 13:03:45,236 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 13:03:45,371 - BERTopic - Cluster - Completed ✓
2025-12-07 13:03:45,372 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-12-07 13:03:45,504 - BERTopic - Representation - Completed ✓
2025-12-07 13:03:45,504 - BERTopic - Topic reduction - Reducing number of topics
2025-12-07 13:03:45,523 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 13:04:07,584 - BERTopic - Representation - Completed ✓
2025-12-07 13:04:07,585 - BERTopic - Topic reduction - Reduced number of topics from 58 to 43


In [70]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,208,-1_github_dev_fixes_coding,"[github, dev, fixes, coding, gradle, pnpm, age...",[add get-sqldscserverprotocol public command w...
1,0,360,0_fixes_coding_lint_implementation,"[fixes, coding, lint, implementation, ui, esli...",[feat: implement async notification and teleme...
2,1,109,1_compiler_onnxscript_compile_runtime,"[compiler, onnxscript, compile, runtime, x86, ...",[convert wormholecontract to sol_storage! macr...
3,2,47,2_alpha_agi_insight_v1_insight_browser_v1_alph...,"[alpha_agi_insight_v1, insight_browser_v1, alp...",[[alpha_factory] tighten insight bundle size c...
4,3,42,3_mochi_benchmark_testswifttranspiler_rosetta_...,"[mochi_benchmark, testswifttranspiler_rosetta_...",[add benchmark support to c++ transpiler tests...
5,4,20,4_optimize_ci_github_caching_cache,"[optimize_ci, github, caching, cache, git, cir...",[add vcpkg dependency caching to windows ci wo...
6,5,19,5_compilehashouterjoin_compilejoinquery_outer_...,"[compilehashouterjoin, compilejoinquery, outer...",[improve join performance with hashed left joi...
7,6,18,6_gpu_model_runner_webgpu_test_paged_attention...,"[gpu_model_runner, webgpu, test_paged_attentio...",[[core] freeze gc during cuda graph capture to...
8,7,18,7_buildslotswithdateranges_scheduling_calendar...,"[buildslotswithdateranges, scheduling, calenda...",[feat: optimize slot generation with inverted ...
9,8,16,8_mvn_wget_reentrantfilelocktestsummary_commit...,"[mvn, wget, reentrantfilelocktestsummary, comm...",[implement kv batch putmanysummary\n\nadd putm...


In [71]:
new_topics = topic_model.reduce_outliers(docs, topics, embeddings=embs, strategy="embeddings")
#new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
topic_model.update_topics(docs, topics=new_topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model )
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,463,0_coding_fixes_cache_optimizations,"[coding, fixes, cache, optimizations, implemen...",[feat: implement async notification and teleme...
1,1,137,1_compiler_testvm_ir_compile_onnxscript,"[compiler, testvm_ir, compile, onnxscript, x86...",[convert wormholecontract to sol_storage! macr...
2,2,47,2_alpha_agi_insight_v1_insight_browser_v1_alph...,"[alpha_agi_insight_v1, insight_browser_v1, alp...",[[alpha_factory] tighten insight bundle size c...
3,3,46,3_mochi_benchmark_testswifttranspiler_rosetta_...,"[mochi_benchmark, testswifttranspiler_rosetta_...",[add benchmark support to c++ transpiler tests...
4,4,31,4_github_caching_cache_git,"[github, caching, cache, git, circleci, compil...",[add vcpkg dependency caching to windows ci wo...
5,5,21,5_compilehashouterjoin_compilejoinquery_outer_...,"[compilehashouterjoin, compilejoinquery, outer...",[improve join performance with hashed left joi...
6,6,21,6_gpu_model_runner_webgpu_test_paged_attention...,"[gpu_model_runner, webgpu, test_paged_attentio...",[[core] freeze gc during cuda graph capture to...
7,7,18,7_buildslotswithdateranges_scheduling_calendar...,"[buildslotswithdateranges, scheduling, calenda...",[feat: optimize slot generation with inverted ...
8,8,17,8_mvn_wget_java_jsonparser,"[mvn, wget, java, jsonparser, parsersummary, t...",[implement kv batch putmanysummary\n\nadd putm...
9,9,16,9_dotnet_csproj_tests_inlining,"[dotnet, csproj, tests, inlining, net9, struct...",[apply aggressiveinlining attributessummary\n\...


In [72]:
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    top_n_words=20,
    verbose=True,
    nr_topics="auto"
)
docs = data_human
embs = embeddings_human

topics, probs = topic_model.fit_transform(docs, embeddings=embs)

2025-12-07 13:04:48,992 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-07 13:04:49,313 - BERTopic - Dimensionality - Completed ✓
2025-12-07 13:04:49,314 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-07 13:04:49,329 - BERTopic - Cluster - Completed ✓
2025-12-07 13:04:49,330 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-12-07 13:04:49,366 - BERTopic - Representation - Completed ✓
2025-12-07 13:04:49,366 - BERTopic - Topic reduction - Reducing number of topics
2025-12-07 13:04:49,371 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-07 13:05:00,761 - BERTopic - Representation - Completed ✓
2025-12-07 13:05:00,762 - BERTopic - Topic reduction - Reduced number of topics from 14 to 14


In [73]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58,-1_fixes_dotnet_implementation_benchmark,"[fixes, dotnet, implementation, benchmark, inv...",[reduce http headers validation overheadwhen a...
1,0,41,0_github_lint_fixes_mlflow,"[github, lint, fixes, mlflow, changeset, pnpm,...",[swap doc preview and test steps to view the p...
2,1,35,1_cuda_optimisations_bugfixes_fix,"[cuda, optimisations, bugfixes, fix, optimisat...",[disable cache on ci on windows because downlo...
3,2,24,2_coderabbit_chatgpt_refactor_release,"[coderabbit, chatgpt, refactor, release, docke...",[grida canvas - skia-safe rust backend - stand...
4,3,16,3_linqfixes_linq_compiling_runtime,"[linqfixes, linq, compiling, runtime, concurre...",[add comprehensive vibetunnel protocol benchma...
5,4,14,4_builtins_bun_tests_typescript,"[builtins, bun, tests, typescript, builtin, de...",[report memory cost of sourcemaps to gcwhat do...
6,5,13,5_parser_refactor_buffer_chunksprefetch,"[parser, refactor, buffer, chunksprefetch, pre...",[@remotion/renderer: consider --memory flag fr...
7,6,12,6_torch_memory_saver_c_compiler_compiler_gemma...,"[torch_memory_saver, c_compiler, compiler, gem...",[[mlas] dequantizelinear int8/uint8description...
8,7,12,7_refactor_workersupdateuserbalancestatscachew...,"[refactor, workersupdateuserbalancestatscachew...",[cached repetitive data lookups for creator an...
9,8,12,8_execute_gpt_4v_request_crewagentparser_jsonp...,"[execute_gpt_4v_request, crewagentparser, json...",[ speed up function execute_gpt_4v_request by ...


In [74]:
new_topics = topic_model.reduce_outliers(docs, topics, embeddings=embs, strategy="embeddings")
#new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
topic_model.update_topics(docs, topics=new_topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    ctfidf_model=ctfidf_model )
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,54,0_github_refactor_lint_fixes,"[github, refactor, lint, fixes, mlflow, pnpm, ...",[swap doc preview and test steps to view the p...
1,1,35,1_cuda_optimisations_bugfixes_fix,"[cuda, optimisations, bugfixes, fix, optimisat...",[disable cache on ci on windows because downlo...
2,2,27,2_coderabbit_fixes_release_refactor,"[coderabbit, fixes, release, refactor, renderi...",[grida canvas - skia-safe rust backend - stand...
3,3,35,3_benchmark_dotnet_msbuild_netperf,"[benchmark, dotnet, msbuild, netperf, performa...",[add comprehensive vibetunnel protocol benchma...
4,4,18,4_builtins_bun_tests_typescript,"[builtins, bun, tests, typescript, builtin, de...",[report memory cost of sourcemaps to gcwhat do...
5,5,16,5_parser_parse_chunksprefetch_preload,"[parser, parse, chunksprefetch, preload, resiz...",[@remotion/renderer: consider --memory flag fr...
6,6,15,6_cxx_compiler_c_compiler_cmake_compiler,"[cxx_compiler, c_compiler, cmake, compiler, to...",[[mlas] dequantizelinear int8/uint8description...
7,7,13,7_refactor_caching_invoices_coderabbit,"[refactor, caching, invoices, coderabbit, impr...",[cached repetitive data lookups for creator an...
8,8,12,8_execute_gpt_4v_request_crewagentparser_jsonp...,"[execute_gpt_4v_request, crewagentparser, json...",[ speed up function execute_gpt_4v_request by ...
9,9,13,9_sdk_flutter_cache_manager_apphost_git,"[sdk, flutter_cache_manager, apphost, git, ui,...",[finishing up work on appcontrol manager v.1.9...
