In [48]:
from datasets import load_dataset, Dataset
import pandas as pd
import torch
import numpy as np

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [3]:
def read_excels(file_paths):
    """Read several Excel files and stack them into a single DataFrame.

    Parameters
    ----------
    file_paths : iterable
        Locations handed, one at a time, to ``pd.read_excel`` with its
        default arguments.

    Returns
    -------
    pandas.DataFrame
        The rows of every file in the order given, re-indexed from 0.
    """
    loaded = []
    for path in file_paths:
        loaded.append(pd.read_excel(path))
    return pd.concat(loaded, ignore_index=True)

In [4]:
def read_csvs(file_paths):
    """Read several CSV files and stack them into a single DataFrame.

    Parameters
    ----------
    file_paths : iterable
        Locations (or file-like objects) handed, one at a time, to
        ``pd.read_csv`` with its default arguments.

    Returns
    -------
    pandas.DataFrame
        The rows of every file in the order given, re-indexed from 0.
    """
    loaded = []
    for path in file_paths:
        loaded.append(pd.read_csv(path))
    return pd.concat(loaded, ignore_index=True)

In [5]:
# CSV file(s) with the failed-case summary; passed to read_csvs below.
read_csv_files = ['../dataset/dataset_summary_failed_case.csv']

In [6]:
# Excel file(s) with the failed-case summary (same base name as the CSV);
# passed to read_excels below.
read_files = ['../dataset/dataset_summary_failed_case.xlsx']

In [7]:
# Load the CSV summary into the "initial" frame.
failed_cases_df_initial = read_csvs(read_csv_files)

In [8]:
# Load the Excel summary into a second frame.
failed_cases_df = read_excels(read_files)

In [9]:
# Preview the first rows of the Excel-sourced frame.
failed_cases_df.head()

Unnamed: 0,file_hash,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description
0,d3204438087d7d924f50237ca1438a01c5d8bf5e26bb23...,SyntaxError,unmatched ')',unmatched ')',"File ""results/experiment_outputs/20251004T0504..."
1,f131e8b1b384f595ae4976583b055a565560f5f1d70164...,SyntaxError,closing parenthesis ']' does not match opening...,closing parenthesis ']' does not match opening...,"File ""results/experiment_outputs/20251004T0504..."
2,410c8bffc380e1f887d96db79780392e309662f3b6e135...,SyntaxError,invalid syntax. Perhaps you forgot a comma?,"invalid whitespace in between expression, math...","File ""results/experiment_outputs/20251003T1658..."
3,46e112f054ecc3b819ca39178321dc01d23be5e49c2151...,SyntaxError,too many nested parentheses,too many nested parentheses,"File ""results/experiment_outputs/20251003T0117..."
4,71ddfa42351aae24b5c2bfe98c77117fd09f683ac3da97...,SyntaxError,closing parenthesis ']' does not match opening...,closing parenthesis ']' does not match opening...,"File ""results/experiment_outputs/20251004T0504..."


In [10]:
# Preview the first rows of the CSV-sourced frame for comparison.
failed_cases_df_initial.head()

Unnamed: 0,file_hash,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description
0,d3204438087d7d924f50237ca1438a01c5d8bf5e26bb23...,SyntaxError,unmatched ')',unmatched ')',"File ""results/experiment_outputs/20251004T0504..."
1,f131e8b1b384f595ae4976583b055a565560f5f1d70164...,SyntaxError,closing parenthesis ']' does not match opening...,closing parenthesis ']' does not match opening...,"File ""results/experiment_outputs/20251004T0504..."
2,410c8bffc380e1f887d96db79780392e309662f3b6e135...,SyntaxError,invalid syntax. Perhaps you forgot a comma?,invalid syntax. perhaps you forgot a comma?,"File ""results/experiment_outputs/20251003T1658..."
3,46e112f054ecc3b819ca39178321dc01d23be5e49c2151...,SyntaxError,too many nested parentheses,too many nested parentheses,"File ""results/experiment_outputs/20251003T0117..."
4,71ddfa42351aae24b5c2bfe98c77117fd09f683ac3da97...,SyntaxError,closing parenthesis ']' does not match opening...,closing parenthesis ']' does not match opening...,"File ""results/experiment_outputs/20251004T0504..."


In [11]:
# Drop rows whose processed message is exactly "invalid syntax".
# The column name "precessed_error_message" (sic) is the header used in the loaded data.
# NOTE(review): this filter is applied to the CSV frame only, not to
# failed_cases_df — confirm that is intentional.
failed_cases_df_initial = failed_cases_df_initial[failed_cases_df_initial["precessed_error_message"] != "invalid syntax"]

In [12]:
# Remove the "version none is invalid" rows from both frames.
failed_cases_df_initial = failed_cases_df_initial.loc[
    failed_cases_df_initial["precessed_error_message"].ne("version none is invalid")
]
failed_cases_df = failed_cases_df.loc[
    failed_cases_df["precessed_error_message"].ne("version none is invalid")
]

In [13]:
# Display every processed message remaining in the CSV-sourced frame.
failed_cases_df_initial["precessed_error_message"].to_list()

["unmatched ')'",
 "closing parenthesis ']' does not match opening parenthesis '('",
 'invalid syntax. perhaps you forgot a comma?',
 'too many nested parentheses',
 "closing parenthesis ']' does not match opening parenthesis '('",
 'unterminated string literal',
 "closing parenthesis ')' does not match opening parenthesis '{'",
 "'(' was never closed",
 "invalid character '‚úó'",
 "expected 'except' or 'finally' block",
 'unexpected indent',
 'f-string expression part cannot include a backslash',
 'f-string: invalid syntax',
 'too many nested parentheses',
 'invalid syntax. perhaps you forgot a comma?',
 'cannot assign to literal',
 'unterminated string literal',
 "f-string: single '}' is not allowed",
 "f-string: expecting a valid expression after '{'",
 "f-string: expecting '!', or ':', or '}'",
 'unterminated string literal',
 "f-string: expecting '}'",
 "closing parenthesis ')' does not match opening parenthesis '[' on line 37",
 "'[' was never closed",
 "f-string: expecting a val

In [14]:
# Display every processed message remaining in the Excel-sourced frame.
failed_cases_df["precessed_error_message"].to_list()

["unmatched ')'",
 "closing parenthesis ']' does not match opening parenthesis '('",
 'invalid whitespace in between expression, mathematical operands missing',
 'too many nested parentheses',
 "closing parenthesis ']' does not match opening parenthesis '('",
 'code object added inside tuple',
 'unterminated string literal',
 "closing parenthesis ')' does not match opening parenthesis '{'",
 "'(' was never closed",
 "invalid character '‚úó'",
 "expected 'except' or 'finally' block",
 'unexpected indent',
 'f-string expression part cannot include a backslash',
 "missing closing ']' and ')'",
 'f-string: invalid syntax',
 'except without try block',
 'too many nested parentheses',
 'incomplete in-line ternary operator, bitwise operation on byte object',
 'invalid whitespace in between expression, mathematical operands missing',
 'cannot assign to literal',
 'unterminated string literal',
 "f-string: single '}' is not allowed",
 'two key-value pars inside the dictionary comprehension inst

In [15]:
# Texts to cluster: CSV-sourced messages followed by Excel-sourced messages.
# NOTE(review): the two frames appear to describe the same cases (same file
# hashes in the previews), so most cases contribute two texts — confirm this
# double weighting is intended.
combined = failed_cases_df_initial["precessed_error_message"].to_list() + failed_cases_df["precessed_error_message"].to_list()

In [16]:
# Sentence-embedding model used for every encode() call in this notebook.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [17]:
# Embed the combined messages; one row per text, in the same order as `combined`.
X = model.encode(combined, normalize_embeddings=True)  # already L2 normalized

In [18]:
# Fit KMeans (6 clusters, 50 restarts, fixed seed) on the embeddings and keep
# the per-message cluster ids.
kmeans = KMeans(n_clusters=6, n_init=50, random_state=42).fit(X)
labels = kmeans.labels_

# Clustering quality measured with the cosine-distance silhouette.
score = silhouette_score(X, labels, metric="cosine")

In [19]:
# Frame with the two columns of the message -> cluster-label mapping;
# populated in the next cell.
map_of_labels = pd.DataFrame(columns=["error_message", "label"])

In [20]:
# Build the message -> cluster-label table in a single construction instead of
# growing it with one pd.concat per row. The per-row version was quadratic and,
# because the frame is initialised in a different cell, duplicated every row
# whenever this cell was re-run. Rows stay in `combined` order with a 0..n-1
# index; the label column keeps the integer dtype of `labels`.
map_of_labels = pd.DataFrame({"error_message": combined, "label": labels})
print("silhouette (cosine):", score)

silhouette (cosine): 0.3810822665691376


In [21]:
import joblib

In [23]:
# Persist the embeddings (X), the fitted KMeans model and the per-message
# labels so they can be reloaded and reused on other datasets.
# The relative file names resolve against the current working directory.
joblib.dump(X, "error_vectors.joblib")
joblib.dump(kmeans, "error_kmeans.joblib")
joblib.dump(labels, "error_labels.joblib")

['error_labels.joblib']

In [24]:
# Dataset whose messages are assigned to the clusters learned above.
current_df = pd.read_csv("../dataset/decompiled_syntax_errors.csv") 

In [35]:
# Preview the last rows. The file already carries a `cluster` column, which is
# overwritten further down.
current_df.tail()

Unnamed: 0,file_hash,file,error_message,error_description,error,error_msg_clean,cluster
378,00dbd7e6a4e3cba0b7aebf59ad0b208e091f87cb851a58...,decompiled_exchange.cpython-312.py,unterminated string literal (detected at line ...,"File ""/home/diogenes/pylingual_colaboration/py...",unterminated string literal (detected at line ...,unterminated string literal,Cluster_5
379,02439a0892576359290cda6ecdd77983b95e44c55cea5c...,decompiled_queries.cpython-312.py,unexpected character after line continuation c...,"File ""/home/diogenes/pylingual_colaboration/py...",unexpected character after line continuation c...,unexpected character after line continuation c...,Cluster_4
380,0233993dedeaa9aadddba7824abd009455d1843260b60c...,decompiled_domain.cpython-312.py,invalid syntax. Perhaps you forgot a comma? (l...,"File ""/home/diogenes/pylingual_colaboration/py...",invalid syntax. Perhaps you forgot a comma? (d...,invalid syntax. Perhaps you forgot a comma?,Cluster_1
381,019e8e6a12df417f798bcb4a0bed9b98cfd93d5fb93bb1...,decompiled_constants.cpython-312.py,unterminated string literal (detected at line ...,"File ""/home/diogenes/pylingual_colaboration/py...",unterminated string literal (detected at line ...,unterminated string literal,Cluster_5
382,001709536658c0d399f5f5ae9b6178a0b28b28bddea312...,decompiled_utils.cpython-312.py,"unexpected indent (line 60, col 20)","File ""/home/diogenes/pylingual_colaboration/py...",unexpected indent (decompiled_utils.cpython-31...,unexpected indent,Cluster_0


In [49]:
# Distinct cluster ids present in the KMeans labels (np.unique returns them sorted).
unique_labels = np.unique(labels)

In [51]:
# Holds one mean embedding per cluster (see the loop below).
centroid_list = []

In [54]:
# Start from an empty list every time this cell runs. Relying only on the
# separate `centroid_list = []` cell meant that re-running this loop appended a
# second set of centroids, leaving more centroid rows than `unique_labels` has
# entries and breaking the label lookup that follows.
centroid_list = []
for lab in unique_labels:
    rows = X[labels == lab]       # embeddings of the messages in this cluster
    if rows.shape[0] == 0:
        continue  # defensive only: ids come from np.unique(labels), so every cluster has rows

    # Mean embedding of the cluster, flattened to shape (n_features,).
    centroid = np.asarray(rows.mean(axis=0)).ravel()
    centroid_list.append(centroid)

In [None]:
from sklearn.preprocessing import normalize
from scipy.sparse import vstack
# Now stack as dense 2D array: (k_clusters, n_features)
# (this is NumPy's vstack; the scipy.sparse `vstack` imported above is not used here)
centroids = np.vstack(centroid_list)

# L2-normalize rows so dot product ≈ cosine similarity
centroids = normalize(centroids)  

In [56]:
# Embed the cleaned messages of the new dataset with the same model and the
# same L2 normalization as the training texts.
X_current = model.encode(current_df["error_msg_clean"].to_list(), normalize_embeddings=True)

In [57]:
# Similarity of every new message (rows) to every cluster centroid (columns).
# Both operands were L2-normalized above, so each entry is a cosine similarity.
scores = X_current @ centroids.T

In [58]:
# Column index of the most similar centroid for each message, then the
# cluster id stored at that position in `unique_labels`.
best_idx = np.ravel(np.argmax(scores, axis=1))
assigned_labels = np.take(unique_labels, best_idx)

In [59]:
# Store the nearest-centroid cluster id for every row (positional assignment).
# This replaces the `cluster` column that was read from the CSV.
current_df["cluster"] = assigned_labels

In [61]:
# Derive the display names directly from `assigned_labels` rather than
# re-formatting the column in place: the previous `.apply` on the column
# produced "cluster_cluster_N" if this cell was executed a second time.
# NOTE(review): the input CSV and the Excel sheet names use "Cluster_N"
# (capital C) — confirm which casing downstream consumers expect.
current_df["cluster"] = [f"cluster_{x}" for x in assigned_labels]

In [63]:
# Write the dataset with its new cluster assignment to a separate file
# (index column omitted), leaving the input CSV untouched.
current_df.to_csv("../dataset/decompiled_syntax_errors_clustered.csv", index=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the training embeddings onto their first two principal components.
pca = PCA(n_components=2, random_state=42)
XY = pca.fit_transform(X)

# One scatter series per cluster id, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(7, 6))
uniq = np.unique(labels)
for lab in uniq:
    m = labels == lab
    # A label of -1 gets a cross, every other label a dot.
    marker = "o" if lab != -1 else "x"
    ax.scatter(XY[m, 0], XY[m, 1], s=18, alpha=0.8, label=f"Cluster {lab}", marker=marker)

ax.set_title("Clusters (PCA 2D)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend(loc="best", fontsize=9, frameon=True)
fig.tight_layout()
plt.show()


In [None]:
# Preview the message -> cluster-label table.
map_of_labels.head()

In [36]:
# Define the output file name
# NOTE(review): the name mentions "file_hash", but map_of_labels only has the
# columns error_message and label — confirm the name is still accurate.
output_filename = "../dataset/cluster_messages_with_initial_file_hash.xlsx"

# Write one worksheet per cluster. The number of sheets is taken from the fitted
# model instead of a literal 6, so changing n_clusters in the KMeans cell cannot
# silently drop clusters from (or add empty sheets to) the workbook.
with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    for i in range(kmeans.n_clusters):
        # Rows whose label equals the current cluster id.
        selected_cluster = map_of_labels[map_of_labels["label"] == i]

        # Sheet "Cluster_<id>" receives that cluster's messages, without the index.
        selected_cluster.to_excel(writer, sheet_name=f"Cluster_{i}", index=False)

print(f"All clusters saved to {output_filename}")

All clusters saved to ../dataset/cluster_messages_with_initial_file_hash.xlsx


In [None]:
# from sklearn.neighbors import NearestNeighbors
# import numpy as np
# import matplotlib.pyplot as plt

# # --- Assume 'embeddings' already exists from the previous step ---
# # embeddings = model.encode(texts)

# # 1. Set min_samples
# # Let's start with a value of 5
# min_samples = 5 

# # 2. Calculate nearest neighbor distances
# # n_neighbors should be min_samples
# nn = NearestNeighbors(n_neighbors=min_samples)
# nn.fit(X)
# distances, indices = nn.kneighbors(X)

# # 3. Get the distance to the k-th neighbor (k=min_samples)
# # We sort the k-th distances (the last column of 'distances')
# kth_distances = np.sort(distances[:, min_samples-1], axis=0)

# # 4. Plot the k-distance graph
# plt.plot(kth_distances)
# plt.ylabel(f"Distance to {min_samples}-th Nearest Neighbor (eps)")
# plt.xlabel("Points (sorted by distance)")
# plt.title("K-Distance Graph (Finding the Elbow)")
# plt.grid(True)
# plt.show()

In [None]:
# from sklearn.cluster import DBSCAN

# # --- Assume 'embeddings' and 'texts' exist ---

# # 1. Set parameters from your analysis
# # (We'll use hypothetical values from our "elbow" plot)
# chosen_eps = 0.2 
# chosen_min_samples = 5

# # 2. Initialize and run DBSCAN
# dbscan = DBSCAN(eps=chosen_eps, min_samples=chosen_min_samples)
# dbscan.fit(X)

# # 3. Get the cluster assignments
# cluster_labels = dbscan.labels_

# print(f"Cluster assignments: {cluster_labels}")
# # Example output: [ 0  0 -1  1  1  0]

In [None]:
# # Find the number of unique clusters (excluding noise)
# num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
# print(f"\nFound {num_clusters} clusters and {np.sum(cluster_labels == -1)} noise points.")

# # Print each cluster's texts
# for i in range(num_clusters):
#     print(f"\n--- Cluster {i} ---")
#     for j in range(len(combined)):
#         if cluster_labels[j] == i:
#             print(combined[j])

# # Print the noise points
# print("\n--- Noise Points (Label -1) ---")
# for j in range(len(combined)):
#     if cluster_labels[j] == -1:
#         print(combined[j])

In [None]:
# print(len(cluster_labels))

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA
# import pandas as pd
# import numpy as np

# # --- ASSUMPTIONS ---
# # You must have these variables from your previous steps:
# # embeddings: Your (N_samples, N_features) numpy array of BERT embeddings
# # cluster_labels: The (N_samples,) numpy array of labels from dbscan.fit()

# # 1. Reduce Dimensions with PCA
# print("Running PCA to reduce dimensions for plotting...")
# pca = PCA(n_components=2, random_state=42)
# reduced_embeddings = pca.fit_transform(X)

# # 2. Create a DataFrame for easy plotting
# plot_df = pd.DataFrame({
#     'pca_dim1': reduced_embeddings[:, 0],
#     'pca_dim2': reduced_embeddings[:, 1],
#     'cluster': cluster_labels
# })

# # 3. Separate noise points from clustered points
# noise_df = plot_df[plot_df['cluster'] == -1]
# clustered_df = plot_df[plot_df['cluster'] != -1]

# # 4. Create the scatter plot
# plt.figure(figsize=(12, 8))

# # Plot noise points first (grey, smaller, slightly transparent)
# plt.scatter(noise_df['pca_dim1'], noise_df['pca_dim2'],
#             c='grey',
#             s=10,
#             alpha=0.5,
#             label='Noise (Cluster -1)')

# # Plot the clustered points
# # We use 'cluster' for color and a colormap
# scatter = plt.scatter(clustered_df['pca_dim1'], clustered_df['pca_dim2'],
#                       c=clustered_df['cluster'],
#                       cmap='viridis',  # 'viridis', 'tab20', 'Spectral' are good colormaps
#                       s=25,
#                       alpha=0.8,
#                       label='Clustered Points')

# # 5. Add labels, title, and legend
# plt.title('DBSCAN Clustering Results (Visualized with PCA)')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')

# # Create a legend for the clusters
# # This gets the unique cluster colors and labels them
# unique_clusters = np.unique(clustered_df['cluster'])
# if len(unique_clusters) > 0:
#     handles, labels = scatter.legend_elements()
#     # Add the "Noise" label manually to the legend
#     handles.append(plt.Line2D([0], [0], marker='o', color='w', label='Noise',
#                                markerfacecolor='grey', markersize=5))
#     labels.append('Noise (-1)')
#     plt.legend(handles=handles, labels=labels, title="Cluster")
# else:
#     plt.legend(title="Cluster") # Will just show noise if no clusters found
    
# plt.grid(True)
# # plt.savefig("dbscan_cluster_plot.png")

# # print("Plot saved as 'dbscan_cluster_plot.png'")