Parse failures

This report shows different stack traces from the projects organized into clusters based on similarities. Each cluster represents a group of related themes. The legend provides a representative sample from each cluster to help understand the overall theme within the stack traces.
_Note_: The stack-trace representations are projected into 2D for visualization purposes only, which is why some clusters might look spread out.

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import Dict
import torch
from transformers import AutoModel
from moderne_pkg.clustering_src.constants import SEPARATOR_TOKEN, CLS_TOKEN
from moderne_pkg.clustering_src.utils import pool_and_normalize
from moderne_pkg.clustering_src.datasets_loader import prepare_tokenizer
from moderne_pkg.clustering_src.preprocessing_utils import truncate_sentences
from abc import ABC, abstractmethod
import warnings
import moderne_pkg.helpers as helpers

# Plot styling shared by every figure in this notebook
plt.style.use("ggplot")
plt.rcParams["figure.dpi"] = 600

# Silence library warnings so they do not clutter the rendered report
warnings.simplefilter("ignore")

args = helpers.get_notebook_args_from_env()

# read data table file
file_name = args["filteredDataTableFileName"]
df = pd.read_csv(file_name, on_bad_lines="skip", skip_blank_lines=True)

# Let's clean the data to prevent mixed types in stackTrace column.
# dropna returns a new frame, so the result must be assigned; otherwise the
# missing values survive and astype(str) turns them into the literal "nan".
# The index is reset so positional and label-based lookups agree afterwards.
df = df.dropna(subset=["stackTrace"]).reset_index(drop=True)
df["stackTrace"] = df["stackTrace"].astype(str)

def _show_message_figure(message):
    """Render a figure that contains only ``message``, centred in a grey box."""
    fig, ax = plt.subplots()
    ax.text(
        0.5,
        0.5,
        message,
        ha="center",
        va="center",
        fontsize=16,
        bbox=dict(facecolor="lightgray", edgecolor="black"),
    )
    plt.show()


# Exit early if there are no stack traces and render a plot with a message
if len(df) == 0:
    _show_message_figure("No stack traces found")

# We need more than one stack trace
elif len(df) == 1:
    # .iloc[0] picks the only row by position, so it works whatever the index
    # labels are (a label lookup with [0] fails if label 0 is absent).
    _show_message_figure(
        f'Only 1 stack trace found:\n{df["stackTrace"].iloc[0][:80]}'
    )

else:
    # Encoder configuration: the model is run on the CPU
    DEVICE = "cpu"

    # Input length budget, derived from the data: the 90th percentile of the
    # stack-trace string lengths
    trace_lengths = df["stackTrace"].str.len()
    MAX_INPUT_LEN = int(trace_lengths.quantile(0.9))

    # Token budget, clamped to the range [10, 512].
    # NOTE(review): the unclamped value is MAX_INPUT_LEN divided by the number
    # of rows, times 100 - so it shrinks as the table grows. Confirm this is
    # the intended scaling.
    scaled_token_budget = int(MAX_INPUT_LEN / len(df) * 100)
    MAX_TOKEN_LEN = max(10, min(scaled_token_budget, 512))

    def set_device(
        inputs: Dict[str, torch.Tensor], device: str
    ) -> Dict[str, torch.Tensor]:
        """Return a new dict with every tensor of ``inputs`` moved to ``device``.

        The keys are preserved; ``inputs`` itself is not modified.
        """
        return {name: tensor.to(device) for name, tensor in inputs.items()}

    class BaseEncoder(torch.nn.Module, ABC):
        """Common scaffolding for encoders built on a pretrained model.

        Loads the tokenizer and the encoder for ``model_name`` and offers a
        batched :meth:`encode`. Subclasses supply :meth:`forward`, which turns
        one batch of sentences into a tensor of embeddings.
        """

        def __init__(self, device, max_input_len, maximum_token_len, model_name):
            """Load tokenizer and model and store the length budgets.

            Args:
                device: Device the encoder is moved to (e.g. ``"cpu"``).
                max_input_len: Limit handed to ``truncate_sentences`` in
                    :meth:`encode`.
                maximum_token_len: Token limit stored for subclasses to use
                    when tokenizing.
                model_name: Identifier passed to ``prepare_tokenizer`` and
                    ``AutoModel.from_pretrained``.
            """
            super().__init__()

            self.model_name = model_name
            self.tokenizer = prepare_tokenizer(model_name)
            # Honour the requested device rather than a module-level default,
            # so the model lives where self.device says it does.
            self.encoder = AutoModel.from_pretrained(model_name).to(device).eval()
            self.device = device
            self.max_input_len = max_input_len
            self.maximum_token_len = maximum_token_len

        @abstractmethod
        def forward(self, input_sentences):
            """Embed one batch of sentences.

            :meth:`encode` concatenates the returned tensors along the first
            dimension and treats each row as one sentence's embedding.
            """

        def encode(self, input_sentences, batch_size=64, **kwargs):
            """Embed ``input_sentences`` in batches without tracking gradients.

            Args:
                input_sentences: Sentences to embed; they are first passed
                    through ``truncate_sentences`` with ``self.max_input_len``.
                batch_size: Number of sentences per :meth:`forward` call.
                **kwargs: Accepted for call compatibility; not used.

            Returns:
                A list with one NumPy array per row of the concatenated batch
                outputs (empty when there is nothing to embed).

            Raises:
                ValueError: If ``batch_size`` is not positive.
            """
            if batch_size <= 0:
                raise ValueError("batch_size must be a positive integer")

            truncated_input_sentences = truncate_sentences(
                input_sentences, self.max_input_len
            )

            embedding_batch_list = []
            with torch.no_grad():
                for start_idx in range(0, len(truncated_input_sentences), batch_size):
                    batch = truncated_input_sentences[
                        start_idx : start_idx + batch_size
                    ]
                    # Keep results on the CPU so they can be turned into NumPy
                    embedding_batch_list.append(self.forward(batch).detach().cpu())

            if not embedding_batch_list:
                return []

            input_sentences_embedding = torch.cat(embedding_batch_list)
            return [emb.squeeze().numpy() for emb in input_sentences_embedding]

    class BigCodeEncoder(BaseEncoder):
        """Encoder that uses the ``moderne_pkg/starencoder`` checkpoint."""

        def __init__(self, device, max_input_len, maximum_token_len):
            """Forward the settings to ``BaseEncoder`` with the fixed model name."""
            super().__init__(
                device,
                max_input_len,
                maximum_token_len,
                model_name="moderne_pkg/starencoder",
            )

        def forward(self, input_sentences):
            """Tokenize, encode and pool one batch of sentences.

            Each sentence is wrapped as ``CLS_TOKEN + sentence + SEPARATOR_TOKEN``,
            tokenized with padding to the longest item and truncation at
            ``self.maximum_token_len``, then run through the encoder. The last
            hidden state is reduced with ``pool_and_normalize`` using the
            attention mask.
            """
            wrapped_sentences = [
                f"{CLS_TOKEN}{sentence}{SEPARATOR_TOKEN}"
                for sentence in input_sentences
            ]
            inputs = self.tokenizer(
                wrapped_sentences,
                padding="longest",
                max_length=self.maximum_token_len,
                truncation=True,
                return_tensors="pt",
            )

            # Move the inputs once and reuse them, so the attention mask used
            # for pooling sits on the same device as the hidden states.
            device_inputs = set_device(inputs, self.device)
            outputs = self.encoder(**device_inputs)

            return pool_and_normalize(
                outputs.hidden_states[-1], device_inputs["attention_mask"]
            )

    # Build the encoder with the budgets computed above
    bigcode_model = BigCodeEncoder(DEVICE, MAX_INPUT_LEN, MAX_TOKEN_LEN)

    # Embed every stack trace and keep the vectors next to their source rows
    df["embeddings"] = bigcode_model.encode(df["stackTrace"].tolist())
    embdz = df["embeddings"]
    stacktraces = df["stackTrace"]
    # Stack the per-row vectors into one array for scikit-learn
    embds = np.array(list(embdz))

    # Perplexity is capped at 30 and kept below the number of samples
    perplexity = max(0, min(len(embds) - 1, 30))

    # Project the embeddings to 2D; this is used for plotting only
    tsne = TSNE(n_components=2, random_state=0, n_iter=250, perplexity=perplexity)
    two_d_visualization = np.array(tsne.fit_transform(embds), dtype="float")

    # find best k
    potential_k = [2, 3, 5, 7, 8, 11]
    n_samples = len(embds)

    # On larger inputs the silhouette is computed on a 20% subsample
    sample_size = int(n_samples * 0.2) if n_samples >= 100 else None

    # Candidates that cannot be scored get -inf, so they never beat a real
    # score (silhouette values may be negative). If nothing can be scored the
    # first candidate, k=2, wins; this branch always has at least two samples.
    unscorable = float("-inf")
    sil = []
    labels_by_k = {}

    for k in potential_k:
        if n_samples < k:
            sil.append(unscorable)
            continue

        labels = (
            KMeans(n_clusters=k, random_state=0, n_init="auto", algorithm="elkan")
            .fit(embds)
            .labels_
        )
        labels_by_k[k] = labels

        # The silhouette is only defined for 2 <= n_labels <= n_samples - 1:
        # a single cluster has no dissimilarity, and one cluster per sample
        # makes silhouette_score raise.
        n_found = len(np.unique(labels))
        if 2 <= n_found < n_samples:
            try:
                score = silhouette_score(
                    embds,
                    labels,
                    metric="euclidean",
                    sample_size=sample_size,
                    random_state=0,  # reproducible subsample
                )
            except ValueError:
                # e.g. the subsample happened to contain a single label
                score = unscorable
            sil.append(score)
        else:
            sil.append(unscorable)

    best_k = potential_k[sil.index(max(sil))]

    # Reuse the labels already fitted for the winning k; the fit is seeded, so
    # refitting with the same parameters would give the same assignment.
    kmeans = np.array(labels_by_k[best_k], dtype="float")

    # Split the 2D projection into its two plotting axes
    x, y = two_d_visualization[:, 0], two_d_visualization[:, 1]

    # One colour/marker combination per cluster label
    num_labels = len(set(kmeans))
    colors = helpers.get_moderne_qualitative_palette(500)
    marker_styles = ["o", "s", "D", "^", "x", "*", "+", "."]

    # Draw each cluster as its own scatter series so it can get a legend entry
    scatter_plots = []
    for cluster_id in range(num_labels):
        in_cluster = kmeans == cluster_id
        scatter_plots.append(
            plt.scatter(
                x[in_cluster],
                y[in_cluster],
                c=colors[cluster_id % 5],
                marker=marker_styles[cluster_id % len(marker_styles)],
                label=str(cluster_id),
            )
        )

    # For every non-empty cluster, remember its label, a sample stack trace
    # (first line, at most 80 characters) and how many traces it contains
    cluster_labels = {}
    cluster_sizes = {}
    for cluster in range(num_labels):
        mask = kmeans == cluster
        if np.any(mask):
            sample_stack_trace = df["stackTrace"][mask].iloc[0].split("\n")[0][:80]
            cluster_labels[cluster] = (cluster, sample_stack_trace)
            cluster_sizes[cluster] = int(np.count_nonzero(mask))

    handles = []
    labels = []

    # Create the legend handles and labels. Only clusters recorded above are
    # visited, so a cluster skipped by the guard cannot trigger a KeyError.
    for cluster, (cluster_label, sample_stack_trace) in cluster_labels.items():
        handles.append(scatter_plots[cluster_label])
        labels.append(f"{sample_stack_trace} (count: {cluster_sizes[cluster]})")

    # Create the legend outside of the plot
    plt.legend(
        handles,
        labels,
        loc="upper left",
        bbox_to_anchor=(-0.1, -0.1),
        fontsize="small",
        title="Sample from cluster",
    )

    # Displaying the plot
    plt.show()