# RQ3

## Setup

* Follow the instructions in ```Readme.md```
* The classification threshold can be changed in ```src/config.py```
* Inside the *pipeline* folder, run ```python -m src.rq3 --model_name [model_name]```
    * Ex: ```python -m src.rq3 --model_name microsoft/codebert-base```
* To run the default models (without fine-tuning), append ```-default``` to the model name
    * Ex: ```python -m src.rq3 --model_name microsoft/codebert-base-default```
* Tested models: ```'microsoft/codebert-base', 'Salesforce/codet5-base'```
* Supported languages: ```'python', 'java', 'cs', 'c'```
* Results are saved in ```results/RQ3```

## Save dataset for Hugging Face

In [1]:
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from src.config import FINAL_DATASET

# Read the complete list of examples from the JSON file named by FINAL_DATASET
with open(FINAL_DATASET, mode="r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

# Hold out 20% of the examples for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Wrap each split in a Dataset and bundle both into a single DatasetDict
train_dataset, test_dataset = (
    Dataset.from_list(split_records) for split_records in (train_data, test_data)
)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Persist both splits to disk
dataset_dict.save_to_disk("../dataset/kamino_clones_dataset")

 


Saving the dataset (0/1 shards):   0%|          | 0/685 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/172 [00:00<?, ? examples/s]

## Results

In [15]:
import pandas as pd

# Load the clone-detection metrics from the RQ3 results folder
csv_path = "../results/RQ3/clone_detection.csv"
df = pd.read_csv(csv_path)


# --- Display options ---
THRESHOLD = 0.7        # set to None to disable filtering
SORT_BY = "f1"         # precision | recall | f1 | None
DESCENDING = True
THRESHOLD_TOL = 1e-9   # tolerance used when matching THRESHOLD against the CSV values


if THRESHOLD is not None:
    # Compare with a tolerance instead of ==: a threshold produced by float
    # arithmetic (e.g. 0.1 * 7) is stored as 0.7000000000000001 and would be
    # silently dropped by an exact equality test. Exact matches still match.
    df = df[(df["threshold"] - THRESHOLD).abs() <= THRESHOLD_TOL]


if SORT_BY is not None:
    df = df.sort_values(by=SORT_BY, ascending=not DESCENDING)

# Show only the reporting columns, rounded for readability, with a fresh 0..n-1 index
display(
    df[[
        "model",
        "dataset",
        "lan",
        "pairs",
        "threshold",
        "precision",
        "recall",
        "f1",
    ]].round(4).reset_index(drop=True)
)


Unnamed: 0,model,dataset,lan,pairs,threshold,precision,recall,f1
0,microsoft/codebert-base,GPTCloneBench,csharp,9816,0.7,0.9951,0.8667,0.9265
1,microsoft/codebert-base,Kamino,python,19850,0.7,0.9847,0.8714,0.9246
2,Salesforce/codet5-base,GPTCloneBench,csharp,9816,0.7,0.99,0.8667,0.9243
3,Salesforce/codet5-base,Kamino,python,19850,0.7,0.9825,0.8222,0.8952
4,microsoft/codebert-base,GPTCloneBench,java,14192,0.7,0.974,0.8178,0.8891
5,microsoft/codebert-base,GPTCloneBench,python,5640,0.7,0.981,0.8057,0.8847
6,Salesforce/codet5-base,GPTCloneBench,python,5640,0.7,0.9224,0.8092,0.8621
7,Salesforce/codet5-base,GPTCloneBench,java,14192,0.7,0.9468,0.7852,0.8585
8,Salesforce/codet5-base-default,GPTCloneBench,csharp,9816,0.7,0.6902,0.8329,0.7549
9,Salesforce/codet5-base-default,GPTCloneBench,java,14192,0.7,0.648,0.7854,0.7101


In [1]:
import pandas as pd
import os

# --- Load CSV ---
# Reads the RQ3 clone-detection metrics; the rest of this cell works on `df`.
# NOTE(review): the recorded output of this cell is a pandas ParserError
# ("Expected 8 fields in line 50, saw 9"), i.e. at least one CSV row has more
# fields than the parser expected. Presumably rows were appended with a
# different column layout — TODO confirm and repair/regenerate the CSV
# before re-running.
csv_path = "../results/RQ3/clone_detection.csv"
df = pd.read_csv(csv_path)

# --- Simplify and classify models ---
def simplify_model(model):
    """Reduce a model identifier to a short name plus a variant tag.

    The text after the last "/" is kept, every "-base" occurrence is removed,
    and a "-default" marker is stripped when present.

    Args:
        model: Model identifier string, e.g. "microsoft/codebert-base".

    Returns:
        A ``(short_name, tag)`` tuple where ``tag`` is "d" if the name
        contained "-default" and "f" otherwise.
    """
    short_name = model.rsplit("/", 1)[-1].replace("-base", "")
    is_default = "-default" in short_name
    if is_default:
        short_name = short_name.replace("-default", "")
    return short_name, ("d" if is_default else "f")

# Derive the short model name and the d/f variant tag from the full model id
model_bases, variant_tags = zip(*df["model"].apply(simplify_model))
df["model_base"] = model_bases
df["type"] = variant_tags

# Order rows by model, then threshold, then variant
df = df.sort_values(by=["model_base", "threshold", "type"])

# --- Compare and bold best values ---
def bold_best(x, y):
    """Format ``x`` with three decimals, in LaTeX bold when it beats ``y``.

    Only a strictly larger ``x`` is bolded, so tied values stay plain.
    """
    if not x > y:
        return format(x, ".3f")
    return "\\textbf{%s}" % format(x, ".3f")

def format_val(x):
    """Return ``x`` rendered as a fixed-point string with three decimals."""
    return format(x, ".3f")

# Prepare LaTeX table
# Opening part of the table: float environment, caption, a six-column tabular
# (model, threshold, type, precision, recall, F1) and the bold header row.
# The data rows are appended to this string further down in the cell.
# NOTE(review): the appended rows use \multirow, which presumably requires
# \usepackage{multirow} in the including document — confirm.
latex_table = r"""\begin{table}[tb]
\centering
\footnotesize
\caption{Performance metrics per model and threshold.}
\begin{tabular}{lccrrr}
\hline
\textbf{Model} & \textbf{$\theta$} & \textbf{Type} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} \\
\hline
"""

# Group by model_base
# For every model, write one table line per CSV row, threshold by threshold.
# When a threshold has exactly one default ("d") and one finetuned ("f") row,
# the two are compared and the better metric of each pair is bolded.
for model_name, group in df.groupby("model_base"):
    group = group.sort_values("threshold")
    thresholds = sorted(group["threshold"].unique())

    # One line is written per row of this model, so the \multirow span is the
    # group's row count. Deriving it from the number of thresholds is only
    # right when every threshold contributes the same number of rows.
    # The cell is consumed by the first line written and blanked afterwards.
    model_cell = f"\\multirow{{{len(group)}}}{{*}}{{{model_name}}}"

    for theta in thresholds:
        subset = group[group["threshold"] == theta]
        d_rows = subset[subset["type"] == "d"]
        f_rows = subset[subset["type"] == "f"]

        # Checking the types explicitly (rather than len(subset) == 2) avoids
        # an IndexError when two rows of the same type share a threshold.
        if len(d_rows) == 1 and len(f_rows) == 1:  # both default and finetuned exist
            d_row = d_rows.iloc[0]
            f_row = f_rows.iloc[0]

            # Bold the best metrics between default and finetuned
            p_d = bold_best(d_row["precision"], f_row["precision"])
            p_f = bold_best(f_row["precision"], d_row["precision"])

            r_d = bold_best(d_row["recall"], f_row["recall"])
            r_f = bold_best(f_row["recall"], d_row["recall"])

            f1_d = bold_best(d_row["f1"], f_row["f1"])
            f1_f = bold_best(f_row["f1"], d_row["f1"])

            latex_table += f"{model_cell} & {theta:.1f} & \\textit{{d}} & {p_d} & {r_d} & {f1_d} \\\\\n"
            latex_table += f"& & \\textit{{f}} & {p_f} & {r_f} & {f1_f} \\\\\n"
            model_cell = ""

        else:
            # A single variant, or several rows per variant (the CSV can hold
            # more than one dataset/language per model and threshold): there is
            # no unambiguous pair to compare, so every row is written unbolded
            # instead of silently keeping only the first one.
            for _, row in subset.iterrows():
                latex_table += f"{model_cell} & {theta:.1f} & \\textit{{{row['type']}}} & {format_val(row['precision'])} & {format_val(row['recall'])} & {format_val(row['f1'])} \\\\\n"
                model_cell = ""

# Closing part of the table: final rule, end of the tabular, the label
# (tab:RQ3) used for cross-references, and end of the table float.
latex_table += r"""\hline
\end{tabular}
\label{tab:RQ3}
\end{table}"""

# --- Save LaTeX table to file ---
output_path = "../results/RQ3/model_metrics_table.tex"
# Create the results folder if it does not exist yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Explicit encoding: the default for open() depends on the platform locale,
# and the dataset JSON in this notebook is likewise opened as UTF-8.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(latex_table)

# Echo the table so it can be copied straight from the cell output
print(latex_table)


ParserError: Error tokenizing data. C error: Expected 8 fields in line 50, saw 9


In [2]:
from datasets import load_from_disk

# Read the saved dataset back from its local folder
ds = load_from_disk("../dataset/kamino_clones_dataset")

# Printing the object lists its splits with their features and row counts
print(ds)

# Report sizes: a single total for a plain Dataset,
# one line per split for a dict-like DatasetDict
if not isinstance(ds, dict):
    print("Total elements:", len(ds))
else:
    for split_name, split_ds in ds.items():
        print(split_name, len(split_ds))


DatasetDict({
    train: Dataset({
        features: ['id', 'language', 'original_code', 'test', 'description', 'metadata', 'clones'],
        num_rows: 685
    })
    test: Dataset({
        features: ['id', 'language', 'original_code', 'test', 'description', 'metadata', 'clones'],
        num_rows: 172
    })
})
train 685
test 172
