# RQ3

## Setup

* Follow the instructions in ```Readme.md```
* The classification threshold can be changed in ```src/config.py```
* Inside the *pipeline* folder, run ```python -m src.rq3``` to test all models, or ```python -m src.rq3 --model_name <model>``` to test a specific model
    * Ex: ```python -m src.rq3 --model_name microsoft/codebert-base```
* To run the default models (without finetuning), append ```-default``` to the model name
    * Ex: ```python -m src.rq3 --model_name microsoft/codebert-base-default```
* Tested models: ```'microsoft/codebert-base', 'Salesforce/codet5-base'```
* Supported languages: ```'python', 'java', 'cs', 'c'```
* Results are saved in ```results/RQ3```

## Save dataset for huggingface

In [1]:
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from src.config import FINAL_DATASET

# Read the whole dataset (one JSON list of records) into memory
with open(FINAL_DATASET, "r", encoding="utf-8") as dataset_file:
    data = json.loads(dataset_file.read())

# 80/20 split with a fixed seed so the partition is reproducible
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Wrap each partition in a Hugging Face Dataset
train_dataset, test_dataset = (
    Dataset.from_list(part) for part in (train_data, test_data)
)

# Bundle both splits under their conventional names and persist them
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})
dataset_dict.save_to_disk("../dataset/kamino_clones_dataset")

 


Saving the dataset (0/1 shards):   0%|          | 0/685 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/172 [00:00<?, ? examples/s]

## Results

In [14]:
import pandas as pd

# Load the clone-detection metrics written to results/RQ3
csv_path = "../results/RQ3/clone_detection.csv"
df = pd.read_csv(csv_path)


THRESHOLD = 0.7        # set to None to disable filtering
SORT_BY = "f1"         # precision | recall | f1 | None
DESCENDING = True
THRESHOLD_TOL = 1e-9   # tolerance used when matching THRESHOLD against the CSV values


if THRESHOLD is not None:
    # Match with a small tolerance instead of `==`: a threshold that was
    # produced by float arithmetic (e.g. 0.7000000000000001) would otherwise
    # be silently dropped by an exact comparison.
    df = df[(df["threshold"] - THRESHOLD).abs() <= THRESHOLD_TOL]


if SORT_BY is not None:
    df = df.sort_values(by=SORT_BY, ascending=not DESCENDING)

# Show only the reporting columns, rounded to 4 decimals, with a clean 0..n-1 index
display(
    df[[
        "model",
        "dataset",
        "lan",
        "pairs",
        "threshold",
        "precision",
        "recall",
        "f1",
    ]].round(4).reset_index(drop=True)
)


Unnamed: 0,model,dataset,lan,pairs,threshold,precision,recall,f1
0,microsoft/codebert-base,GPTCloneBench,csharp,9816,0.7,0.9951,0.8667,0.9265
1,microsoft/codebert-base,Kamino,python,19850,0.7,0.9847,0.8714,0.9246
2,Salesforce/codet5-base,GPTCloneBench,csharp,9816,0.7,0.99,0.8667,0.9243
3,Salesforce/codet5-base,Kamino,python,19850,0.7,0.9825,0.8222,0.8952
4,microsoft/codebert-base,GPTCloneBench,java,14192,0.7,0.974,0.8178,0.8891
5,microsoft/codebert-base,GPTCloneBench,python,5640,0.7,0.981,0.8057,0.8847
6,Salesforce/codet5-base,GPTCloneBench,python,5640,0.7,0.9224,0.8092,0.8621
7,Salesforce/codet5-base,GPTCloneBench,java,14192,0.7,0.9468,0.7852,0.8585
8,Salesforce/codet5-base-default,GPTCloneBench,csharp,9816,0.7,0.6902,0.8329,0.7549
9,Salesforce/codet5-base-default,GPTCloneBench,java,14192,0.7,0.648,0.7854,0.7101


In [None]:
import pandas as pd

# Load CSV (csv_path is defined in the previous results cell)
df = pd.read_csv(csv_path)
# NOTE(review): rows are not filtered by threshold here. If the CSV holds several
# thresholds per (model, dataset, lan), the first matching row after sorting is
# the one reported -- confirm this is intended.

# Add type column: a "-default" suffix in the model name marks the non-finetuned run
df['type'] = df['model'].apply(lambda x: 'pretrained' if '-default' in x else 'finetuned')

# Simplify model names (drop the organisation prefix and the -default / -base suffixes)
df['base_model'] = df['model'].apply(lambda x: x.split('/')[-1].replace('-default','').replace('-base',''))

# Sort by dataset, pairs descending
df = df.sort_values(by=['dataset','pairs'], ascending=[True, False])

# Column definition
col_def = "L{1.2cm}L{1.5cm}L{1cm}R{1.2cm}|R{0.8cm}R{0.8cm}R{0.8cm}R{0.8cm}|R{0.8cm}R{0.8cm}R{0.8cm}R{0.8cm}"


def _metric_cells(rows):
    """Format the Prec./Rec./F1/MCC cells for one side of a table row.

    Args:
        rows: DataFrame slice with the results of one
            (model, dataset, language, type) combination; may be empty.

    Returns:
        Four " & "-joined LaTeX cells. The first row of ``rows`` supplies
        precision, recall and F1 (two decimals); MCC is always "N/A".
        An empty slice yields "N/A" in all four cells.
    """
    if rows.empty:
        return "N/A & N/A & N/A & N/A"
    row = rows.iloc[0]
    return f"{row['precision']:.2f} & {row['recall']:.2f} & {row['f1']:.2f} & N/A"


print("\\begin{table}[ht]")
print("\\footnotesize")
print("\\addtolength{\\tabcolsep}{-2pt}")
print("\\caption{Results for RQ3}")
print(f"\\begin{{tabular}}{{{col_def}}}")
print("\\toprule")
# Header
print("\\multirow{2}{*}{\\textbf{Model}} & \\multirow{2}{*}{\\textbf{Dataset}} & \\multirow{2}{*}{\\textbf{Lang}} & \\multirow{2}{*}{\\textbf{Pairs}} & " +
      "\\multicolumn{4}{c|}{\\textbf{Pretrained}} & \\multicolumn{4}{c}{\\textbf{Finetuned}} \\\\")
print("& & & & \\textbf{Prec.} & \\textbf{Rec.} & \\textbf{F1} & \\textbf{MCC} & \\textbf{Prec.} & \\textbf{Rec.} & \\textbf{F1} & \\textbf{MCC} \\\\")
print("\\midrule")

# Loop through models
base_models = df['base_model'].unique()
for bm_idx, bm in enumerate(base_models):
    df_bm = df[df['base_model']==bm]
    # One table row is printed per (dataset, language) pair, so the model cell
    # must span that many rows. Counting unique languages alone under-counts
    # when the same language appears in more than one dataset.
    model_rows = len(df_bm[['dataset', 'lan']].drop_duplicates())

    model_first = True
    for ds in df_bm['dataset'].unique():
        df_ds = df_bm[df_bm['dataset']==ds]
        dataset_rows = len(df_ds['lan'].unique())
        dataset_first = True

        for lang in df_ds['lan'].unique():
            df_lang = df_ds[df_ds['lan']==lang]
            pre_rows = df_lang[df_lang['type']=='pretrained']
            fin_rows = df_lang[df_lang['type']=='finetuned']

            # Pairs come from the pretrained row when there is one; otherwise
            # fall back to whichever row exists for this language.
            pairs_source = pre_rows if not pre_rows.empty else df_lang
            pairs = pairs_source.iloc[0]['pairs']

            model_cell = f"\\multirow{{{model_rows}}}{{*}}{{{bm}}}" if model_first else ""
            dataset_cell = f"\\multirow{{{dataset_rows}}}{{*}}{{{ds}}}" if dataset_first else ""

            # A missing pretrained or finetuned result prints as N/A cells
            # instead of aborting the whole table with an IndexError.
            line = f"{model_cell} & {dataset_cell} & {lang} & {pairs} & " \
                   f"{_metric_cells(pre_rows)} & " \
                   f"{_metric_cells(fin_rows)} \\\\"
            print(line)

            model_first = False
            dataset_first = False
    # Separate models with a rule, but do not emit one right before \bottomrule
    if bm_idx < len(base_models) - 1:
        print("\\midrule")

print("\\bottomrule")
print("\\end{tabular}")
print("\\label{tab:rq3Results}")
print("\\end{table}")


\begin{table}[ht]
\footnotesize
\caption{Comparison of Pretrained and Finetuned Models}
\begin{tabular}{L{2cm}|L{2cm}|L{1.5cm}|R{1.5cm}|R{1cm}R{1cm}R{1cm}R{1cm}|R{1cm}R{1cm}R{1cm}R{1cm}}
\toprule
\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Lang}} & \multirow{2}{*}{\textbf{Pairs}} & \multicolumn{4}{c|}{\textbf{Pretrained}} & \multicolumn{4}{c}{\textbf{Finetuned}} \\
& & & & \textbf{Prec.} & \textbf{Rec.} & \textbf{F1} & \textbf{MCC} & \textbf{Prec.} & \textbf{Rec.} & \textbf{F1} & \textbf{MCC} \\
\midrule
\multirow{4}{*}{codet5} & \multirow{4}{*}{GPTCloneBench} & java & 14192 & 0.65 & 0.79 & 0.71 & N/A & 0.95 & 0.79 & 0.86 & N/A \\
 &  & csharp & 9816 & 0.69 & 0.83 & 0.75 & N/A & 0.99 & 0.87 & 0.92 & N/A \\
 &  & c & 6890 & 0.72 & 0.58 & 0.65 & N/A & 0.93 & 0.49 & 0.65 & N/A \\
 &  & python & 5640 & 0.60 & 0.68 & 0.64 & N/A & 0.92 & 0.81 & 0.86 & N/A \\
 & \multirow{1}{*}{Kamino} & python & 19850 & 0.71 & 0.72 & 0.72 & N/A & 0.98 & 0.72 

In [2]:
from datasets import load_from_disk

# Read back the dataset that was saved to the local folder
ds = load_from_disk("../dataset/kamino_clones_dataset")

# Print the object's summary (for a DatasetDict this lists every split)
print(ds)

# Report sizes: one total for a single Dataset, one line per split otherwise
if not isinstance(ds, dict):
    print("Total elements:", len(ds))
else:
    for split, subset in ds.items():
        print(split, len(subset))


DatasetDict({
    train: Dataset({
        features: ['id', 'language', 'original_code', 'test', 'description', 'metadata', 'clones'],
        num_rows: 685
    })
    test: Dataset({
        features: ['id', 'language', 'original_code', 'test', 'description', 'metadata', 'clones'],
        num_rows: 172
    })
})
train 685
test 172
