# CodeBERT Similarity Calculations
A consolidated notebook to run PSG, SG, and TPU-SG similarity scoring and persist results.

**Sections**
1. Configure Paths and Environment
2. Import Dependencies and Utilities
3. Load Reference File and Discover Candidates
4. Run Similarity Scoring Loop (All Three Scripts)
5. Append and Merge CSV Outputs
6. Quick Summary and Verification

In [None]:
# 1. Configure Paths and Environment
from pathlib import Path
import sys

# Work out which directory this code is running from. A plain script exposes
# ``__file__``; a notebook kernel does not, so the working directory is used.
if "__file__" not in globals():
    # Notebook execution: assume cwd is the repo root
    SCRIPT_DIR = Path.cwd() / "codebertscore-similarity/analyser-results"
    if not SCRIPT_DIR.exists():
        # Fallback to cwd if running from inside analyser-results already
        SCRIPT_DIR = Path.cwd()
else:
    SCRIPT_DIR = Path(__file__).resolve().parent

# Derive project root based on known layout:
#   <root>/codebertscore-similarity/analyser-results  ->  <root>
# Any other location is treated as the project root itself.
PROJECT_ROOT = (
    SCRIPT_DIR.parent.parent
    if SCRIPT_DIR.name == "analyser-results"
    and SCRIPT_DIR.parent.name == "codebertscore-similarity"
    else SCRIPT_DIR
)

CODE_BERT_PACKAGE = PROJECT_ROOT / "codebertscore-similarity"
# Prepended so this directory is searched first on import
# (presumably to pick up `code_bert_score` from the checkout — confirm).
sys.path.insert(0, str(CODE_BERT_PACKAGE))

# Per-script configuration matching the original files.
# Each entry carries:
#   reference      - file every candidate of that script is scored against
#   candidate_glob - pattern expanded relative to CANDIDATE_ROOT
#   output_csv     - where that script's rows are appended
#   lang           - forwarded as `lang=` to code_bert_score.score
CONFIGS = {
    "psg": {
        "reference": PROJECT_ROOT / "codebertscore-similarity/references/TFLite_detection_video.py",
        "candidate_glob": "*/exported_valid_code/psg/*.py",
        "output_csv": PROJECT_ROOT / "codebertscore-similarity/analyser-results/similarity_results_psg.csv",
        "lang": "python",
    },
    "sg": {
        "reference": PROJECT_ROOT / "codebertscore-similarity/references/object_color_classify.ino",
        "candidate_glob": "*/exported_valid_code/sg/*.ino",
        "output_csv": PROJECT_ROOT / "codebertscore-similarity/analyser-results/similarity_results_sg.csv",
        # NOTE(review): the reference is an Arduino `.ino` sketch but is scored
        # with lang "c_sharp" — confirm this is intended before changing it,
        # since it affects every score already persisted for this script.
        "lang": "c_sharp",
    },
    "tpusg": {
        "reference": PROJECT_ROOT / "codebertscore-similarity/references/TFLite_detection_video_TPU.py",
        "candidate_glob": "*/exported_valid_code/tpusg/*.py",
        "output_csv": PROJECT_ROOT / "codebertscore-similarity/analyser-results/similarity_results_tpusg.csv",
        "lang": "python",
    },
}

# Directory whose immediate sub-folders are matched by the leading `*` of each glob.
CANDIDATE_ROOT = PROJECT_ROOT / "data_analysis/2026"

NameError: name '__file__' is not defined

In [None]:
# 2. Import Dependencies and Utilities
import os
from datetime import datetime
import pandas as pd
import torch
import code_bert_score


def load_file_content(file_path: Path):
    """Return the UTF-8 text of *file_path*, or ``None`` when it cannot be read.

    Any exception raised while opening or decoding the file is reported via
    ``print`` and swallowed, so callers must check the result before use.
    An empty file yields ``""`` (falsy, but not ``None``).
    """
    content = None
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            content = handle.read()
    except Exception as e:
        # Best-effort read: log and fall through to the ``None`` result.
        print(f"Error reading {file_path}: {e}")
    return content


def extract_model_name(candidate_path: Path):
    """Return the run-directory name for an exported candidate file.

    The expected layout is ``.../<run>/exported_valid_code/<variant>/file.ext``,
    which makes ``<run>`` the third ancestor of the file.

    Args:
        candidate_path: Path to a candidate source file.

    Returns:
        The name of the third ancestor directory. When the path is too shallow
        to have a *named* third ancestor, the file's stem is returned instead.
    """
    try:
        run_name = candidate_path.parents[2].name
    except IndexError:
        run_name = ""
    # For paths like "/a/b/f.py" or "a/b/f.py" the third ancestor exists but is
    # the root or ".", whose name is "". Treat that like a missing ancestor so
    # the caller never receives an empty model name.
    return run_name or candidate_path.stem


def ensure_header(csv_path: Path, columns):
    """Make sure *csv_path* exists and starts with a header row.

    Parent directories are created as needed. A header-only CSV is written
    when the file is missing or is a zero-byte file; a file that already has
    content is left untouched.

    Args:
        csv_path: Destination CSV file.
        columns: Column names to write as the header row.
    """
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # A zero-byte file (e.g. left by an interrupted write) has no header, and
    # rows are later appended with ``header=False``; without this check the
    # first data row would be read back as the header.
    is_blank_file = csv_path.is_file() and csv_path.stat().st_size == 0
    if is_blank_file or not csv_path.exists():
        pd.DataFrame(columns=columns).to_csv(csv_path, index=False)


def get_processed_ids(csv_path: Path):
    """Return the set of ``Candidate_ID`` values already stored in *csv_path*.

    Values are converted to ``str``. An empty set is returned when the file
    does not exist, has no ``Candidate_ID`` column, or cannot be parsed (the
    parse error is printed rather than raised).
    """
    seen_ids = set()
    if csv_path.exists():
        try:
            frame = pd.read_csv(csv_path)
            if "Candidate_ID" in frame.columns:
                seen_ids = set(frame["Candidate_ID"].astype(str))
        except Exception as e:
            # Unreadable results file: report it and behave as if nothing was processed.
            print(f"Error reading existing CSV {csv_path}: {e}")
    return seen_ids


In [None]:
# 3. Load Reference File and Discover Candidates
# references:     script key -> reference source text
# candidate_sets: script key -> sorted list of candidate file paths
references, candidate_sets = {}, {}

for key, cfg in CONFIGS.items():
    ref_content = load_file_content(cfg["reference"])
    if ref_content:
        references[key] = ref_content
        # The glob is expanded relative to CANDIDATE_ROOT; sorting keeps the
        # processing order stable between runs.
        candidate_files = sorted(CANDIDATE_ROOT.glob(cfg["candidate_glob"]))
        candidate_sets[key] = candidate_files
        print(f"[{key}] reference loaded; candidates: {len(candidate_files)}")
    else:
        # Missing, unreadable or empty reference: the key is left out of both
        # mappings, so no candidates are collected for it.
        print(f"Failed to load reference for {key}: {cfg['reference']}")

# Device selection
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

In [None]:
# 4. Run Similarity Scoring Loop (All Three Scripts)
# results: script key -> DataFrame of the rows appended during THIS run only.
results = {}
# Column order of every per-script CSV. The commented-out names are columns
# that are currently disabled; rows are appended without a header, so this
# list must stay in step with the header already present in each file.
common_columns = [
    "Candidate_ID",
    "Candidate_Path",
    "Model",
    "Precision",
    "Recall",
    "F1",
    "F3",
    # "Reference_File",
    # "Language",
    "Processor",
    # "Timestamp",
]

for key, cfg in CONFIGS.items():
    ensure_header(cfg["output_csv"], common_columns)
    # IDs already present in the CSV are skipped, which makes the cell resumable.
    processed_ids = get_processed_ids(cfg["output_csv"])
    candidate_files = candidate_sets.get(key, [])
    ref_content = references.get(key)
    if not ref_content:
        # Reference was not loaded for this script; nothing to score against.
        continue

    loop_records = []
    try:
        for candidate_path in candidate_files:
            # Project-relative POSIX path doubles as the stable row identifier.
            candidate_id = candidate_path.relative_to(PROJECT_ROOT).as_posix()
            if candidate_id in processed_ids:
                continue

            candidate_content = load_file_content(candidate_path)
            if not candidate_content:
                continue

            try:
                # One candidate per call so a failure only loses that candidate.
                P, R, F1, F3 = code_bert_score.score(
                    cands=[candidate_content],
                    refs=[ref_content],
                    lang=cfg["lang"],
                    device=DEVICE,
                    verbose=False,
                )

                loop_records.append(
                    {
                        "Candidate_ID": candidate_id,
                        "Candidate_Path": candidate_path.as_posix(),
                        "Model": extract_model_name(candidate_path),
                        "Precision": P[0].item(),
                        "Recall": R[0].item(),
                        "F1": F1[0].item(),
                        "F3": F3[0].item(),
                        # "Reference_File": cfg["reference"],
                        # "Language": cfg["lang"],
                        "Processor": key,
                        # "Timestamp": datetime.now().isoformat(),
                    }
                )
            except Exception as e:
                print(f"[{key}] Error processing {candidate_path}: {e}")
    finally:
        # Flush in `finally` so rows scored before an interrupt or an
        # unexpected error are still persisted and skipped on the next run.
        if loop_records:
            # Pin the column order to the header instead of relying on the
            # key order of the record dicts.
            df_loop = pd.DataFrame(loop_records, columns=common_columns)
            df_loop.to_csv(cfg["output_csv"], mode="a", header=False, index=False)
            results[key] = df_loop
            print(f"[{key}] wrote {len(df_loop)} rows to {cfg['output_csv']}")
        else:
            print(f"[{key}] no new rows written")

In [None]:
# 5. Append and Merge CSV Outputs
# Re-read each per-script CSV from disk and stack them into one frame,
# tagging every row with the script key of the file it came from.
merged = []
for key, cfg in CONFIGS.items():
    if not cfg["output_csv"].exists():
        continue
    try:
        df = pd.read_csv(cfg["output_csv"])
        df["run"] = key
        merged.append(df)
    except Exception as e:
        # An unreadable file is reported and left out of the combined frame.
        print(f"[{key}] Error reading {cfg['output_csv']}: {e}")

if not merged:
    combined = pd.DataFrame()
    print("No CSVs to combine yet.")
else:
    combined = pd.concat(merged, ignore_index=True)
    print(f"Combined rows: {len(combined)}")

In [None]:
# 6. Quick Summary and Verification
if combined.empty:
    print("No data to summarize yet.")
else:
    # `display` is not imported in this file; presumably the IPython/Jupyter
    # built-in — this cell is expected to run inside a notebook kernel.
    for preview in (combined.head(), combined.tail()):
        display(preview)
    print("Counts by run:")
    print(combined.groupby("run")["Candidate_ID"].count())
    print("F1 stats:")
    print(combined["F1"].describe())