In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
import torch
import gc
import os

import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Tuple, List, Optional

# --- Experiment configuration ---

RANDOM_SEED = 42   # seed for the train/test split and the balanced sampling
N_FOLDS = 5        # number of cross-validation folds
EPOCHS = 3         # fine-tuning epochs per fold

# Cosine-similarity decision thresholds evaluated for every fold
THRESHOLDS = [
    0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65,
    0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
]

# True  -> stratified 85% / 15% split of the whole de-duplicated dataset.
# False -> a small balanced sample is the CV pool and all other rows are OOS.
FULL_TEST = False

# Rows drawn per class for the CV pool when FULL_TEST is False
N_SAMPLES_PER_CLASS = 500

# Expose only GPUs 0 and 1 to CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"


In [None]:
# Drop objects left over from a previous run so their memory can be reclaimed
objects_to_delete = ['model', 'train_loss', 'evaluator', 'train_dataloader', 'trainer']

for stale_name in objects_to_delete:
    # Names that were never defined are skipped silently
    globals().pop(stale_name, None)

# Run Python's garbage collector
gc.collect()

# Release cached blocks held by PyTorch's CUDA allocator
torch.cuda.empty_cache()

# Report how much GPU memory PyTorch still holds
bytes_per_gib = 1024**3
print(f"Allocated: {torch.cuda.memory_allocated() / bytes_per_gib:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / bytes_per_gib:.2f} GB")



# --- 1. Data preparation ---

In [None]:
# Load the merged dataset; later cells read its 'polish_label', 'name_variant'
# and 'truth' columns.
df_full = pd.read_parquet("df_merged.parquet")


In [None]:
# A row counts as a duplicate when its (polish_label, name_variant) text pair
# has already been seen: that pair is the model input.
duplicate_subset = ['polish_label', 'name_variant']

# Keep the first occurrence of each pair, then renumber the rows 0..n-1 so that
# index labels coincide with row positions (the split and KFold cells index by position).
df_full_clean = (
    df_full
    .drop_duplicates(subset=duplicate_subset, keep='first')
    .reset_index(drop=True)
)

n_rows_clean = len(df_full_clean)
print(f"Number of rows after removing duplicates: {n_rows_clean}")
print(f"Removed {len(df_full) - n_rows_clean} duplicates.")


In [None]:

if FULL_TEST:
    # Full test: standard stratified 85% / 15% split.
    df_cv_pool, df_oos = train_test_split(
        df_full_clean,
        test_size=0.15,
        random_state=RANDOM_SEED,
        stratify=df_full_clean['truth'],
    )
else:
    # Small-sample mode: draw a balanced sample of N_SAMPLES_PER_CLASS rows per
    # class as the CV pool; every remaining row becomes the OOS evaluation pool.
    # The RNG seed and the order of the two draws (positives, then negatives)
    # are kept fixed so the split is the same on every run.
    y_full = df_full_clean["truth"].astype(int).values

    rng = np.random.default_rng(RANDOM_SEED)

    pos_idx = np.where(y_full == 1)[0]
    neg_idx = np.where(y_full == 0)[0]
    # Never ask for more rows than the smaller class can provide
    n = min(N_SAMPLES_PER_CLASS, len(pos_idx), len(neg_idx))

    sample_idx = np.concatenate([
        rng.choice(pos_idx, size=n, replace=False),
        rng.choice(neg_idx, size=n, replace=False),
    ])

    # sample_idx holds row *positions* (it comes from np.where on the values
    # array), so select with a positional boolean mask rather than matching
    # against index labels.
    in_sample = np.isin(np.arange(len(df_full_clean)), sample_idx)
    df_cv_pool = df_full_clean[in_sample]
    df_oos = df_full_clean[~in_sample]



print(f"Dane do Cross-Validation: {len(df_cv_pool)}")
print(f"Dane do OOS Evaluation: {len(df_oos)}")


In [None]:
def compute_metrics(model, df, thresholds, batch_size: int = 32):
    """
    Score every (polish_label, name_variant) pair of `df` with `model` and
    compute classification metrics at each decision threshold.

    Args:
        model: Object exposing `encode(list_of_str, convert_to_tensor=True, ...)`
            that returns one embedding row per input sentence.
        df: DataFrame with the columns 'polish_label', 'name_variant' and 'truth'.
            `truth` is interpreted by truthiness (bool or 0/1).
        thresholds: Iterable of cosine-similarity cut-offs; a pair is predicted
            positive when its similarity is >= the threshold.
        batch_size: Encoding batch size passed to `model.encode`.

    Returns:
        dict mapping each threshold to
        {'f2': ..., 'acc': ..., 'prec': ..., 'rec': ...}.
    """
    sentences1 = df['polish_label'].tolist()
    sentences2 = df['name_variant'].tolist()
    gt_labels = [1 if x else 0 for x in df['truth'].tolist()]

    embs1 = model.encode(
        sentences1, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False
    )
    embs2 = model.encode(
        sentences2, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False
    )

    # Row-wise cosine similarity of aligned pairs. This avoids materialising the
    # full N x N similarity matrix only to read its diagonal.
    cosine_scores = (
        torch.nn.functional.cosine_similarity(embs1, embs2, dim=1).cpu().numpy()
    )

    results = {}
    for t in thresholds:
        pred_labels = (cosine_scores >= t).astype(int)

        # zero_division=0: a threshold that yields no positive predictions
        # scores 0 instead of raising a warning
        results[t] = {
            'f2': fbeta_score(gt_labels, pred_labels, beta=2, zero_division=0),
            'acc': accuracy_score(gt_labels, pred_labels),
            'prec': precision_score(gt_labels, pred_labels, zero_division=0),
            'rec': recall_score(gt_labels, pred_labels, zero_division=0),
        }

    return results


# --- 2. MAIN CROSS-VALIDATION LOOP ---

In [None]:
# Use the shared seed constant instead of a second hard-coded 42
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# One entry per fold. Each entry maps a threshold to that fold's metrics:
# all_folds_data = [
#    { 0.35: {'val_f2': ..., 'oos_f2': ...}, 0.40: {...} },  <-- Fold 1
#    { ... }                                                 <-- Fold 2
# ]
all_folds_data = []

final_df_cv_pool = df_cv_pool


for fold_idx, (train_idx, val_idx) in enumerate(kf.split(final_df_cv_pool)):
    print(f"\n--- FOLD {fold_idx+1}/{N_FOLDS} ---")

    # Seed torch per fold so DataLoader shuffling and dropout are repeatable
    # across runs (GPU kernels may still introduce small non-determinism).
    torch.manual_seed(RANDOM_SEED + fold_idx)

    # Split the data (kf.split yields positional indices, hence iloc)
    train_df = final_df_cv_pool.iloc[train_idx]
    val_df = final_df_cv_pool.iloc[val_idx]

    # Build training pairs with soft targets: 0.9 for matches, 0.1 for non-matches
    train_examples = [
        InputExample(texts=[label_text, variant_text], label=0.9 if is_match else 0.1)
        for label_text, variant_text, is_match in zip(
            train_df['polish_label'], train_df['name_variant'], train_df['truth']
        )
    ]
    print(len(train_examples))

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

    # Start every fold from the pretrained checkpoint
    model = SentenceTransformer("sentence-transformers/LaBSE")
    train_loss = losses.CosineSimilarityLoss(model=model)

    # NOTE(review): warmup_steps=100 can exceed the total number of optimisation
    # steps (len(train_dataloader) * EPOCHS) when the CV pool is small, in which
    # case training ends before warm-up completes. Value kept unchanged so that
    # results stay comparable - confirm this is intended.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=100,
        show_progress_bar=True
    )

    # --- EVALUATION ---
    print("Evaluating Validation subset...")
    val_metrics = compute_metrics(model, val_df, THRESHOLDS)

    print("Evaluating OOS pool...")
    oos_metrics = compute_metrics(model, df_oos, THRESHOLDS)

    # Collect results for this fold:
    # validation subset -> F2 and accuracy; OOS pool -> precision, recall and F2
    fold_data = {
        t: {
            'val_f2': val_metrics[t]['f2'],
            'val_acc': val_metrics[t]['acc'],
            'oos_prec': oos_metrics[t]['prec'],
            'oos_rec': oos_metrics[t]['rec'],
            'oos_f2': oos_metrics[t]['f2'],
        }
        for t in THRESHOLDS
    }
    all_folds_data.append(fold_data)


# --- 3. RESULTS AGGREGATION AND TABLE FORMATION ---

In [None]:
final_summary = []

for threshold in THRESHOLDS:
    # One row per fold, one column per metric, all at this threshold
    per_fold = pd.DataFrame([fold[threshold] for fold in all_folds_data])

    summary_row = {'Threshold': threshold}
    # Render every metric as "mean ± std" across the folds
    summary_row.update({
        metric: f"{per_fold[metric].mean():.3f} ± {per_fold[metric].std():.3f}"
        for metric in per_fold.columns
    })

    final_summary.append(summary_row)

# Thresholds become the index; the metric strings are the columns
df_final_table = pd.DataFrame(final_summary).set_index('Threshold')


In [None]:
# Show the summary transposed: metrics as rows, thresholds as columns
print("\n=== FINAL CROSS-VALIDATION TABLE ===")
display(df_final_table.transpose())


In [None]:
# File stem depends on which data-split variant was run
filename_base = (
    "labse_big_finetune_results" if FULL_TEST else "labse_smaller_finetune_results"
)

# Make sure the output directory exists before writing anything
os.makedirs("output", exist_ok=True)

# 1. CSV copy (for easy inspection)
csv_path = f"output/{filename_base}.csv"
df_final_table.to_csv(csv_path)
print(f"Saved the table (CSV) to: {csv_path}")

# 2. Parquet copy of the same table (cells are the formatted "mean ± std" strings)
parquet_path = f"output/{filename_base}.parquet"
df_final_table.to_parquet(parquet_path)
print(f"Saved the table (Parquet) to: {parquet_path}")


# Export LaTeX results

In [None]:
def export_latex_table(
    df_results: pd.DataFrame,
    num_folds: int,
    model_name: str,
    selected_thresholds: Optional[List[float]] = None,
    label_suffix: str = "model"
) -> None:
    """
    Build a LaTeX `table*` from a results DataFrame and print it to stdout.

    The table has one column per threshold and two row groups: validation
    metrics ('val_f2', 'val_acc') and out-of-sample metrics ('oos_prec',
    'oos_rec', 'oos_f2'). Metrics missing from `df_results` are skipped.
    In every row the cell(s) with the highest positive mean are set in bold.

    Args:
        df_results (pd.DataFrame): Result strings such as "0.850 ± 0.010",
            indexed by threshold (float), one column per metric.
        num_folds (int): Number of cross-validation folds (used in the caption).
        model_name (str): Model name (used in the caption).
        selected_thresholds (List[float], optional): Thresholds to include, in
            the order they should appear. If None, all thresholds are used.
        label_suffix (str): Suffix for the LaTeX label `tab:<suffix>_thresholds`.

    Raises:
        ValueError: If any of the selected_thresholds is not in df_results.index.
    """

    # 1. Optionally restrict (and reorder) the thresholds
    if selected_thresholds is not None:
        missing = [t for t in selected_thresholds if t not in df_results.index]
        if missing:
            available = df_results.index.tolist()
            raise ValueError(
                f"Thresholds {missing} not found in results.\n"
                f"Available thresholds are: {available}"
            )
        df_filtered = df_results.loc[selected_thresholds]
    else:
        df_filtered = df_results

    # 2. Transpose: thresholds become columns, metrics become rows
    df_T = df_filtered.T

    # 3. Display names for the metric rows
    metric_map = {
        'val_f2': 'F$_2$ score',
        'val_acc': 'Accuracy',
        'oos_prec': 'Precision',
        'oos_rec': 'Recall',
        'oos_f2': 'F$_2$ score'
    }

    def parse_mean(cell) -> Optional[float]:
        """Return the leading mean of a "mean ± std" cell, or None if unusable."""
        text = str(cell).replace('±', ' ').replace(r'$\pm$', ' ')
        try:
            value = float(text.split()[0])
        except (ValueError, IndexError):
            return None
        # A NaN mean must not take part in the max() below: comparisons with
        # NaN are always False, which would make the result order-dependent.
        return None if np.isnan(value) else value

    def format_row_content(metric_key, row_series):
        """Render one metric row, bolding the best mean among the shown thresholds."""
        raw_cells = [cell for _, cell in row_series.items()]
        means = [parse_mean(cell) for cell in raw_cells]
        best = max((m for m in means if m is not None), default=None)

        latex_cells = []
        for cell, mean in zip(raw_cells, means):
            formatted = str(cell).replace('±', r'$\pm$')
            # Only a strictly positive best value is highlighted; ties are all bolded
            if best is not None and best > 0 and mean == best:
                formatted = f"\\textbf{{{formatted}}}"
            latex_cells.append(formatted)

        return f"{metric_map.get(metric_key, metric_key)} & " + " & ".join(latex_cells) + " \\\\"

    # 4. Header pieces
    thresholds = df_T.columns.tolist()
    header_cols = " & ".join([f"{t:.2f}" for t in thresholds])
    col_def = "c" * len(thresholds)
    n_cols_total = str(len(thresholds) + 1)  # metric-name column + one per threshold

    # 5. Assemble the LaTeX lines
    latex_code = [
        r"\begin{table*}[!ht]",
        r"    \centering",
        f"    \\caption{{{model_name}: cross-validated performance \\\\for different decision thresholds (mean $\\pm$ standard deviation over {num_folds} folds).}}",
        f"    \\label{{tab:{label_suffix}_thresholds}}",
        f"    \\begin{{tabular}}{{l{col_def}}}",
        r"        \hline",
        f"        & \\multicolumn{{{len(thresholds)}}}{{c}}{{Decision threshold $\\tau$}} \\\\",
        f"        \\cline{{2-{len(thresholds)+1}}}",
        f"        Metric & {header_cols} \\\\",
        r"        \hline",
        r"        \multicolumn{" + n_cols_total + r"}{l}{\textit{Validation subset}} \\"
    ]

    for metric in ['val_f2', 'val_acc']:
        if metric in df_T.index:
            latex_code.append("        " + format_row_content(metric, df_T.loc[metric]))

    latex_code.append(r"        \hline")
    latex_code.append(r"        \multicolumn{" + n_cols_total + r"}{l}{\textit{Out-of-sample evaluation pool}} \\")

    for metric in ['oos_prec', 'oos_rec', 'oos_f2']:
        if metric in df_T.index:
            latex_code.append("        " + format_row_content(metric, df_T.loc[metric]))

    latex_code.extend([
        r"        \hline",
        r"    \end{tabular}",
        r"\end{table*}"
    ])

    print("\n".join(latex_code))

In [None]:
export_latex_table(
    df_results=df_final_table,
    num_folds=N_FOLDS,
    model_name="LaBSE",
    # Keep the LaTeX label in step with the run variant selected by FULL_TEST,
    # mirroring how the output file names are chosen.
    label_suffix="labse_big_finetune" if FULL_TEST else "labse_smaller_finetune",
    selected_thresholds=[0.35, 0.4, 0.45, 0.5],
)

# ROC curve

In [None]:
# 1. Evaluation data: the out-of-sample pool (only read here, so no copy is needed)
df_eval = df_oos


# 2. Generate embeddings.
# NOTE: `model` is whatever the cross-validation loop assigned last, i.e. the
# model fine-tuned in the final fold.
embs1 = model.encode(df_eval['polish_label'].tolist(), convert_to_tensor=True, show_progress_bar=True)
embs2 = model.encode(df_eval['name_variant'].tolist(), convert_to_tensor=True, show_progress_bar=True)


# 3. Similarity of each aligned pair (these are the y_scores for the ROC curve).
# Computed row-wise so the full N x N similarity matrix is never allocated.
cosine_scores = torch.nn.functional.cosine_similarity(embs1, embs2, dim=1).cpu().numpy()


# 4. Ground-truth labels as 0/1 integers
y_true = df_eval['truth'].astype(int).to_numpy()


In [None]:
def plot_roc_curve_scores(
        y_true: np.ndarray,
        y_scores: np.ndarray,
        title: str = "ROC curve",
        save_path: Optional[Path] = None,
        mark_threshold: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float]:
    """
    Draw the ROC curve for pre-computed scores (e.g., cosine similarity) on a
    Seaborn 'darkgrid' canvas, then either save the figure or show it.

    Note: calls `sns.set_theme`, which changes the global plotting style.

    Args:
        y_true: Ground-truth binary labels (0 or 1).
        y_scores: Continuous scores (e.g., probabilities or cosine similarity).
        title: Chart title.
        save_path: Where to save the figure; parent directories are created.
            When None the figure is shown instead of saved.
        mark_threshold: If given, the ROC point whose threshold is closest to
            this value is marked with a dot.

    Returns:
        fpr, tpr, thresholds, auc_value
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    auc_value = auc(fpr, tpr)

    # Grid look: fine white dotted lines, 0.5 wide, with white axes edges
    grid_rc = {
        "grid.linestyle": (0, (1, 1.5)),
        "grid.linewidth": 0.5,
        "grid.color": "white",
        "axes.edgecolor": "white",
    }
    sns.set_theme(style="darkgrid", rc=grid_rc)

    fig, ax = plt.subplots(figsize=(4.0, 4.0), dpi=300)

    # The curve itself plus the chance diagonal
    ax.plot(fpr, tpr, color='C0', lw=2, label=f"AUC = {auc_value:.3f}")
    ax.plot([0, 1], [0, 1], color='C1', linestyle='--', lw=1, label="Random")

    if mark_threshold is not None:
        # Operating point whose threshold is nearest to the requested one
        nearest = np.argmin(np.abs(thresholds - mark_threshold))
        ax.scatter(
            fpr[nearest],
            tpr[nearest],
            s=25,
            color='C0',
            marker='o',
            zorder=5,
            label=f"Threshold = {mark_threshold:.2f}",
        )

    # Font sizes: title and axis labels 11.5, tick labels 11, legend 10.5
    text_size = 11.5
    ax.set_title(title, fontsize=text_size)
    ax.set_xlabel("False positive rate", fontsize=text_size)
    ax.set_ylabel("True positive rate", fontsize=text_size)
    ax.tick_params(axis='both', which='major', labelsize=11)
    ax.legend(loc="lower right", fontsize=10.5)

    # Square plot area with a little headroom above TPR = 1
    ax.set_xlim(-0.0, 1.0)
    ax.set_ylim(-0.0, 1.02)
    ax.set_aspect("equal", "box")

    fig.tight_layout()

    if save_path is None:
        plt.show()
    else:
        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, bbox_inches="tight")
        print(f"Plot saved to: {save_path}")
        plt.close(fig)

    return fpr, tpr, thresholds, auc_value


In [None]:
# 5. Plot the ROC curve; the figure name follows the run variant
fig_name = (
    "roc_labse_similarity_big_tuning" if FULL_TEST else "roc_labse_similarity_small_tuning"
)

fig_path_labse = Path(f"figures/{fig_name}.pdf")
best_threshold = 0.50  # chosen threshold, marked on the curve

fpr, tpr, ths, auc_val = plot_roc_curve_scores(
    y_true=y_true,
    y_scores=cosine_scores,
    title="LaBSE - ROC curve",
    save_path=fig_path_labse,
    mark_threshold=best_threshold,
)

print(f"LaBSE AUC: {auc_val:.3f}")

# END