In [None]:
%%time
!pip install sentence-transformers einops hf_xet
!pip install ninja pyarrow
!pip install ftfy emoji
!pip install catboost
!pip install --no-deps dask-expr
!pip install faiss-cpu

In [None]:
%%time
import time, os, re, pickle
from datetime import datetime
import torch, ftfy, emoji, faiss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import multiprocessing as mp
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides NumPy divide-by-zero / invalid-value warnings and
# library deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Disable parallelism inside the tokenizers library.
# NOTE(review): presumably set to avoid the fork/parallelism warning when worker
# processes are spawned during text cleaning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# ==========================================
# 1. CONFIGURATION
# ==========================================
# Main labelled dataset; later cells read it from datasetFolder + dataset + '.csv'.
dataset = "EnSuperset"
# Dataset used as the augmentation pool (same path convention).
datasetAugment = "EnToxiGen"
# Folder holding the input CSVs and the cached embedding parquet files.
datasetFolder = './data/'

# CV strategy and results folder with date
k_folds = 3  # number of StratifiedKFold splits
start_date = None # Set to '20260104' to reuse results
if start_date is None:
    # Default to today's date, so a new results folder is created per calendar day.
    start_date = datetime.now().strftime('%Y%m%d')

# Per-fold pickles, the plot and the CSV summary are all written under this folder.
resultsFolder = f'./results_{start_date}_{k_folds}foldCV/'
os.makedirs(resultsFolder, exist_ok=True)
print(f"Results folder: {resultsFolder}")

# Device for the sentence encoders; the experiment cell maps it to CatBoost's 'GPU'/'CPU'.
compDevice = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size_setting = 16  # batch size passed to SentenceTransformer.encode
num_tree = 500           # CatBoost iterations
stop_rounds = 20         # CatBoost early_stopping_rounds
lrn_rate = 0.05          # CatBoost learning_rate
cb_depth = 8             # CatBoost tree depth
eval_kpi = "AUC"         # CatBoost eval_metric
num_vars = 64            # number of PCA components kept for "+pca" vectorizers
val_frac = 0.2           # share of each training fold held out as the early-stopping validation set

# Vectorizers to evaluate. Format is "<model>" or "<model>+pca"; the part before '+'
# selects the encoder and a "+pca" suffix enables the PCA projection.
vectors = ["e5+pca", "snow+pca", "jina+pca", "potion"]
# Numbers of nearest pool neighbours fetched per problematic training sample.
k_neighbors_list = [1, 2, 3]

# Augmentation sizes as a fraction of the main dataset size; 0.0 is the no-augmentation baseline.
augmentation_fractions = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05]
#augmentation_fractions = [0.0, 0.005, 0.010, 0.015, 0.020, 0.025, 0.030, 0.035, 0.040, 0.045, 0.05]
# When True, problematic inner-train rows are dropped and replaced by their pool neighbours
# instead of the neighbours being appended.
replace_with_neighbors = False
# When True, the initial model also scores the whole pool so that pool points it
# misclassifies can be excluded as neighbours.
filter_augmentation_pool = False

# Global plot settings
# y-axis limits of every subplot; values outside this range are not visible.
PLOT_MIN_AUC = 0.9
PLOT_MAX_AUC = 1.0

# Ultrafast prototyping
# When True the experiment cell uses synthetic predictions instead of training CatBoost
# and neither loads nor saves intermediate results.
# NOTE(review): left enabled here — set to False for a real run.
FAKE_MODEL_MODE = True

# Random augmentation
BASELINE_REPEATS = 5 # Runs for random augmentation at max fraction
SHOW_BASELINE_RANGE = True # Show min-max range for random baseline

In [None]:
# ==========================================
# 2. DATA PROCESSING & HELPER FUNCTIONS
# ==========================================
def fix_punctuation(text, toneDown=True):
    """Clean one raw text value before it is embedded.

    Steps, in order: repair mojibake with ``ftfy``, replace emoji with their
    ``::name::`` text form, then apply a fixed list of regex rules:

    * remove http/https URLs;
    * collapse runs of three or more identical ``, . ? !`` to two;
    * ensure a space after ``, . ? !`` and none before it;
    * collapse whitespace runs to one space;
    * separate digits from adjacent non-digit characters with a space;
    * when ``toneDown`` is true, replace every ``?`` and ``!`` with ``.``.

    Args:
        text: The raw value. Anything that is not a ``str`` (NaN, None,
            numbers, lists, bytes, ...) is treated as missing.
        toneDown: Whether to flatten ``?``/``!`` into ``.``.

    Returns:
        The cleaned, stripped string, or ``''`` for non-string input.
    """
    # Only real strings can be cleaned. The previous `hasattr(text, '__len__')`
    # check also let lists/bytes/dicts through, which then crashed in ftfy/re.
    if not isinstance(text, str):
        return ''

    text = ftfy.fix_text(text)
    rules = [(r'https?://\S+', ''), (r'([,\.?!])\1{2,}', r'\1\1'),
             (r'([,\.?!])(?=[^\s])', r'\1 '), (r'\s+([,\.?!])', r'\1'),
             (r'\s{2,}', ' '), (r'(\d)(?=[^\s\d,\.?!-])', r'\1 '),
             (r'(?<=[^\s\d-])(\d)', r' \1')]
    if toneDown:
        rules.append((r'[?!]', '.'))
    # Emoji are converted to text before the regex rules run, so the rules
    # also apply to the inserted "::name::" tokens.
    text = emoji.demojize(text, delimiters=(" ::", ":: "))
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def clean_and_vectorize(df, sentvec="e5", device="cuda"):
    """Clean the text column of ``df`` and embed it with a sentence encoder.

    Args:
        df: Frame whose second column (position 1) holds the raw text.
        sentvec: Encoder key. "jina", "snow" and "potion" select their own
            model ids; any other value is inserted into the
            ``intfloat/multilingual-<sentvec>-large-instruct`` pattern.
        device: Device string handed to ``SentenceTransformer``.

    Returns:
        A DataFrame with one row per input row and columns ``X1..Xd`` holding
        the embeddings (requested with ``normalize_embeddings=True``).
    """
    # (prefix, suffix) wrapped around the key to form the model id.
    name_parts = {
        "jina": ('jinaai/', '-embeddings-v3'),
        "snow": ('Snowflake/', 'flake-arctic-embed-l-v2.0'),
        "potion": ('minishlab/', '-base-2M'),
    }
    prefix, suffix = name_parts.get(sentvec, ('intfloat/multilingual-', '-large-instruct'))

    model_kwargs = {"device": device}
    if sentvec == "jina":
        # Only the jina model is loaded with remote code enabled.
        model_kwargs["trust_remote_code"] = True
    encoder = SentenceTransformer(prefix + sentvec + suffix, **model_kwargs)

    # Clean the texts in parallel: one dask partition per CPU core, run in processes.
    partitioned = dd.from_pandas(df, npartitions=mp.cpu_count())
    cleaned = partitioned.apply(lambda x: fix_punctuation(x.iloc[1]), axis=1, meta=pd.Series(dtype="str"))
    texts = cleaned.compute(scheduler="processes").tolist()

    embeddings = encoder.encode(
        texts,
        batch_size=batch_size_setting,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

    df_embeddings = pd.DataFrame(embeddings)
    df_embeddings.columns = [f'X{j}' for j in range(1, df_embeddings.shape[1] + 1)]
    return df_embeddings

def get_error_mask(y_true, probs, optimal_threshold):
    """
    Flag samples whose thresholded prediction contradicts the label.

    A sample is flagged when it is labelled 0 but scored strictly above
    ``optimal_threshold`` (false alarm), or labelled 1 but scored strictly
    below it (miss). Scores exactly equal to the threshold are never flagged.

    Returns None when ``probs`` is None, otherwise the element-wise boolean mask.
    """
    if probs is None:
        return None

    scored_above = probs > optimal_threshold
    scored_below = probs < optimal_threshold
    is_negative = y_true == 0
    is_positive = y_true == 1
    return (is_negative & scored_above) | (is_positive & scored_below)

def normalize_query_vectors(query_vectors):
    """Return ``query_vectors`` as a C-contiguous float32 array, L2-normalised by FAISS.

    ``faiss.normalize_L2`` works in place on the prepared array. Because
    ``np.ascontiguousarray`` does not copy an input that is already
    C-contiguous float32, the caller's array can be modified in that case;
    pass a copy if the original must stay untouched.
    """
    prepared = np.ascontiguousarray(query_vectors, dtype=np.float32)
    faiss.normalize_L2(prepared)
    return prepared

def build_faiss_index(vectors, ids, use_gpu=False):
    """
    Create an inner-product FAISS index over L2-normalised vectors, keyed by caller ids.

    vectors: np.ndarray [n_samples, dim]; normalised via normalize_query_vectors
             before being added, so inner product equals cosine similarity.
    ids:     np.ndarray [n_samples]; stored as int64 and returned by searches
             (global indices into X_aug_full / y_augment_full).
    use_gpu: when True, try to move the index to GPU 0; on any exception a
             message is printed and the CPU index is used instead.
    """
    unit_vectors = normalize_query_vectors(vectors)
    dim = unit_vectors.shape[1]

    flat_ip = faiss.IndexFlatIP(dim)
    index = faiss.IndexIDMap(flat_ip)  # wrapper that lets us supply our own ids

    if use_gpu:
        try:
            gpu_resources = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(gpu_resources, 0, index)
        except Exception as e:
            print(f" GPU failed ({e}), using CPU")

    id_array = ids.astype("int64")
    index.add_with_ids(unit_vectors, id_array)
    return index

def find_knn_augmentation_indices_faiss(X_train, y_train, train_probs, optimal_threshold,
                                        X_aug_pool, y_aug_pool, k, target_aug_size,
                                        faiss_indices_by_label, pool_probs=None):
    """
    Pick augmentation-pool samples that are nearest neighbours of misclassified training samples.

    Training samples flagged by ``get_error_mask`` are visited from the largest
    to the smallest distance between their score and ``optimal_threshold``.
    For each one, up to ``k`` neighbours are fetched from the FAISS index of
    the same label; neighbours are kept if they carry that label, have not been
    selected already and (when ``pool_probs`` is given) are not themselves
    flagged by ``get_error_mask``. Selection stops at ``target_aug_size``.

    Args:
        X_train: Training feature matrix (rows aligned with ``y_train``).
        y_train: Training labels.
        train_probs: Predicted p(y=1) for every training row, or None.
        optimal_threshold: Decision threshold used to flag errors.
        X_aug_pool: Unused; kept so existing call sites keep working.
        y_aug_pool: Labels of the augmentation pool, indexed by the ids the
            FAISS indices return.
        k: Neighbours requested per problematic sample.
        target_aug_size: Maximum number of pool samples to return.
        faiss_indices_by_label: Mapping label -> FAISS index over the pool
            rows of that label.
        pool_probs: Optional p(y=1) for every pool row; enables filtering of
            pool rows the model gets wrong.

    Returns:
        (aug_indices, source_indices): two equal-length int arrays — the
        selected pool indices and, for each, the index into the training arrays
        of the problematic sample that pulled it in. Both are empty when
        nothing can be selected.
    """
    empty = np.array([], dtype=int)

    problematic_mask = get_error_mask(y_train, train_probs, optimal_threshold)
    # get_error_mask returns None when no probabilities were supplied.
    if problematic_mask is None or target_aug_size <= 0:
        return empty, empty

    problematic_indices_global = np.where(problematic_mask)[0]
    if len(problematic_indices_global) == 0:
        return empty, empty

    X_problematic = X_train[problematic_mask].astype(np.float32, copy=False)
    y_problematic = y_train[problematic_mask]
    prob_predictions = train_probs[problematic_mask]

    # Visit the most confidently wrong samples first.
    priority_scores = np.abs(prob_predictions - optimal_threshold)
    sorted_indices = np.argsort(priority_scores)[::-1]

    pool_errors = None
    if pool_probs is not None:
        pool_errors = get_error_mask(y_aug_pool, pool_probs, optimal_threshold)

    selected_aug_indices = []
    selected_problematic_global = []

    # Boolean "already selected" mask over the pool.
    used = np.zeros(len(y_aug_pool), dtype=bool)

    for local_prob_idx in sorted_indices:
        remaining = target_aug_size - len(selected_aug_indices)
        if remaining <= 0:
            break

        prob_label = y_problematic[local_prob_idx]
        faiss_index = faiss_indices_by_label[prob_label]
        k_search = min(k, faiss_index.ntotal)
        if k_search <= 0:
            continue

        # The query is not re-normalised: for a single query, scaling it does
        # not change the inner-product ranking of the indexed vectors.
        prob_vector = X_problematic[local_prob_idx:local_prob_idx + 1]
        _, nn_ids = faiss_index.search(prob_vector, k_search)
        candidates = nn_ids[0]
        # FAISS pads missing results with id -1; as an array index that would
        # silently address the last pool row, so drop those entries.
        candidates = candidates[candidates >= 0]

        # Label consistency (normally redundant with the per-label index)
        # and "not already used".
        mask = (y_aug_pool[candidates] == prob_label) & ~used[candidates]
        if pool_errors is not None:
            mask &= ~pool_errors[candidates]

        valid_candidates = candidates[mask]
        if valid_candidates.size == 0:
            continue

        take = valid_candidates[:remaining]
        selected_aug_indices.extend(take.tolist())
        used[take] = True
        selected_problematic_global.extend(
            [problematic_indices_global[local_prob_idx]] * len(take)
        )

    return np.array(selected_aug_indices, dtype=int), np.array(selected_problematic_global, dtype=int)

def fit_catboost_and_predict(X_train, y_train, X_val, y_val, X_train_full, X_test, X_aug_pool=None,
                             cb_device='CPU', num_tree=200, eval_kpi="AUC"):
    """Train a CatBoost binary classifier and score the requested sets.

    The model is fitted on (X_train, y_train) with (X_val, y_val) as the
    early-stopping set; the positive class is weighted by the ratio of
    label-0 to label-1 rows in ``y_train``. Learning rate, depth and the
    early-stopping patience come from the notebook-level settings
    ``lrn_rate``, ``cb_depth`` and ``stop_rounds``.

    Returns:
        (train_probs, val_probs, test_probs, aug_pool_probs): p(y=1) for
        X_train_full, X_val, X_test and X_aug_pool. The last item is None when
        ``X_aug_pool`` is None.
    """
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()

    classifier = CatBoostClassifier(
        iterations=num_tree,
        learning_rate=lrn_rate,
        depth=cb_depth,
        task_type=cb_device,
        allow_writing_files=False,
        eval_metric=eval_kpi,
        loss_function='Logloss',
        scale_pos_weight=n_negative / n_positive,
        verbose=0,
    )
    classifier.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        metric_period=5,
        early_stopping_rounds=stop_rounds,
        use_best_model=True,
        verbose=False,
    )

    def positive_class_proba(features):
        # Column 1 of predict_proba is the probability of class 1.
        return classifier.predict_proba(features)[:, 1]

    train_probs = positive_class_proba(X_train_full)
    val_probs = positive_class_proba(X_val)
    test_probs = positive_class_proba(X_test)
    aug_pool_probs = positive_class_proba(X_aug_pool) if X_aug_pool is not None else None
    return train_probs, val_probs, test_probs, aug_pool_probs

def get_intermediate_results_filename(vec_name, frac, k, run, fold_i):
    """Build the pickle file name for one (vectorizer, setting, fold) result.

    Args:
        vec_name: Vectorizer name, used as the file-name prefix.
        frac: Augmentation fraction; encoded in thousandths, zero-padded to 3 digits.
        k: Number of neighbours (part of the name only when ``run == 0``).
        run: 0 for k-NN / baseline runs, otherwise the random-augmentation repeat number.
        fold_i: Zero-based fold index; written one-based.

    Returns:
        ``<vec>_fold<n>.pkl`` for the no-augmentation baseline
        (frac == 0.0, k == 0, run == 0), ``<vec>_frac<fff>_k<k>_fold<n>.pkl``
        for k-NN runs and ``<vec>_frac<fff>_r<run>_fold<n>.pkl`` for random runs.
    """
    if frac == 0.0 and k == 0 and run == 0:
        return f"{vec_name}_fold{fold_i+1}.pkl"
    # Round rather than truncate: frac * 1000 is a float product and can land
    # just below the intended integer, which int() would turn into the tag of
    # the neighbouring fraction.
    frac_str = f"{int(round(frac * 1000)):03d}"
    if run == 0:
        filename = f"{vec_name}_frac{frac_str}_k{k}_fold{fold_i+1}.pkl"
    else:
        filename = f"{vec_name}_frac{frac_str}_r{run}_fold{fold_i+1}.pkl"
    return filename

def save_intermediate_results(vec_name, frac, k, run, fold_i, y_train, y_val, y_test, 
                             train_probs, val_probs, test_probs, results_folder):
    """Pickle the labels, predictions and run settings of one fold.

    The file name comes from get_intermediate_results_filename and the file is
    written into ``results_folder``. Returns the full path of the written file.
    """
    filename = get_intermediate_results_filename(vec_name, frac, k, run, fold_i)
    filepath = os.path.join(results_folder, filename)

    payload = dict(
        y_train=y_train, y_val=y_val, y_test=y_test,
        train_probs=train_probs, val_probs=val_probs, test_probs=test_probs,
        vectorizer=vec_name, fraction=frac, k_neighbors=k, run=run, fold=fold_i,
    )

    with open(filepath, 'wb') as handle:
        pickle.dump(payload, handle)

    print(f"      Saved: {filename}")
    return filepath

def load_intermediate_results(vec_name, frac, k, run, fold_i, results_folder):
    """Load a previously saved fold result, if its pickle exists.

    Returns the stored dictionary, or None when no file with the expected name
    is present in ``results_folder``.

    NOTE(review): ``pickle.load`` executes code embedded in the file; only
    point ``results_folder`` at results this notebook produced.
    """
    filename = get_intermediate_results_filename(vec_name, frac, k, run, fold_i)
    filepath = os.path.join(results_folder, filename)

    if not os.path.exists(filepath):
        return None

    with open(filepath, 'rb') as handle:
        cached = pickle.load(handle)
    print(f"      Loaded: {filename}")
    return cached

def generate_fake_predictions(y_true, base_auc=0.7, noise_level=0.1, random_state=None):
    """Produce synthetic scores correlated with the labels (prototyping stand-in for a model).

    The labels are perturbed with Gaussian noise, converted to ranks scaled to
    [0, 1], and then blended with uniform noise.

    Args:
        y_true: Array-like of numeric labels.
        base_auc: Blend weight of the label-derived ranks; ``1 - base_auc`` is
            the weight of the uniform noise. Despite the name it is a mixing
            weight, not the AUC of the result.
        noise_level: Standard deviation of the Gaussian noise added to the labels.
        random_state: Seed (or anything ``np.random.default_rng`` accepts) for
            reproducible output; None draws fresh entropy.

    Returns:
        Float array with one score per label.
    """
    rng = np.random.default_rng(random_state)
    y_true = np.asarray(y_true)
    scores = y_true + rng.normal(0, noise_level, len(y_true))
    ranks = scores.argsort().argsort()
    # With a single sample the rank range is zero; divide by 1 instead of 0 so
    # the result is 0.0 rather than NaN.
    rank_span = max(len(ranks) - 1, 1)
    probs = ranks / rank_span
    return base_auc * probs + (1 - base_auc) * rng.random(len(probs))

In [None]:
%%time
# ==========================================
# 3. LOAD DATASETS
# ==========================================
print("Loading datasets...")


def _load_label_text_frame(csv_path, label_col):
    """Read a CSV and return it with the label in column 0 and the text in column 1.

    If ``label_col`` exists, the frame is reduced to that column followed by
    the first other column. Otherwise, if the first cell of the first column
    is a string (i.e. the text comes first), the column order is reversed.
    Columns are finally renamed to 0..n-1.
    """
    frame = pd.read_csv(csv_path, engine='python', header=0)
    if label_col in frame.columns:
        text_col = [c for c in frame.columns if c != label_col][0]
        frame = frame[[label_col, text_col]]
    elif isinstance(frame.iloc[0, 0], str):
        frame = frame[frame.columns[::-1]]
    frame.columns = range(frame.shape[1])
    return frame


# Main dataset: 'EnSuperset' names its label column 'labels'; any other dataset
# is expected to use 'prompt_label'. (The previous fallback branch inspected
# `df_augment` before it existed, raising NameError on a fresh kernel.)
df = _load_label_text_frame(
    datasetFolder + dataset + '.csv',
    'labels' if dataset == 'EnSuperset' else 'prompt_label',
)
y_main = df[0].to_numpy()

# Augmentation pool: 'EnToxiGen' uses 'prompt_label'; any other pool is expected to use 'labels'.
df_augment = _load_label_text_frame(
    datasetFolder + datasetAugment + '.csv',
    'prompt_label' if datasetAugment == 'EnToxiGen' else 'labels',
)
y_augment_full = df_augment[0].to_numpy()

In [None]:
# ==========================================
# 4. PRE-CALCULATE VECTORS AND BUILD FAISS INDICES
# ==========================================
# vectors_cache[vec]["main" | "aug"] -> feature matrix for the main set / augmentation pool.
vectors_cache = {v: {} for v in vectors}
# faiss_indices_cache[vec][label] -> FAISS index over the pool rows with that label.
faiss_indices_cache = {v: {} for v in vectors}
# vector_dims[vec] -> number of feature columns actually used (after optional PCA).
vector_dims = {}
use_faiss_gpu = False

# Row positions in the full augmentation pool; used as FAISS ids so search
# results index straight into X_aug / y_augment_full.
global_ids = np.arange(len(y_augment_full), dtype=np.int64)

for sentvec in vectors:
    print(f"\nProcessing: {sentvec}")
    parts = sentvec.split("+", 1)
    sentvecModel = parts[0]
    transformPCA = len(parts) > 1 and parts[1] == "pca"

    # Main set vectors (embedded once, then cached as parquet next to the data)
    f_path = f"{datasetFolder}{dataset}-X-{sentvecModel}.parquet"
    if not os.path.exists(f_path):
        df_vec = clean_and_vectorize(df, sentvec=sentvecModel, device=compDevice)
        df_vec.to_parquet(f_path, engine="pyarrow")
    X_main = pd.read_parquet(f_path).to_numpy()
    # A cache written for a different version of the CSV would silently misalign rows and labels.
    if len(X_main) != len(y_main):
        raise ValueError(
            f"Cached vectors in {f_path} have {len(X_main)} rows but the dataset has "
            f"{len(y_main)}; delete the stale cache file and re-run."
        )

    # Augmentation pool vectors (ToxiGen)
    f_path_aug = f"{datasetFolder}{datasetAugment}-X-{sentvecModel}.parquet"
    if not os.path.exists(f_path_aug):
        df_vec_aug = clean_and_vectorize(df_augment, sentvec=sentvecModel, device=compDevice)
        df_vec_aug.to_parquet(f_path_aug, engine="pyarrow")
    X_aug = pd.read_parquet(f_path_aug).to_numpy()
    if len(X_aug) != len(y_augment_full):
        raise ValueError(
            f"Cached vectors in {f_path_aug} have {len(X_aug)} rows but the augmentation "
            f"pool has {len(y_augment_full)}; delete the stale cache file and re-run."
        )

    # Optional joint PCA, fitted on main + pool together so both live in the same space.
    # NOTE(review): the fit sees rows that later serve as CV test folds (unsupervised only).
    if transformPCA:
        # Fixed seed (same value as the CV splitter) so the projection is
        # reproducible even when scikit-learn selects its randomized SVD solver.
        pca = PCA(n_components=num_vars, random_state=42)
        merged = np.vstack([X_main, X_aug])
        pca.fit(merged)
        merged_transformed = pca.transform(merged)
        n_main = len(X_main)
        X_main = merged_transformed[:n_main]
        X_aug = merged_transformed[n_main:]
        del merged, merged_transformed
        vector_dims[sentvec] = num_vars
    else:
        vector_dims[sentvec] = X_main.shape[1]

    vectors_cache[sentvec]["main"] = X_main
    vectors_cache[sentvec]["aug"] = X_aug

    # Build label-specific FAISS indices with GLOBAL ids
    faiss_indices_cache[sentvec] = {}

    for label in [0, 1]:
        label_mask = (y_augment_full == label)
        # Copy: build_faiss_index normalises its input and may do so in place.
        X_aug_label = X_aug[label_mask].copy()
        ids_label = global_ids[label_mask]

        if len(ids_label) == 0:
            # No samples of this label in the pool: store an empty index so lookups still work.
            faiss_indices_cache[sentvec][label] = faiss.IndexIDMap(
                faiss.IndexFlatIP(vector_dims[sentvec])
            )
            continue

        index = build_faiss_index(X_aug_label, ids_label, use_gpu=use_faiss_gpu)
        faiss_indices_cache[sentvec][label] = index

# Printed once, after every vectorizer has been processed.
print("\n" + "=" * 60)
print("Vector preparation and FAISS indexing complete!")
print("=" * 60)

In [None]:
%%time
# ==========================================
# 5. SMART k-NN AUGMENTATION EXPERIMENT
# ==========================================
# Largest augmentation fraction; the random-augmentation baseline is run only at this size.
max_frac = max(augmentation_fractions)
# final_results[vec][k][frac] -> pooled test AUC. k == 0 holds the no-augmentation
# baseline; the frac == 0.0 entry of every k is filled with that same baseline value.
final_results = {
    v: {k: {f: None for f in augmentation_fractions} for k in [0] + k_neighbors_list}
    for v in vectors
}
# baseline_results[vec] -> list of pooled AUCs, one per random-augmentation repeat.
baseline_results = {v: [] for v in vectors}

# The same seeded splitter is re-used for every setting, so all settings see identical folds.
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
cb_device = 'GPU' if compDevice == 'cuda' else 'CPU'

# Optional: precompute probabilities on the augmentation pool per vectorizer.
# If unused, leave as None and filtering will be skipped.
# NOTE(review): this dictionary is not read anywhere in this cell.
aug_pool_probs_per_vec = {v: None for v in vectors}

for vec_name in vectors:
    print(f"\n>>> {vec_name}")

    X_main = vectors_cache[vec_name]['main']
    X_aug_full = vectors_cache[vec_name]['aug']

    # ----- Baseline (no augmentation) -----
    # Test-fold labels and predictions are pooled over all folds and scored with a single AUC.
    pooled_y_true, pooled_y_probs = [], []

    for fold_i, (train_idx, test_idx) in enumerate(skf.split(X_main, y_main)):
        if not FAKE_MODEL_MODE:
            # Re-use a previously saved result for this fold instead of retraining.
            existing = load_intermediate_results(vec_name, 0.0, 0, 0, fold_i, resultsFolder)
            if existing:
                pooled_y_true.extend(existing['y_test'])
                pooled_y_probs.extend(existing['test_probs'])
                continue

        X_train_fold, X_test_fold = X_main[train_idx], X_main[test_idx]
        y_train_fold, y_test_fold = y_main[train_idx], y_main[test_idx]

        # Inner split of the training fold: fit set + early-stopping validation set.
        X_in_train, X_in_val, y_in_train, y_in_val = train_test_split(
            X_train_fold, y_train_fold, test_size=val_frac,
            stratify=y_train_fold, random_state=42
        )

        if FAKE_MODEL_MODE:
            # Synthetic predictions; nothing is trained or saved in this mode.
            train_probs = generate_fake_predictions(y_train_fold, base_auc=0.69, noise_level=0.1)
            val_probs   = generate_fake_predictions(y_in_val,   base_auc=0.69, noise_level=0.1)
            test_probs  = generate_fake_predictions(y_test_fold, base_auc=0.69, noise_level=0.1)
        else:
            train_probs, val_probs, test_probs, _ = fit_catboost_and_predict(
                X_in_train, y_in_train, X_in_val, y_in_val,
                X_train_fold, X_test_fold, X_aug_pool=None,
                cb_device=cb_device, num_tree=num_tree, eval_kpi=eval_kpi
            )

            save_intermediate_results(vec_name, 0.0, 0, 0, fold_i, y_train_fold, y_in_val, y_test_fold,
                                      train_probs, val_probs, test_probs, resultsFolder)

        pooled_y_true.extend(y_test_fold)
        pooled_y_probs.extend(test_probs)

    baseline_auc = roc_auc_score(pooled_y_true, pooled_y_probs)
    final_results[vec_name][0][0.0] = baseline_auc
    for k in k_neighbors_list:
        final_results[vec_name][k][0.0] = baseline_auc
    print(f" Base AUC: {baseline_auc:.4f}")

    # ----- k-NN augmentation -----
    for frac in augmentation_fractions:
        if frac == 0.0:
            continue

        print(f"\n frac={frac:.3f}")
        # Requested number of augmentation samples, relative to the FULL main
        # dataset (not to the training fold). Fewer are used if the k-NN search
        # does not yield enough valid neighbours.
        target_aug_size = int(len(y_main) * frac)

        for k in k_neighbors_list:
            print(f" k={k}")
            pooled_y_true, pooled_y_probs = [], []

            for fold_i, (train_idx, test_idx) in enumerate(skf.split(X_main, y_main)):
                if not FAKE_MODEL_MODE:
                    # NOTE(review): the cache key covers vectorizer/frac/k/fold only; results saved
                    # with different replace_with_neighbors / filter_augmentation_pool settings
                    # in the same results folder would be re-used here.
                    existing = load_intermediate_results(vec_name, frac, k, 0, fold_i, resultsFolder)
                    if existing:
                        pooled_y_true.extend(existing['y_test'])
                        pooled_y_probs.extend(existing['test_probs'])
                        continue

                X_train_fold, X_test_fold = X_main[train_idx], X_main[test_idx]
                y_train_fold, y_test_fold = y_main[train_idx], y_main[test_idx]

                # Split by index (rather than by array) so the positions of the inner-train
                # rows inside the training fold stay known for the replacement option below.
                all_indices = np.arange(len(y_train_fold))
                train_sub_idx, val_sub_idx = train_test_split(
                    all_indices, test_size=val_frac,
                    stratify=y_train_fold, random_state=42
                )

                X_in_train = X_train_fold[train_sub_idx]
                y_in_train = y_train_fold[train_sub_idx]
                X_in_val   = X_train_fold[val_sub_idx]
                y_in_val   = y_train_fold[val_sub_idx]

                if FAKE_MODEL_MODE:
                    # Synthetic stand-in whose blend weight grows with frac and shrinks with k.
                    base_auc_plus = 0.7 + frac / k
                    train_probs = generate_fake_predictions(y_train_fold, base_auc=base_auc_plus, noise_level=0.1)
                    val_probs   = generate_fake_predictions(y_in_val,   base_auc=base_auc_plus, noise_level=0.1)
                    test_probs  = generate_fake_predictions(y_test_fold, base_auc=base_auc_plus, noise_level=0.1)
                else:
                    # The pool is only scored by the initial model when pool filtering is enabled.
                    if filter_augmentation_pool:
                        X_aug_pool = X_aug_full
                    else:
                        X_aug_pool = None

                    # Initial model to get train_probs and threshold
                    init_train_probs, _, _, aug_pool_probs = fit_catboost_and_predict(
                        X_in_train, y_in_train, X_in_val, y_in_val,
                        X_train_fold, X_test_fold, X_aug_pool=X_aug_pool,
                        cb_device=cb_device, num_tree=num_tree, eval_kpi=eval_kpi
                    )

                    # Threshold where tpr + fpr is closest to 1 on the training-fold ROC,
                    # i.e. where the true-positive rate is closest to 1 - false-positive rate.
                    fpr, tpr, thresholds = roc_curve(y_train_fold, init_train_probs)
                    optimal_threshold = thresholds[np.argmin(np.abs(tpr + fpr - 1))]

                    # aug_indices: selected pool rows; problematic_global: for each of them, the
                    # position (within the training fold) of the sample that pulled it in.
                    aug_indices, problematic_global = find_knn_augmentation_indices_faiss(
                        X_train_fold, y_train_fold, init_train_probs, optimal_threshold,
                        X_aug_full, y_augment_full, k, target_aug_size,
                        faiss_indices_cache[vec_name], aug_pool_probs)

                    if len(aug_indices) > 0:
                        X_aug = X_aug_full[aug_indices]
                        y_aug = y_augment_full[aug_indices]

                        if replace_with_neighbors:
                            # Keep only the problematic samples that belong to the inner-train part
                            # (validation rows are never removed).
                            in_train_mask = np.zeros(len(y_train_fold), dtype=bool)
                            in_train_mask[train_sub_idx] = True
                            problematic_in_train = problematic_global[in_train_mask[problematic_global]]

                            # Map training-fold positions to row positions inside X_in_train.
                            pos_in_train = -np.ones(len(y_train_fold), dtype=int)
                            pos_in_train[train_sub_idx] = np.arange(len(train_sub_idx))
                            problematic_positions = pos_in_train[problematic_in_train]

                            # Drop those rows, then append their pool neighbours.
                            keep_mask = np.ones(len(X_in_train), dtype=bool)
                            keep_mask[problematic_positions] = False

                            X_in_train_kept = X_in_train[keep_mask]
                            y_in_train_kept = y_in_train[keep_mask]

                            X_in_train_aug = np.concatenate([X_in_train_kept, X_aug], axis=0)
                            y_in_train_aug = np.concatenate([y_in_train_kept, y_aug], axis=0)
                        else:
                            X_in_train_aug = np.concatenate([X_in_train, X_aug], axis=0)
                            y_in_train_aug = np.concatenate([y_in_train, y_aug], axis=0)

                        # NOTE(review): uses the global NumPy RNG, which this cell only seeds
                        # later (in the random baseline section), so the row order is not
                        # reproducible for the first vectorizer.
                        shuffle_idx = np.random.permutation(len(X_in_train_aug))
                        X_in_train_aug = X_in_train_aug[shuffle_idx]
                        y_in_train_aug = y_in_train_aug[shuffle_idx]
                    else:
                        # Nothing selected: the second model is trained on the unchanged data.
                        X_in_train_aug, y_in_train_aug = X_in_train, y_in_train

                    # Final model on the augmented inner-train set; validation and test sets are untouched.
                    train_probs, val_probs, test_probs, _ = fit_catboost_and_predict(
                        X_in_train_aug, y_in_train_aug, X_in_val, y_in_val,
                        X_train_fold, X_test_fold, X_aug_pool=None,
                        cb_device=cb_device, num_tree=num_tree, eval_kpi=eval_kpi
                    )

                    save_intermediate_results(vec_name, frac, k, 0, fold_i, y_train_fold, y_in_val, y_test_fold,
                                              train_probs, val_probs, test_probs, resultsFolder)

                pooled_y_true.extend(y_test_fold)
                pooled_y_probs.extend(test_probs)

            pooled_auc = roc_auc_score(pooled_y_true, pooled_y_probs)
            final_results[vec_name][k][frac] = pooled_auc
            print(f" AUC: {pooled_auc:.4f} (Δ={pooled_auc - baseline_auc:+.4f})")

    # ----- Random augmentation baseline at max fraction -----
    print("\n")
    for repeat in range(BASELINE_REPEATS):
        # One random pool sample per repeat, shared by all folds of that repeat.
        # np.random.choice with replace=False raises if aug_size exceeds the pool size.
        aug_size = int(len(y_main) * max_frac)
        np.random.seed(42 + repeat)
        aug_indices = np.random.choice(len(y_augment_full), size=aug_size, replace=False)
        X_aug = X_aug_full[aug_indices]
        y_aug = y_augment_full[aug_indices]

        pooled_y_true, pooled_y_probs = [], []

        for fold_i, (train_idx, test_idx) in enumerate(skf.split(X_main, y_main)):
            if not FAKE_MODEL_MODE:
                # Random runs are cached under run number repeat + 1 (run 0 is reserved for k-NN).
                existing = load_intermediate_results(vec_name, max_frac, 0, repeat + 1, fold_i, resultsFolder)
                if existing:
                    pooled_y_true.extend(existing['y_test'])
                    pooled_y_probs.extend(existing['test_probs'])
                    continue

            X_train_fold, X_test_fold = X_main[train_idx], X_main[test_idx]
            y_train_fold, y_test_fold = y_main[train_idx], y_main[test_idx]

            X_in_train, X_in_val, y_in_train, y_in_val = train_test_split(
                X_train_fold, y_train_fold, test_size=val_frac,
                stratify=y_train_fold, random_state=42
            )

            # NOTE(review): built even in FAKE_MODEL_MODE, where it is not used.
            X_in_train_aug = np.concatenate([X_in_train, X_aug], axis=0)
            y_in_train_aug = np.concatenate([y_in_train, y_aug], axis=0)
            shuf = np.random.permutation(len(X_in_train_aug))
            X_in_train_aug = X_in_train_aug[shuf]
            y_in_train_aug = y_in_train_aug[shuf]

            if FAKE_MODEL_MODE:
                # Synthetic stand-in with a blend weight slightly below the k-NN one.
                base_auc_rand = 0.7 - max_frac / 2
                train_probs = generate_fake_predictions(y_train_fold, base_auc=base_auc_rand, noise_level=0.1)
                val_probs   = generate_fake_predictions(y_in_val,   base_auc=base_auc_rand, noise_level=0.1)
                test_probs  = generate_fake_predictions(y_test_fold, base_auc=base_auc_rand, noise_level=0.1)
            else:
                train_probs, val_probs, test_probs, _ = fit_catboost_and_predict(
                    X_in_train_aug, y_in_train_aug, X_in_val, y_in_val,
                    X_train_fold, X_test_fold, X_aug_pool=None,
                    cb_device=cb_device, num_tree=num_tree, eval_kpi=eval_kpi)

                save_intermediate_results(vec_name, max_frac, 0, repeat + 1, fold_i, y_train_fold, y_in_val, y_test_fold,
                                          train_probs, val_probs, test_probs, resultsFolder)

            pooled_y_true.extend(y_test_fold)
            pooled_y_probs.extend(test_probs)

        repeat_auc = roc_auc_score(pooled_y_true, pooled_y_probs)
        baseline_results[vec_name].append(repeat_auc)
        print(f" Rand AUC: {repeat_auc:.4f} ({repeat + 1}/{BASELINE_REPEATS})")

    # Summary of the random baseline for this vectorizer.
    mean_auc = np.mean(baseline_results[vec_name])
    min_auc, max_auc = np.min(baseline_results[vec_name]), np.max(baseline_results[vec_name])
    print(f" Mean AUC: {mean_auc:.4f} (Δ={mean_auc - baseline_auc:+.4f})")
    print(f" Range AUC: [{min_auc:.4f}, {max_auc:.4f}]")

print("\n")

In [None]:
# ==========================================
# 6. PLOTTING k-NN RESULTS WITH BASELINE AT MAX FRACTION
# ==========================================
print("\nGenerating plots...")

# One subplot per vectorizer, laid out on a two-column grid.
n_plots = len(vectors)
n_cols = 2
n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 6 * n_rows), squeeze=False)

for idx, vec_name in enumerate(vectors):
    row = idx // n_cols
    col = idx % n_cols
    ax = axes[row][col]

    baseline_auc = final_results[vec_name][0][0.0]
    aug_fracs = [f for f in augmentation_fractions if f > 0.0]

    # Original baseline (k=0, frac=0) - Black dashed line
    ax.axhline(y=baseline_auc, color='black', linestyle='--', linewidth=2, alpha=0.7,
               label=f"NoAug AUC={baseline_auc:.3f} (baseline)")

    # Random augmentation baseline statistics
    baseline_mean = np.mean(baseline_results[vec_name])
    baseline_min = np.min(baseline_results[vec_name])
    baseline_max = np.max(baseline_results[vec_name])

    # Random baseline: light gray band for the min-max range. Drawn exactly
    # once — drawing it twice (once per legend entry) stacked the two
    # translucent patches and made the band darker than the configured alpha.
    if SHOW_BASELINE_RANGE:
        ax.axhspan(baseline_min, baseline_max, color='lightgray', alpha=0.3,
                   label=f'rand range AUC=[{baseline_min:.3f}, {baseline_max:.3f}] @ {max_frac:.3f}')

    # Random baseline: Dark gray dashed line at mean
    ax.axhline(y=baseline_mean, color='darkgray', linestyle='--', linewidth=2, alpha=0.7,
               label=f'rand mean AUC={baseline_mean:.3f} @ {max_frac:.3f}')

    # k-NN results: one curve per k, labelled with its best AUC and the fraction where it occurs
    colors = plt.cm.viridis(np.linspace(0.2, 0.9, len(k_neighbors_list)))

    for k_idx, k in enumerate(k_neighbors_list):
        aucs = [final_results[vec_name][k][f] for f in aug_fracs]
        max_auc = max(aucs)
        max_idx = aucs.index(max_auc)
        max_frac_knn = aug_fracs[max_idx]

        ax.plot(aug_fracs, aucs, marker='o', color=colors[k_idx], linewidth=2, markersize=6,
                label=f"k={k} max AUC={max_auc:.3f} @ {max_frac_knn:.3f}")

    ax.set_xlabel("Fraction of Augmentation Data", fontsize=11, fontweight='bold')
    ax.set_ylabel("AUC ROC", fontsize=11, fontweight='bold')
    dims = vector_dims[vec_name] # Add dimensionality to title
    ax.set_title(f"{vec_name} ({dims} dims)", fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, axis='y', linestyle=':', linewidth=0.7, alpha=0.35)
    ax.set_axisbelow(True)
    # Fixed y-range from the configuration; curves outside it are not visible.
    ax.set_ylim(PLOT_MIN_AUC, PLOT_MAX_AUC)

# Hide unused subplots (the range is empty when the grid is completely filled)
for idx in range(n_plots, n_rows * n_cols):
    axes[idx // n_cols][idx % n_cols].set_visible(False)

plot_path = f"{resultsFolder}{k_folds}foldCV_results_kNN_vs_Rand_Augment.png"
# Save through the figure object rather than pyplot's implicit "current figure".
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
print(f"Plot saved: {plot_path}")

In [None]:
# ==========================================
# 7. SAVE AUC ROC RESULTS
# ==========================================
# One record per reported AUC: the no-augmentation baseline, every k-NN
# (k, fraction) setting that has a result, and every random-augmentation repeat.
results_list = []
for vec in vectors:
    baseline = final_results[vec][0][0.0]
    dims = vector_dims[vec]
    # Leading columns shared by all records of this vectorizer.
    shared = {"Vectorizer": vec, "Dims": dims}

    results_list.append(
        {**shared, "Method": "NoAug", "Frac": 0.0, "k": 0, "Run": 0, "AUC": baseline, "Delta": 0.0}
    )

    # k-NN rows, ordered by k and then by fraction; settings without a result are skipped.
    results_list.extend(
        {**shared, "Method": "kNN", "Frac": frac, "k": k, "Run": 0, "AUC": auc, "Delta": auc - baseline}
        for k in k_neighbors_list
        for frac in augmentation_fractions
        for auc in (final_results[vec][k][frac],)
        if auc is not None
    )

    # Random-augmentation rows; runs are numbered from 1.
    results_list.extend(
        {**shared, "Method": "Rand", "Frac": max_frac, "k": 0, "Run": run, "AUC": auc, "Delta": auc - baseline}
        for run, auc in enumerate(baseline_results[vec], start=1)
    )

df_res = pd.DataFrame(results_list)
csv_path = f"{resultsFolder}{k_folds}foldCV_results_kNN_vs_Rand_Augment.csv"
df_res.to_csv(csv_path, index=False)
print(f"CSV saved: {csv_path}")
print("\nEXPERIMENT COMPLETE")