In [None]:
# Install the notebook's dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter, whereas !pip uses whatever `pip` is on PATH).
%pip install -q datasets==2.16.0 huggingface-hub==0.34.0 scikit-learn jenga pandas numpy setuptools nltk category_encoders ftfy cleanlab seaborn

## Create Directories

In [None]:
import os

# Resolve the project root. When the kernel was started inside a folder whose
# own name contains "notebook", the project root is that folder's parent;
# otherwise the current working directory is taken as the root.
current_dir = os.getcwd()
# Only the last path component is inspected: matching the substring against
# the whole path would also fire for any ancestor directory that happens to
# contain "notebook", and could then climb another level when the cell is re-run.
if 'notebook' in os.path.basename(current_dir):
    BASE_DIR = os.path.dirname(current_dir)
else:
    BASE_DIR = current_dir

# All relative paths used later are resolved against the project root.
os.chdir(BASE_DIR)

# Output folders (kept with a trailing slash, as the rest of the notebook expects)
RESULTS_DIR = os.path.join(BASE_DIR, "results/")
FIGURES_DIR = os.path.join(BASE_DIR, "figures/")
DATA_DIR = os.path.join(BASE_DIR, "data/")

# Create folder structure
for _output_dir in (RESULTS_DIR, FIGURES_DIR, DATA_DIR):
    os.makedirs(_output_dir, exist_ok=True)

print("Folder structure created")

Folder structure created


## Imports

### Import libraries

In [None]:
# Import libraries

import os, sys
import pandas as pd
import numpy as np
from datetime import datetime
import gc
import time
from contextlib import contextmanager

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error
from scipy.stats import ttest_rel, ks_2samp, norm
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
import ast
from datasets import load_dataset

import sklearn
sklearn.set_config(enable_metadata_routing=True)

import warnings
warnings.filterwarnings("ignore")

### Import corruption & cleaning modules


In [None]:
sys.path.append(os.path.join(BASE_DIR, "python_scripts/"))

# Import unified corruption functions
import corruptions as corrupt

# Import unified cleaning functions
import cleaning_functions as clean

print("Imported corruption & cleaning modules")

Imported corruption & cleaning modules


## Building the Model

### Configurations

In [None]:
RANDOM_STATE = 42
TEST_SIZE = 0.20
SAMPLES_PER_CLASS = 20000
EXPECTED_TOTAL = SAMPLES_PER_CLASS * 5

### Preprocessing and Model Functions

In [None]:
# Feature extraction: two TF-IDF views of the same text, concatenated side by side.

# Word view: whitespace tokens (case preserved), unigrams + bigrams.
_word_tfidf = TfidfVectorizer(
    tokenizer=str.split,
    lowercase=False,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    max_features=10000,
    sublinear_tf=True,
)

# Character view: 3- and 4-character n-grams.
_char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 4),
    min_df=10,
    max_features=5000,
    sublinear_tf=True,
)

text_features = FeatureUnion([
    ("word_tfidf", _word_tfidf),
    ("char_tfidf", _char_tfidf),
])

# logistic regression model with sample weights support
def build_model():
    """Build a fresh, unfitted text-classification pipeline.

    The pipeline consists of the dual TF-IDF feature union followed by a
    logistic regression that requests ``sample_weight`` during ``fit``
    (metadata routing is enabled globally in the imports cell).

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline that shares no estimator
        instances with any previously built pipeline.
    """
    # Local import: keeps this cell runnable without touching the imports cell.
    from sklearn.base import clone

    return Pipeline([
        # Pipeline does not copy its steps, so passing the module-level
        # `text_features` directly would make every model share (and re-fit)
        # the same vectorizers. Clone it to give each model its own copy.
        ("features", clone(text_features)),
        ("clf", LogisticRegression(
            max_iter=2000,
            C=1.0,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ).set_fit_request(sample_weight=True)),
    ])

## Utility Functions

In [None]:
def clear_memory():
    """Run the garbage collector; the number of collected objects is discarded."""
    gc.collect()

@contextmanager
def timer():
    """Context manager that times the enclosed block.

    Yields:
        A zero-argument callable returning elapsed seconds. While the block is
        still running it returns the time elapsed so far; once the block has
        exited (normally or through an exception) it always returns the
        block's total duration, no matter how much later it is called.

    Example:
        with timer() as t:
            work()
        print(t())  # duration of work(), stable across repeated calls
    """
    start = time.perf_counter()
    end = None  # set when the block exits; None means "still running"

    def elapsed():
        stop = time.perf_counter() if end is None else end
        return stop - start

    try:
        yield elapsed
    finally:
        # Freeze the stopwatch so later reads do not include unrelated work.
        end = time.perf_counter()

def prepare_baseline_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with a stable ``row_id`` column.

    The input frame is left untouched. In the returned frame:
      * ``text`` is cast to ``str``;
      * ``label`` is parsed as a number, rows whose label cannot be parsed
        are dropped, and the remaining labels are cast to ``int``;
      * the index is reset to 0..n-1 and mirrored into ``row_id`` so that
        fixed train/test splits can be expressed as sets of row ids.
    """
    prepared = df.assign(
        text=df["text"].astype(str),
        label=pd.to_numeric(df["label"], errors="coerce"),
    )
    prepared = (
        prepared
        .dropna(subset=["label"])
        .astype({"label": int})
        .reset_index(drop=True)
    )
    prepared["row_id"] = prepared.index
    return prepared

def compute_metrics(y_true, y_pred):
    """Score predictions and return the results as plain Python floats.

    Returns a dict with the keys ``accuracy``, ``precision``, ``recall``,
    ``f1`` (the last three weighted-averaged, with undefined values reported
    as 0) and ``mae``. The MAE is computed after clipping the predictions to
    the range [1, 5].
    """
    weighted = {"average": "weighted", "zero_division": 0}
    clipped_pred = np.clip(y_pred, 1, 5)

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, **weighted),
        "recall": recall_score(y_true, y_pred, **weighted),
        "f1": f1_score(y_true, y_pred, **weighted),
        "mae": mean_absolute_error(y_true, clipped_pred),
    }
    return {name: float(value) for name, value in scores.items()}

## Loading the Data

In [None]:
print("Loading the raw dataset...")
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally; confirm the source is trusted before re-running.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)

# The rest of the notebook expects the target column to be called 'label';
# rename() leaves the frame unchanged when there is no 'rating' column.
df_raw = dataset["full"].to_pandas().rename(columns={'rating': 'label'})
print(f"Loaded: {len(df_raw):,} rows")

Loading the raw dataset...
Loaded: 701,528 rows


In [None]:
# Build a class-balanced pool: at most SAMPLES_PER_CLASS rows per label value.
# Each group is sampled explicitly instead of through groupby(...).apply(...):
# apply() operating on the grouping column is deprecated in recent pandas, and
# the 'label' column must survive because it is used right below and downstream.
# The per-group seed (9) is kept as-is so the sampled pool stays the same.
df_raw = pd.concat(
    [
        label_group.sample(n=min(len(label_group), SAMPLES_PER_CLASS), random_state=9)
        for _, label_group in df_raw.groupby('label')
    ]
)

# Persist the sampled pool so later runs can inspect exactly what was used.
baseline_data_path = os.path.join(DATA_DIR, "baseline_data.csv")
df_raw.to_csv(baseline_data_path, index=False)
print(f"Sampled to Balanced Pool: {len(df_raw):,} total rows")
print(f"Class Distribution: {df_raw['label'].value_counts().to_dict()}")

Sampled to Balanced Pool: 100,000 total rows
Class Distribution: {1.0: 20000, 2.0: 20000, 3.0: 20000, 4.0: 20000, 5.0: 20000}


## Training and Evaluation Function

In [None]:
def train_and_evaluate(df_train, df_test, train_ids, test_ids):
    """Train a fresh model on the train split and score it on the test split.

    Args:
        df_train: frame with at least ``row_id``, ``text`` and ``label``; an
            optional ``sample_weight`` column is forwarded to ``fit``.
        df_test: frame with at least ``row_id``, ``text`` and ``label``.
        train_ids: row ids that belong to the training split.
        test_ids: row ids that belong to the test split.

    Returns:
        ``(metrics, stats)``. ``metrics`` is the dict from ``compute_metrics``;
        ``stats`` holds ``train_total``/``test_total`` (rows selected by id)
        and ``train_valid``/``test_valid`` (rows that also have a numeric
        label and were actually used). When fewer than 20 usable training
        rows or 10 usable test rows remain, returns
        ``(None, {"error": "insufficient_data"})``.
    """
    train_split = df_train[df_train["row_id"].isin(train_ids)]
    test_split = df_test[df_test["row_id"].isin(test_ids)]

    # Labels that cannot be parsed become NaN; casting NaN to int raises, so
    # those rows are excluded here instead of crashing the whole experiment.
    train_labels = pd.to_numeric(train_split["label"], errors="coerce")
    test_labels = pd.to_numeric(test_split["label"], errors="coerce")
    train_ok = train_labels.notna().to_numpy()
    test_ok = test_labels.notna().to_numpy()
    train_valid = train_split[train_ok]
    test_valid = test_split[test_ok]

    if len(train_valid) < 20 or len(test_valid) < 10:
        return None, {"error": "insufficient_data"}

    X_train = train_valid["text"].astype(str).values
    y_train = train_labels[train_ok].astype(int).values
    X_test = test_valid["text"].astype(str).values
    y_test = test_labels[test_ok].astype(int).values

    # sample weights (required for model-aware cleaning): used only when the
    # training frame carries a 'sample_weight' column; taken from the filtered
    # rows so they stay aligned with X_train / y_train
    weights = None
    if "sample_weight" in train_valid.columns:
        weights = train_valid["sample_weight"].values

    model = build_model()
    if weights is not None:
        model.fit(X_train, y_train, sample_weight=weights)
    else:
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # basic ML evaluation metrics
    metrics = compute_metrics(y_test, y_pred)
    stats = {
        "train_total": len(train_split),
        "train_valid": len(train_valid),
        "test_total": len(test_split),
        "test_valid": len(test_valid),
    }

    # release the large intermediates before the next run
    del model, X_train, y_train, X_test, y_test
    del train_split, test_split, train_valid, test_valid
    clear_memory()

    return metrics, stats

## Running the Model on Baseline Data

### Prepare baseline data

In [None]:
# Clean copy of the sampled pool: text as str, integer labels, rows with
# non-numeric labels dropped, and a row_id column used for the fixed split.
df_baseline = prepare_baseline_data(df_raw)

### Run the model on baseline data

In [None]:
# create fixed train/test split (stratified by label, seeded for reproducibility)
df_train_clean, df_test_clean = train_test_split(
    df_baseline, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df_baseline["label"]
)
# Only the row ids are carried forward, so every later experiment can be
# evaluated on exactly the same rows.
train_ids = set(df_train_clean["row_id"].tolist())
test_ids = set(df_test_clean["row_id"].tolist())
print(f"Split: {len(df_train_clean):,} train / {len(df_test_clean):,} test")

# train and evaluate on baseline data
print("Training the model on baseline data...")
with timer() as t:
    metrics_base, stats_base = train_and_evaluate(df_baseline, df_baseline, train_ids, test_ids)
# Read the stopwatch once, right after the block, so the printed and the saved
# value are the same number and do not include the time spent printing.
baseline_eval_time = t()

# train_and_evaluate returns (None, {"error": ...}) when too little data is left
if metrics_base is None:
    raise RuntimeError(
        f"Baseline evaluation failed: {stats_base.get('error', 'unknown error')}"
    )

baseline_acc = metrics_base['accuracy']
print(f"Baseline Accuracy: {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"MAE: {metrics_base['mae']:.4f}")
print(f"Evaluation took: {baseline_eval_time:.2f} seconds")

# save baseline results
baseline_info = {
    "timestamp": datetime.now().isoformat(),
    "dataset_size": len(df_baseline),
    "train_size": len(train_ids),
    "test_size": len(test_ids),
    "baseline_accuracy": baseline_acc,
    "eval_time": baseline_eval_time,
    **metrics_base
}

baseline_results_path = os.path.join(RESULTS_DIR, "baseline_results.csv")
pd.DataFrame([baseline_info]).to_csv(baseline_results_path, index=False)

Split: 80,000 train / 20,000 test
Training the model on baseline data...
Baseline Accuracy: 0.5152 (51.52%)
MAE: 0.6400
Evaluation took: 177.09 seconds


### Corrupt the data -> Train and Evaluate -> Clean -> Re-train and evaluate

In [None]:
def _recovery_stats(reference_acc, corrupted_acc, cleaned_acc):
    """Compare a cleaned run with the corrupted run of the same experiment.

    Returns ``(acc_delta, acc_improv_pct, recovery_pct)`` where ``acc_delta``
    is cleaned minus corrupted accuracy, ``acc_improv_pct`` is that delta
    relative to the corrupted accuracy, and ``recovery_pct`` is the share of
    the accuracy lost to corruption (reference minus corrupted) that cleaning
    won back. All three are 0.0 when no corrupted accuracy is available.
    """
    if corrupted_acc is None:
        return 0.0, 0.0, 0.0
    acc_delta = cleaned_acc - corrupted_acc
    acc_drop = reference_acc - corrupted_acc
    acc_improv_pct = (acc_delta / corrupted_acc * 100) if corrupted_acc > 0 else 0.0
    recovery_pct = (acc_delta / acc_drop * 100) if acc_drop != 0 else 0.0
    return acc_delta, acc_improv_pct, recovery_pct


def run_batch_experiments(batch_id, corruption_list):
    """Run corruption, cleaning, and evaluation for a batch of experiments.

    For every ``(exp_id, corrupt_func, kwargs)`` entry the baseline data is
    corrupted and saved to ``DATA_DIR``; the corrupted data is then evaluated
    as-is (level 0) and after each cleaning strategy (levels 1-4), always on
    the fixed ``train_ids`` / ``test_ids`` split.

    Args:
        batch_id: label used in the log output and in the results file name.
        corruption_list: iterable of ``(exp_id, corrupt_func, kwargs)`` tuples;
            ``corrupt_func(df, **kwargs)`` must return the corrupted frame.

    Returns:
        pandas.DataFrame with one row per successfully evaluated
        (experiment, cleaning strategy) pair. The same table is written to
        ``RESULTS_DIR/<batch_id>_results.csv``.
    """
    # Each strategy maps a corrupted frame to (cleaned_frame, meta_dict).
    # The list does not depend on the experiment, so it is built once.
    cleaning_strategies = [
        ("0_Corrupted", lambda d: (d, {"name": "No Cleaning", "structural_drop": 0, "strategy_drop": 0})),
        ("1_Basic", lambda d: clean.clean_basic(d)),
        ("2_Heuristic", lambda d: clean.clean_heuristic(d)),
        ("3_Semantic", lambda d: clean.clean_semantic(d)),
        # a new model per call, so nothing is shared between experiments
        ("4_ModelAware", lambda d: clean.clean_model_aware(d, build_model())),
    ]

    corruption_list = list(corruption_list)
    batch_results = []
    for idx, (exp_id, corrupt_func, kwargs) in enumerate(corruption_list, start=1):
        print("\n----------------------------------------------------------")
        print(f"[{idx}/{len(corruption_list)}] {exp_id} in {batch_id}")
        print("----------------------------------------------------------\n")

        # Corrupt
        with timer() as t_corrupt:
            corrupted_df = corrupt_func(df_baseline.copy(), **kwargs)
        corrupt_time = t_corrupt()
        print(f"Corruption took: {corrupt_time:.2f} seconds")

        corrupted_data_baseline = os.path.join(DATA_DIR, f"{exp_id}_data.csv")
        corrupted_df.to_csv(corrupted_data_baseline, index=False)

        n_before = len(corrupted_df)
        # accuracy of the uncleaned run; stays None if that run was skipped
        level_0_acc = None

        for clean_idx, (clean_name, clean_func) in enumerate(cleaning_strategies):
            print(f"\n  >>> {clean_name}")
            if clean_idx > 0:
                print("Cleaning...")
                with timer() as t_clean:
                    cleaned_df, meta = clean_func(corrupted_df.copy())
                cleaning_time = t_clean()
            else:
                # level 0: no cleaning strategy is applied
                cleaning_time = 0
                cleaned_df, meta = clean_func(corrupted_df.copy())

                # drop rows where label is not numeric to avoid a crash in training
                cleaned_df['label'] = pd.to_numeric(cleaned_df['label'], errors='coerce')
                initial_len = len(cleaned_df)
                cleaned_df = cleaned_df.dropna(subset=['label'])
                meta['structural_drop'] = initial_len - len(cleaned_df)
            print(f"Cleaning took: {cleaning_time:.2f} seconds")

            # cleaning stats
            if meta.get("dropped", 0) > 0:
                print(f"Dropped: {meta['dropped']} rows")
            if meta.get("modified", 0) > 0:
                print(f"Modified: {meta['modified']} rows")

            print(f"Current size: {len(cleaned_df):,} rows")

            # evaluate
            print("Training & evaluating...")
            with timer() as t_eval:
                metrics, stats = train_and_evaluate(cleaned_df, cleaned_df, train_ids, test_ids)
            eval_time = t_eval()
            print(f"Evaluation took: {eval_time:.2f} seconds")

            if metrics is None:
                print("Skipped (insufficient data)")
                continue

            current_acc = metrics['accuracy']

            # Recompute the comparison numbers for every run so that values
            # from a previous strategy or experiment can never leak into this
            # one (e.g. when the level-0 run of this experiment was skipped).
            if clean_idx == 0:
                level_0_acc = current_acc
                acc_delta, acc_improv_pct, recovery_pct = 0.0, 0.0, 0.0
            else:
                acc_delta, acc_improv_pct, recovery_pct = _recovery_stats(
                    baseline_acc, level_0_acc, current_acc
                )

            print(f"Acc: {current_acc:.4f} | F1: {metrics['f1']:.4f} | MAE: {metrics['mae']:.4f}")
            if clean_idx > 0:
                if level_0_acc is None:
                    print("Corrupted reference run unavailable; change and recovery reported as 0")
                print(f"Accuracy Change vs Corrupt: {acc_delta:.4f} ({acc_improv_pct:.2f}%)")
                print(f"Recovery from baseline: {recovery_pct:.1f}%")

            # 'stats' (from eval), 'metrics' (performance), and 'meta' (from cleaning functions)
            batch_results.append({
                "batch": batch_id,
                "experiment": exp_id,
                "cleaning_name": clean_name,
                "cleaning_num": clean_idx,
                **stats, **metrics,
                "baseline_acc": baseline_acc,
                "level_0_acc": level_0_acc or 0,
                "recovery_pct": recovery_pct,
                "corrupt_time": corrupt_time,
                "cleaning_time": cleaning_time,
                "eval_time": eval_time,
                "n_before": n_before,
                "n_after": len(cleaned_df),
                **meta
            })

        del corrupted_df
        clear_memory()

    # Save and return
    batch_df = pd.DataFrame(batch_results)
    batch_path = os.path.join(RESULTS_DIR, f"{batch_id}_results.csv")
    batch_df.to_csv(batch_path, index=False)
    print(f"\n{batch_id} complete! Results saved.")
    return batch_df

In [None]:
# Batch 1
# Each entry is (experiment id, corruption function, keyword arguments passed
# to that function); the id is used in log output and output file names.
batch1_corruptions = [
    ("01_missing_text", corrupt.apply_missing_text, {}),
    ("02_broken_chars", corrupt.apply_broken_characters, {}),
    ("03_swapped_text", corrupt.apply_swapped_text, {}),
    ("04_missing_labels", corrupt.apply_missing_labels, {}),
]

# Runs every corruption against all cleaning strategies and writes
# <RESULTS_DIR>/batch1_results.csv
batch1_df = run_batch_experiments("batch1", batch1_corruptions)


----------------------------------------------------------
[1/4] 01_missing_text in batch1
----------------------------------------------------------

Corruption took: 0.14 seconds

  >>> 0_Corrupted
Cleaning took: 0.00 seconds
Current size: 100,000 rows
Training & evaluating...
Evaluation took: 106.72 seconds
Acc: 0.4177 | F1: 0.4021 | MAE: 1.0486

  >>> 1_Basic
Cleaning...
Cleaning took: 9.12 seconds
Current size: 100,000 rows
Training & evaluating...
Evaluation took: 97.75 seconds
Acc: 0.4209 | F1: 0.4059 | MAE: 1.0458
Accuracy Change vs Corrupt: 0.0032 (0.77%)
Recovery from baseline: 3.3%


  >>> 2_Heuristic
Cleaning...
Cleaning took: 5.50 seconds
Current size: 68,669 rows
Training & evaluating...
Evaluation took: 77.12 seconds
Acc: 0.5116 | F1: 0.5082 | MAE: 0.6399
Accuracy Change vs Corrupt: 0.0940 (22.50%)
Recovery from baseline: 96.3%


  >>> 3_Semantic
Cleaning...
Cleaning took: 24.09 seconds
Modified: 99331 rows
Current size: 100,000 rows
Training & evaluating...
Evaluation 

In [None]:
# Batch 2
# Same (experiment id, corruption function, keyword arguments) layout as
# batch 1; ids continue the numbering at 05.
batch2_corruptions = [
    ("05_swapped_labels", corrupt.apply_swapped_labels, {}),
    ("06_combined_broken_chars_missing_text", corrupt.apply_combined_broken_chars_missing_text, {}),
    ("07_combined_swap_text_labels", corrupt.apply_combined_swap_text_labels, {}),
    ("08_heavy_missing", corrupt.apply_heavy_missing, {}),
    ("09_all_corruptions", corrupt.apply_all_corruptions, {})
]

# Runs every corruption against all cleaning strategies and writes
# <RESULTS_DIR>/batch2_results.csv
batch2_df = run_batch_experiments("batch2", batch2_corruptions)


----------------------------------------------------------
[1/5] 05_swapped_labels in batch2
----------------------------------------------------------

Corruption took: 0.03 seconds

  >>> 0_Corrupted
Cleaning took: 0.00 seconds
Current size: 100,000 rows
Training & evaluating...
Evaluation took: 118.40 seconds
Acc: 0.4702 | F1: 0.4668 | MAE: 0.7694

  >>> 1_Basic
Cleaning...
Cleaning took: 11.27 seconds
Current size: 100,000 rows
Training & evaluating...
Evaluation took: 127.54 seconds
Acc: 0.4763 | F1: 0.4729 | MAE: 0.7609
Accuracy Change vs Corrupt: 0.0061 (1.31%)
Recovery from baseline: 13.6%


  >>> 2_Heuristic
Cleaning...
Cleaning took: 5.42 seconds
Current size: 98,533 rows
Training & evaluating...
Evaluation took: 115.15 seconds
Acc: 0.4722 | F1: 0.4688 | MAE: 0.7639
Accuracy Change vs Corrupt: 0.0020 (0.43%)
Recovery from baseline: 4.5%


  >>> 3_Semantic
Cleaning...
Cleaning took: 24.80 seconds
Modified: 99039 rows
Current size: 100,000 rows
Training & evaluating...
Evaluat

### Combining batch1 and batch2 results

In [None]:
# Reload both batch result files from disk (rather than relying on the
# in-memory frames) and stack them into a single results table.
batch1_path, batch2_path = (
    os.path.join(RESULTS_DIR, file_name)
    for file_name in ("batch1_results.csv", "batch2_results.csv")
)
batch1_df, batch2_df = pd.read_csv(batch1_path), pd.read_csv(batch2_path)

all_results_df = pd.concat((batch1_df, batch2_df), ignore_index=True)
combined_results_path = os.path.join(RESULTS_DIR, "all_results_combined.csv")
all_results_df.to_csv(combined_results_path, index=False)

print("Combined and saved results.")
print(f"Total evaluations: {len(all_results_df)}")

Combined and saved results.
Total evaluations: 45


## Analysis of Results


In [None]:
# Load the combined results from disk; every analysis cell below works on
# copies of this frame.
results_df = pd.read_csv(combined_results_path)
print(f"Loaded results DF with {len(results_df)} rows for analysis")

Loaded results DF with 45 rows for analysis


In [None]:
# Constants for analysis
def _ensure_dir(*parts):
    """Join ``parts`` into a path, create that directory if needed, return the path."""
    path = os.path.join(*parts)
    os.makedirs(path, exist_ok=True)
    return path


# Figure output folders, one per analysis family
PERFORMANCE_DIR = _ensure_dir(FIGURES_DIR, "performance_analysis/")
COMP_DIR = _ensure_dir(FIGURES_DIR, "computation_analysis/")
CLEANLAB_DIR = _ensure_dir(FIGURES_DIR, "cleanlab_analysis/")

# Folders for statistical tables and test outputs
STATS_DIR = _ensure_dir(RESULTS_DIR, "stats")
STAT_TABLES_DIR = _ensure_dir(STATS_DIR, "tables")
STAT_TESTS_DIR = _ensure_dir(STATS_DIR, "statistical_tests")

# Baseline accuracy as stored in the first results row
BASELINE_ACC = results_df['baseline_acc'].iat[0]

# Cleaning levels in display order; level 0 is the uncleaned, corrupted data
CLEANING_STRATEGIES_ORDER = ['1_Basic', '2_Heuristic', '3_Semantic', '4_ModelAware']
ALL_CLEANING_ORDER = ['0_Corrupted', *CLEANING_STRATEGIES_ORDER]
HEATMAP_COLS = ['0_Corrupted', *CLEANING_STRATEGIES_ORDER]

# Matplotlib settings shared by all figures
PLOT_STYLE = {
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
}
# seaborn style first, then the rcParams above on top of it
sns.set_style("whitegrid")
plt.rcParams.update(PLOT_STYLE)

In [None]:
# Helper function for plot saving
def save_plot(plot_type, plot_title, filename, **kwargs):
    """Finish the current matplotlib figure, write it to disk and close it.

    Args:
        plot_type: "performance" saves into PERFORMANCE_DIR, "computation"
            into COMP_DIR; any other value saves into CLEANLAB_DIR.
        plot_title: human-readable name, used only in the progress message.
        filename: file name (including extension) inside the target folder.
        **kwargs: forwarded unchanged to ``plt.savefig``.
    """
    print(f"Generating plot: {plot_title}")
    plt.tight_layout()

    if plot_type == "performance":
        target_dir = PERFORMANCE_DIR
    elif plot_type == "computation":
        target_dir = COMP_DIR
    else:
        target_dir = CLEANLAB_DIR

    plt.savefig(os.path.join(target_dir, filename), **kwargs)
    plt.close()
    print("Plot saved.")

### Corruption Impact Analysis on Performance

In [None]:
df = results_df.copy()

def interpret_damage(d):
    """Map an accuracy change to a severity label based on its magnitude.

    The sign of ``d`` is ignored. Magnitudes below 0.03 are "Minimal", below
    0.08 "Moderate", below 0.15 "Severe"; everything else is "Critical".
    """
    magnitude = abs(d)
    severity_bands = (
        (0.03, "Minimal"),
        (0.08, "Moderate"),
        (0.15, "Severe"),
    )
    for upper_bound, severity in severity_bands:
        if magnitude < upper_bound:
            return severity
    return "Critical"

print("CORRUPTION IMPACT ANALYSIS (Baseline vs. Corrupted)")

# Only the uncleaned runs show the pure effect of each corruption.
corruption_df = df[df['cleaning_name'] == '0_Corrupted'].copy()

# Damage = accuracy lost relative to the baseline model (absolute and in %)
corruption_df['Damage'] = corruption_df['baseline_acc'] - corruption_df['accuracy']
corruption_df['Damage_Pct'] = (corruption_df['Damage'] / corruption_df['baseline_acc']) * 100
corruption_df['Damage_Level'] = corruption_df['Damage'].apply(interpret_damage)

report = corruption_df[[
    'experiment', 'baseline_acc', 'accuracy', 'Damage', 'Damage_Pct', 'Damage_Level'
]].copy()

report.columns = ['Experiment', 'Original_Acc', 'Corrupted_Acc', 'Abs_Damage', 'Damage_%', 'Severity']

# sort by damage, most destructive first
report = report.sort_values(by='Abs_Damage', ascending=False)

print(report.to_string(index=False))

# Key Findings
print("\nKEY FINDINGS: NOISE SENSITIVITY")

# 04_missing_labels is left out of the most/least ranking (original note:
# "that causes a crash").
# NOTE(review): the report above does contain a row for it - confirm whether
# the exclusion is still required.
ranked = report[report['Experiment'] != '04_missing_labels']

if ranked.empty:
    # nothing to rank, e.g. when the results file holds no corrupted runs
    print("No corrupted runs available to rank.")
else:
    most_dmg = ranked.iloc[0]
    least_dmg = ranked.iloc[-1]

    print(f"Most Destructive Corruption: {most_dmg['Experiment']}")
    print(f"Accuracy dropped by {most_dmg['Damage_%']:.1f}% ({most_dmg['Severity']})")

    print(f"\nMost Resilient Corruption:  {least_dmg['Experiment']}")
    print(f"Accuracy dropped by {least_dmg['Damage_%']:.1f}% ({least_dmg['Severity']})")

# The mean is taken over every row of the report, so the message states the
# actual number of corruption types instead of a hard-coded count.
avg_dmg = report['Abs_Damage'].mean()
print(f"\nAverage Accuracy Drop across all {len(report)} corruption types: {avg_dmg:.4f}")

del df

CORRUPTION IMPACT ANALYSIS (Baseline vs. Corrupted)
                           Experiment  Original_Acc  Corrupted_Acc  Abs_Damage  Damage_% Severity
                      01_missing_text       0.51525       0.417650    0.097600 18.942261   Severe
                     08_heavy_missing       0.51525       0.430601    0.084649 16.428718   Severe
                      03_swapped_text       0.51525       0.441750    0.073500 14.264920 Moderate
                   09_all_corruptions       0.51525       0.461300    0.053950 10.470645 Moderate
         07_combined_swap_text_labels       0.51525       0.462400    0.052850 10.257157 Moderate
                    05_swapped_labels       0.51525       0.470150    0.045100  8.753033 Moderate
06_combined_broken_chars_missing_text       0.51525       0.479750    0.035500  6.889859 Moderate
                      02_broken_chars       0.51525       0.502700    0.012550  2.435711  Minimal
                    04_missing_labels       0.51525       0.515182

### Statistical Tests

In [None]:
# Work on a private copy restricted to the known cleaning levels.
analysis_df = results_df.copy()
analysis_df = analysis_df[analysis_df["cleaning_name"].isin(ALL_CLEANING_ORDER)].copy()

# TEST 1: Statistical improvement vs corrupted baseline
# check whether each cleaning method significantly improves accuracy compared to corrupted data
#
# Method: two-sided two-proportion z-test on the two accuracies.
# NOTE(review): both runs are evaluated on rows drawn from the same fixed test
# ids, so the samples are not independent and the p-values are approximate;
# a paired test would need per-row predictions, which are not stored.
stats_results = []

for exp, exp_data in analysis_df.groupby("experiment", sort=False):
    corrupted = exp_data[exp_data["cleaning_name"] == "0_Corrupted"]
    if corrupted.empty:
        # no reference run for this experiment, nothing to compare against
        continue

    acc_corrupt = corrupted["accuracy"].values[0]
    n_corrupt = corrupted["test_valid"].values[0]

    for strategy in CLEANING_STRATEGIES_ORDER:
        method = exp_data[exp_data["cleaning_name"] == strategy]
        if method.empty:
            continue

        acc_clean = method["accuracy"].values[0]
        n_clean = method["test_valid"].values[0]

        abs_gain = acc_clean - acc_corrupt
        recovery_pct = method["recovery_pct"].values[0]

        # Each proportion gets its own test-set size: cleaning can drop rows,
        # so the cleaned run may be scored on fewer rows than the corrupted one.
        se = np.sqrt(
            acc_corrupt * (1 - acc_corrupt) / n_corrupt
            + acc_clean * (1 - acc_clean) / n_clean
        )

        # norm.sf is the upper tail (1 - cdf) computed without cancellation,
        # so very small p-values do not collapse to exactly 0.
        p_value = 2 * norm.sf(abs(abs_gain / se)) if se > 0 else 1.0
        sig = "***" if p_value < 0.001 else ("**" if p_value < 0.01 else ("*" if p_value < 0.05 else "ns"))

        stats_results.append({
            "experiment": exp,
            "cleaning_method": strategy,
            "acc_corrupted": acc_corrupt,
            "acc_cleaned": acc_clean,
            "abs_gain": abs_gain,
            "recovery_pct": recovery_pct,
            "p_value": p_value,
            "significance": sig
        })

df_sig = pd.DataFrame(stats_results)
df_sig.to_csv(os.path.join(STAT_TESTS_DIR, "test1_significance_vs_corrupted.csv"), index=False)

# TEST 2: Best method per corruption
# identify which cleaning method most frequently achieves the highest accuracy
# analysis_df still contains the "0_Corrupted" rows, so the uncleaned run can
# be reported as the "best method" of an experiment.
# NOTE(review): GroupBy.first() picks the first non-null value per column, so
# if accuracy/f1/mae contain NaN, one output row may mix values from
# different runs - confirm the results never contain NaN here.
best = (
    analysis_df.sort_values(["experiment", "accuracy"], ascending=[True, False])
      .groupby("experiment", as_index=False)
      .first()
      [["experiment", "cleaning_name", "accuracy", "f1", "mae"]]
      .rename(columns={"cleaning_name": "best_method", "accuracy": "best_accuracy"})
)

# number of experiments in which each method came out on top
win_counts = best["best_method"].value_counts().reset_index()
win_counts.columns = ["cleaning_method", "wins"]
best.to_csv(os.path.join(STAT_TESTS_DIR, "test2_best_method_per_experiment.csv"), index=False)
win_counts.to_csv(os.path.join(STAT_TESTS_DIR, "test2_win_counts.csv"), index=False)

# TEST 3: Pairwise improvement (ModelAware vs corrupted and vs heuristic)
# Quantify the accuracy difference of 4_ModelAware against the corrupted run
# and against 2_Heuristic. Both comparison targets are hard-coded here; they
# are not derived from the win counts of TEST 2.
# wide_acc: one row per experiment, one column per cleaning level
wide_acc = analysis_df.pivot(index="experiment", columns="cleaning_name", values="accuracy")

impr = pd.DataFrame({
    "experiment": wide_acc.index,
    "acc_modelaware_vs_corrupted": wide_acc["4_ModelAware"] - wide_acc["0_Corrupted"],
    "acc_modelaware_vs_heuristic": wide_acc["4_ModelAware"] - wide_acc["2_Heuristic"],
}).reset_index(drop=True)
impr.to_csv(os.path.join(STAT_TESTS_DIR, "test3_accuracy_deltas.csv"), index=False)

# TEST 4: Metric consistency (accuracy, F1, MAE)
# ensuring improvements are not only for one metric
wide_f1  = analysis_df.pivot(index="experiment", columns="cleaning_name", values="f1")
wide_mae = analysis_df.pivot(index="experiment", columns="cleaning_name", values="mae")

# 1 = 4_ModelAware beats the corrupted run on that metric, 0 = it does not.
# Higher is better for accuracy and F1; lower is better for MAE.
consistency = pd.DataFrame({
    "experiment": wide_acc.index,
    "acc_improved": (wide_acc["4_ModelAware"] > wide_acc["0_Corrupted"]).astype(int),
    "f1_improved": (wide_f1["4_ModelAware"] > wide_f1["0_Corrupted"]).astype(int),
    "mae_improved": (wide_mae["4_ModelAware"] < wide_mae["0_Corrupted"]).astype(int),
})

# 0..3: how many of the three metrics improved
consistency["n_metrics_improved"] = consistency[["acc_improved", "f1_improved", "mae_improved"]].sum(axis=1)
consistency.to_csv(os.path.join(STAT_TESTS_DIR, "test4_metric_consistency.csv"), index=False)

# TEST 5: Cost vs benefit (accuracy gain per second)
# evaluate efficiency of cleaning
# NOTE(review): `t` is also the name of the timer handle in the baseline cell;
# after this cell it refers to a DataFrame instead.
t = analysis_df[analysis_df["cleaning_name"].isin(["0_Corrupted", "4_ModelAware"])]
# pivot without `values`: columns become a (metric, cleaning_name) MultiIndex
w = t.pivot(index="experiment", columns="cleaning_name")

cost = pd.DataFrame({
    "experiment": w.index,
    "acc_gain": w["accuracy"]["4_ModelAware"] - w["accuracy"]["0_Corrupted"],
    "clean_time": w["cleaning_time"]["4_ModelAware"],
    "eval_time": w["eval_time"]["4_ModelAware"],
})

cost["total_time"] = cost["clean_time"] + cost["eval_time"]
# a zero duration is turned into NaN so the ratio becomes NaN instead of inf
cost["gain_per_clean_sec"] = cost["acc_gain"] / cost["clean_time"].replace(0, np.nan)
cost["gain_per_total_sec"] = cost["acc_gain"] / cost["total_time"].replace(0, np.nan)
cost.to_csv(os.path.join(STAT_TESTS_DIR, "test5_cost_benefit.csv"), index=False)

# the working copy is no longer needed
del analysis_df


### Visual Analysis of Performance

In [None]:
PERFORMANCE_DIR = os.path.join(FIGURES_DIR, "performance_analysis")
os.makedirs(PERFORMANCE_DIR, exist_ok=True)

# styles
plt.rcParams.update(PLOT_STYLE)

analysis_df = results_df.copy()

analysis_df['data_loss_pct'] = ((analysis_df['n_before'] - analysis_df['n_after']) / analysis_df['n_before']) * 100
analysis_df['accuracy_gain'] = analysis_df['accuracy'] - analysis_df['level_0_acc']

# FIGURE 1: Corruption Damage Ranking (Bar Chart)
corruption_data = analysis_df[analysis_df['cleaning_name'] == '0_Corrupted'].copy()
corruption_data['Damage'] = corruption_data['baseline_acc'] - corruption_data['accuracy']
corruption_data = corruption_data.sort_values('accuracy', ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(corruption_data['experiment'], corruption_data['Damage'],
        color='salmon', edgecolor='black', alpha=0.8)

ax.set_xlabel('Accuracy Drop (Baseline - Corrupted)')
ax.set_title('Corruption Impact: Accuracy Degradation by Type')
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
save_plot("performance", "Corruption Damage Ranking", "01_corruption_damage_ranking.png")

# FIGURE 2: Average Model Accuracy in increasing order (Bar Chart)
# Mean accuracy per cleaning strategy (corrupted rows excluded), lowest first.
avg_acc = (
    analysis_df[analysis_df['cleaning_name'] != '0_Corrupted']
    .groupby('cleaning_name')['accuracy']
    .mean()
    .reset_index()
    .sort_values('accuracy', ascending=True)
)
# Mean accuracy of the un-cleaned runs; drawn as the red reference line.
avg_corruption_acc = analysis_df.loc[analysis_df['cleaning_name'] == '0_Corrupted', 'accuracy'].mean()

# One viridis shade per bar.
colors = plt.cm.viridis(np.linspace(0, 0.8, len(avg_acc)))

fig, ax = plt.subplots(figsize=(12, 7))
bars = ax.bar(
    avg_acc['cleaning_name'],
    avg_acc['accuracy'],
    color=colors,
    edgecolor='black',
    alpha=0.85,
)
ax.axhline(
    y=avg_corruption_acc,
    color='red',
    linestyle='--',
    linewidth=2.5,
    label=f'Avg Corrupted Acc ({avg_corruption_acc:.3f})',
    zorder=3,
)

# Write each bar's value just above its top edge.
for bar in bars:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    ax.text(bar_center, bar_top + 0.001, f'{bar_top:.3f}',
            ha='center', va='bottom',
            fontsize=10, fontweight='bold', color='black')

# Fixed, zoomed-in y range with major ticks every 0.01 and minor ticks every 0.0025.
# NOTE(review): the 0.40-0.52 window is hard-coded; bars outside it will be clipped.
ax.set_ylim(0.40, 0.52)
import matplotlib.ticker as ticker
ax.yaxis.set_major_locator(ticker.MultipleLocator(0.01))
ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.0025))

# Horizontal grid lines drawn behind the bars.
ax.grid(which='major', axis='y', linestyle='-', alpha=0.3, color='gray')
ax.grid(which='minor', axis='y', linestyle=':', alpha=0.2, color='gray')
ax.set_axisbelow(True)

ax.set_title('Detailed Strategy Effectiveness vs. Baseline', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Cleaning Strategy', fontsize=12, labelpad=10)
ax.set_ylabel('Average Model Accuracy', fontsize=12, labelpad=10)

ax.legend(frameon=True, shadow=True, loc='upper left')
save_plot("performance", "Average Model Accuracy", "02_average_model_accuracy.png")

# FIGURE 3: Strategy Comparison Heatmap
# Rows = corruption experiments, columns = cleaning strategies (in HEATMAP_COLS order),
# cells = accuracy.
df_heatmap = (
    analysis_df
    .pivot(index='experiment', columns='cleaning_name', values='accuracy')
    [HEATMAP_COLS]
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df_heatmap,
    ax=ax,
    annot=True,
    fmt='.3f',
    cmap='YlGn',
    linewidths=0.5,
    linecolor='white',
)
ax.set(
    title='Strategy Comparison: Accuracy Across Cleaning Methods',
    xlabel='Cleaning Strategy',
    ylabel='Corruption Type',
)
save_plot("performance", "Strategy Comparison Heatmap", "03_strategy_comparison_heatmap.png")

# FIGURE 4: Best Strategy Performance Ranking
# For every experiment keep the single row with the highest accuracy, best first.
best_row_labels = analysis_df.groupby('experiment')['accuracy'].idxmax()
best_per_exp = analysis_df.loc[best_row_labels].sort_values('accuracy', ascending=False)

# One Set2 colour per strategy that wins at least one experiment.
unique_strats = best_per_exp['cleaning_name'].unique()
colors = plt.get_cmap('Set2')(range(len(unique_strats)))
strat_color_map = dict(zip(unique_strats, colors))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    best_per_exp['experiment'],
    best_per_exp['accuracy'],
    color=[strat_color_map[name] for name in best_per_exp['cleaning_name']],
    edgecolor='black',
    alpha=0.8,
)

ax.axvline(x=BASELINE_ACC, color='blue', linestyle='--', linewidth=2, label=f'Baseline ({BASELINE_ACC:.2f})')
ax.set_title('Final Model Performance: Best Result per Corruption')
ax.set_xlabel('Highest Accuracy Achieved')
ax.set_xlim(analysis_df['accuracy'].min() * 0.9, BASELINE_ACC * 1.1)

# Hand-built legend: one patch per winning strategy plus a proxy line for the baseline.
legend_elements = [
    *(Patch(facecolor=strat_color_map[s], label=s) for s in unique_strats),
    plt.Line2D([0], [0], color='blue', linestyle='--', label='Baseline'),
]
ax.legend(handles=legend_elements, loc='lower right')

save_plot("performance", "Best Strategy Performance Ranking", "04_best_strategy_performance_ranking.png")

# FIGURE 5: Trade-off: Data Loss vs. Cleaning Effectiveness
# Only cleaned runs are plotted; the corrupted rows are filtered out.
df_plot = analysis_df[analysis_df['cleaning_name'] != '0_Corrupted']

plt.figure(figsize=(10, 7))
sns.scatterplot(
    data=df_plot,
    x='data_loss_pct',
    y="accuracy",
    hue='cleaning_name',
    style='cleaning_name',
    s=100,
)

# Reference line at the baseline accuracy.
plt.axhline(y=BASELINE_ACC, color='blue', linestyle='--', alpha=0.5, label='Baseline')

plt.title("Trade-off: Data Loss vs. Cleaning Effectiveness", fontweight='bold')
plt.xlabel('Data Loss (%)')
plt.ylabel('Model Accuracy')
# Legend is anchored outside the axes, to the right.
plt.legend(title='Strategy', bbox_to_anchor=(1.05, 1), loc='upper left')
save_plot("performance", "Trade-off Data Loss vs Cleaning Effectiveness", "05_data_loss_vs_effectiveness.png")

# FIGURE 6: Accuracy Gain Heatmap
# Accuracy per (experiment, strategy), restricted to and ordered by HEATMAP_COLS.
pivot_acc = analysis_df.pivot(index='experiment', columns='cleaning_name', values='accuracy')[HEATMAP_COLS]

# Relative gain of every strategy over the corrupted run of the same experiment,
# expressed in percent: (acc - corrupted) / corrupted * 100.
# The '0_Corrupted' column itself (always 0) is dropped.
corrupted_acc = pivot_acc['0_Corrupted']
delta_df = (pivot_acc.subtract(corrupted_acc, axis=0).divide(corrupted_acc, axis=0) * 100).drop(columns='0_Corrupted')

plt.figure(figsize=(12, 8))
# The colorbar label describes what is actually plotted: a relative percentage,
# not an absolute difference in accuracy points.
sns.heatmap(delta_df, annot=True, fmt='.3f', cmap='YlGn', linewidths=0.5, linecolor='white',
            cbar_kws={'label': 'Relative Accuracy Gain (% of Corrupted Accuracy)'})
plt.title('Strategy Effectiveness: Accuracy Gain Over Corrupted Baseline', fontsize=14, fontweight='bold', pad=20)
save_plot("performance", "Accuracy Gain Heatmap", "06_accuracy_gain_heatmap.png")

# FIGURE 7: Recovery Robustness
# Keep only real cleaning strategies, fix their display order, and clip the
# recovery percentage to [-20, 150] so outliers do not stretch the axis.
df_strategies = analysis_df[analysis_df['cleaning_name'].isin(CLEANING_STRATEGIES_ORDER)].assign(
    cleaning_name=lambda d: pd.Categorical(
        d['cleaning_name'], categories=CLEANING_STRATEGIES_ORDER, ordered=True
    ),
    recovery_pct_viz=lambda d: d['recovery_pct'].clip(lower=-20, upper=150),
)

plt.figure(figsize=(10, 6))
# Box plot with white square mean markers, overlaid with the individual points.
sns.boxplot(
    data=df_strategies,
    x='cleaning_name',
    y='recovery_pct_viz',
    palette='viridis',
    showmeans=True,
    meanprops={"marker":"s","markerfacecolor":"white", "markeredgecolor":"black"},
)
sns.stripplot(
    data=df_strategies,
    x='cleaning_name',
    y='recovery_pct_viz',
    color=".3",
    alpha=0.6,
    size=6,
)
plt.axhline(y=100, color='red', linestyle='--', label='100% Recovery')
plt.title('Strategy Recovery Success (Capped at 150%)', fontweight='bold')
plt.legend()
save_plot("performance", "Recovery Robustness", "07_recovery_robustness_boxplot.png")

# FIGURE 8: Absolute Accuracy Boxplot
plt.figure(figsize=(10, 6))

# Corrupted runs come first, followed by the cleaning strategies in their fixed order.
all_order = ['0_Corrupted'] + CLEANING_STRATEGIES_ORDER
plot_all_df = analysis_df[analysis_df['cleaning_name'].isin(all_order)].assign(
    cleaning_name=lambda d: pd.Categorical(d['cleaning_name'], categories=all_order, ordered=True)
)

# Distribution per strategy with the individual experiments swarmed on top.
sns.boxplot(data=plot_all_df, x='cleaning_name', y='accuracy', palette='Set2', showmeans=True)
sns.swarmplot(data=plot_all_df, x='cleaning_name', y='accuracy', color=".25", alpha=0.6)
plt.axhline(
    y=BASELINE_ACC,
    color='blue',
    linestyle='--',
    linewidth=2,
    label=f'Baseline ({BASELINE_ACC:.3f})',
)
plt.title('Performance Distribution: Absolute Accuracy', fontsize=14, fontweight='bold')
plt.legend()
save_plot("performance", "Accuracy Distribution", "08_accuracy_distribution_boxplot.png")

# FIGURE 9: Impact Bar Chart
# Un-cleaned runs only, with the accuracy lost versus the baseline, worst first.
df_corrupted = analysis_df[analysis_df['cleaning_name'] == '0_Corrupted'].copy()
df_corrupted['Drop'] = df_corrupted['baseline_acc'] - df_corrupted['accuracy']
df_corrupted = df_corrupted.sort_values('Drop', ascending=False)

# Long format: one row per (experiment, metric) so seaborn can draw paired bars.
melted_df = df_corrupted.melt(id_vars=['experiment', 'Drop'], value_vars=['baseline_acc', 'accuracy'],
                              var_name='Metrics', value_name='Accuracy')

plt.figure(figsize=(12, 7))
ax = sns.barplot(data=melted_df, x='experiment', y='Accuracy', hue='Metrics', palette=['#3498db', '#e74c3c'])

# Annotate each experiment with its signed accuracy change (corrupted - baseline).
# The sign comes from the value itself, so a corruption that happens to raise
# accuracy is shown as "+x.x%" instead of a malformed "--x.x%".
# The +0.2 x-offset is intended to sit over the second ('accuracy') bar of each pair.
for i, (drop, corrupted_acc) in enumerate(zip(df_corrupted['Drop'], df_corrupted['accuracy'])):
    ax.text(i + 0.2, corrupted_acc + 0.01, f"{-drop:+.1%}",
            ha='center', va='bottom', color='red', fontweight='bold', fontsize=9)

plt.xticks(rotation=45, ha='right')
plt.title('Impact of Corruption: Baseline vs. Corrupted', fontweight='bold')
save_plot("performance", "Impact of Corruption", "09_corruption_impact_bar_chart.png")

# FIGURE 10: Recovery Trends
# Experiments ordered from the largest to the smallest accuracy drop.
severity_order = (
    df_corrupted
    .sort_values('Drop', ascending=False)['experiment']
    .unique()
    .tolist()
)

# Cleaning strategies only; both axes get a fixed categorical order and the
# recovery percentage is clipped to [-20, 120] for display.
df_strat_trend = analysis_df[analysis_df['cleaning_name'].isin(CLEANING_STRATEGIES_ORDER)].assign(
    cleaning_name=lambda d: pd.Categorical(
        d['cleaning_name'], categories=CLEANING_STRATEGIES_ORDER, ordered=True
    ),
    experiment=lambda d: pd.Categorical(
        d['experiment'], categories=severity_order, ordered=True
    ),
    recovery_percent=lambda d: d['recovery_pct'].clip(lower=-20, upper=120),
)

plt.figure(figsize=(12, 6))
sns.lineplot(
    data=df_strat_trend,
    x='experiment',
    y='recovery_percent',
    hue='cleaning_name',
    marker='o',
    markersize=8,
    linewidth=2,
    palette='viridis',
)

plt.axhline(y=100, color='red', linestyle='--', alpha=0.6, label='Full Recovery (100%)')
plt.ylim(-30, 130)
plt.xticks(rotation=45, ha='right')
plt.title('Performance Recovery Trends Across Corruption Severity', fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
save_plot("performance", "Performance Recovery Trends", "10_performance_recovery_trends.png")

print("All performance analysis figures saved.")

# Drop the intermediate frames created in this cell, then release memory.
del (analysis_df, corruption_data, avg_acc, df_heatmap, best_per_exp, df_plot,
     delta_df, df_strategies, plot_all_df, df_corrupted, melted_df, df_strat_trend)
clear_memory()

Generating plot: Corruption Damage Ranking
Plot saved.
Generating plot: Average Model Accuracy
Plot saved.
Generating plot: Strategy Comparison Heatmap
Plot saved.
Generating plot: Best Strategy Performance Ranking
Plot saved.
Generating plot: Trade-off Data Loss vs Cleaning Effectiveness
Plot saved.
Generating plot: Accuracy Gain Heatmap
Plot saved.
Generating plot: Recovery Robustness
Plot saved.
Generating plot: Accuracy Distribution
Plot saved.
Generating plot: Impact of Corruption
Plot saved.
Generating plot: Performance Recovery Trends
Plot saved.
All performance analysis figures saved.


### Visual Analysis of Computation

In [None]:
from matplotlib.ticker import ScalarFormatter


# Output folder for the computation-analysis figures.
COMP_DIR = os.path.join(FIGURES_DIR, "computation_analysis")
os.makedirs(COMP_DIR, exist_ok=True)

# Derived copy of the results with timing totals, per-stage shares of the total
# runtime, and cleaning_name as an ordered categorical (ALL_CLEANING_ORDER).
analysis_df = results_df.assign(
    total_time=lambda d: d['corrupt_time'] + d['cleaning_time'] + d['eval_time'],
    cleaning_ratio=lambda d: d['cleaning_time'] / d['total_time'],
    eval_ratio=lambda d: d['eval_time'] / d['total_time'],
    corrupt_ratio=lambda d: d['corrupt_time'] / d['total_time'],
    cleaning_name=lambda d: pd.Categorical(
        d['cleaning_name'], categories=ALL_CLEANING_ORDER, ordered=True
    ),
)

# FIGURE 1: Time Breakdown Stacked Bar Chart
# Mean cleaning and evaluation time per strategy, stacked in one bar each.
avg_time = analysis_df.groupby('cleaning_name')[['cleaning_time', 'eval_time']].mean()

ax = avg_time.plot.bar(stacked=True, figsize=(10, 6), color=['#3498db', '#2ecc71'])
ax.set_title('Average Computational Time Breakdown per Strategy', fontweight='bold')
ax.set_xlabel('Cleaning Strategy')
ax.set_ylabel('Time (Seconds)')
plt.xticks(rotation=0)
ax.legend(['Cleaning Time', 'Eval Time'])
save_plot("computation", "Time Breakdown Stacked Bar Chart", "01_time_breakdown_stacked_bar.png")

# FIGURE 2: Total Pipeline Time Comparison
# One bar per strategy showing the mean end-to-end runtime (no error bars).
plt.figure(figsize=(10, 6))
sns.barplot(
    data=analysis_df,
    x='cleaning_name',
    y='total_time',
    palette='magma',
    errorbar=None,
)
plt.xlabel('Cleaning Strategy')
plt.ylabel('Total Time (Seconds)')
plt.title('Total Pipeline Runtime Comparison (End-to-End)', fontweight='bold')
save_plot("computation", "Total Pipeline Time Comparison", "02_total_pipeline_time.png")

# FIGURE 3: Accuracy vs. Time Trade-off (one labelled point per strategy, log time axis)
# Mean accuracy and mean total runtime per strategy.
df_summary = (
    analysis_df
    .groupby('cleaning_name')
    .agg({'accuracy': 'mean', 'total_time': 'mean'})
    .reset_index()
)
# The corrupted runs are not a strategy, so they are left out of the trade-off.
df_summary = df_summary[df_summary['cleaning_name'] != '0_Corrupted'].copy()
df_summary['label'] = df_summary['cleaning_name']

# Fastest strategy first so the connecting line runs left to right.
df_summary = df_summary.sort_values('total_time')

plt.figure(figsize=(10, 6))
# NOTE(review): sns.set_style changes the global style, so it also affects later figures.
sns.set_style({'axes.grid': True, 'grid.linestyle': '--'})

plt.xscale('log')
# Light dashed line joining the strategies in order of runtime.
plt.plot(df_summary['total_time'], df_summary['accuracy'],
         color='lightgrey', linestyle='--', linewidth=1.5, zorder=1)

colors = ['#4C72B0', '#55A868', '#C44E52', '#8172B3']
# Label placement: multiplicative shift in x (log axis), additive shift in y.
x_offset = 1.05
y_offset = 0.0008
for i, (idx, row) in enumerate(df_summary.iterrows()):
    point_color = colors[i % len(colors)]
    plt.scatter(row['total_time'], row['accuracy'], s=500,
                color=point_color, edgecolor='black',
                alpha=0.9, linewidth=1.5, zorder=2)
    plt.text(row['total_time'] * x_offset, row['accuracy'] + y_offset,
             row['label'], fontweight='bold', fontsize=11,
             verticalalignment='center')

# Plain numbers instead of powers of ten on the log axis.
# NOTE(review): the tick positions are hard-coded in seconds and assume runtimes of roughly 100-500 s.
plt.gca().xaxis.set_major_formatter(ScalarFormatter())
plt.xticks([100, 150, 200, 300, 400, 500])

plt.title('Accuracy vs. Time Trade-off', fontsize=16, fontweight='bold', pad=25)
plt.xlabel('Total Computational Time (Seconds)', fontsize=12)
plt.ylabel('Average Model Accuracy', fontsize=12)
sns.despine(left=True, bottom=True)
plt.grid(True, which="both", ls="--", alpha=0.4)
save_plot("computation", "Accuracy vs Time Trade-off", "03_accuracy_vs_time_tradeoff.png")

# FIGURE 4: Cleaning Overhead Ratio
# Local import: this is the only place in the cell that needs PercentFormatter.
from matplotlib.ticker import PercentFormatter

# Mean share of the total runtime spent corrupting, cleaning and evaluating, per strategy.
ratio_data = analysis_df.groupby('cleaning_name')[['corrupt_ratio', 'cleaning_ratio', 'eval_ratio']].mean()

ax = ratio_data.plot(kind='bar', stacked=True, figsize=(10, 6), color=['#95a5a6', '#3498db', '#2ecc71'])
plt.title('Computational Overhead Ratio: Where is the bottleneck?', fontweight='bold')
plt.ylabel('Percentage of Total Runtime')
plt.xlabel('Cleaning Strategy')
plt.xticks(rotation=0)
plt.legend(['Corrupt %', 'Cleaning %', 'Eval %'], loc='lower right')

# Show the 0-1 ratios as whole percentages. A formatter is attached to the axis
# instead of freezing label strings with set_yticklabels, so the labels always
# match the tick positions even if matplotlib recomputes the ticks when the
# figure is laid out or saved.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))

save_plot("computation", "Cleaning Overhead Ratios", "04_cleaning_overhead_ratios.png")

# FIGURE 5: Corruption-specific Time Sensitivity
# One line per strategy showing cleaning time for each corruption experiment,
# with experiments kept in the order they appear in the data (sort=False).
plt.figure(figsize=(12, 6))
sns.lineplot(data=analysis_df, x='experiment', y='cleaning_time', hue='cleaning_name',
             marker='o', markersize=8, linewidth=2, sort=False)

# Log-scaled time axis with ticks at 10 s and 100 s; the lower limit of 0.8
# keeps the axis strictly positive, the upper limit leaves 10% headroom.
plt.yscale('log')
plt.yticks([10, 100])
plt.ylim(0.8, analysis_df['cleaning_time'].max() * 1.1)

plt.title('Cleaning Time Sensitivity across Corruption Types (Log Scale)', fontweight='bold')
plt.xlabel('Corruption Experiment')
plt.ylabel('Cleaning Time (seconds, log scale)')
plt.xticks(rotation=45, ha='right')

plt.legend(title='Strategy', loc='upper right')

plt.tight_layout()
save_plot("computation", "Time Sensitivity per Corruption", "05_time_sensitivity_per_corruption.png")

print("\nAll computation analysis figures saved.")

# Drop the intermediate frames created in this cell, then release memory.
del analysis_df, avg_time, ratio_data, df_summary
clear_memory()

Generating plot: Time Breakdown Stacked Bar Chart
Plot saved.
Generating plot: Total Pipeline Time Comparison
Plot saved.
Generating plot: Accuracy vs Time Trade-off
Plot saved.
Generating plot: Cleaning Overhead Ratios
Plot saved.
Generating plot: Time Sensitivity per Corruption
Plot saved.

All computation analysis figures saved.


### Visual Analysis of Model-Aware Reweighting Using CleanLab

#### Combining Batch 1 and Batch 2 CleanLab Details

In [None]:
# Locations of the two per-batch CleanLab detail files and of the merged output.
batch1_cleanlab_path = os.path.join(RESULTS_DIR, "batch1_cleanlab_details.csv")
batch2_cleanlab_path = os.path.join(RESULTS_DIR, "batch2_cleanlab_details.csv")
combined_cleanlab_path = os.path.join(RESULTS_DIR, "all_cleanlab_stats_combined.csv")

# Load both batches and stack them into one frame with a fresh 0..n-1 index.
df_cleanlab1, df_cleanlab2 = (
    pd.read_csv(path) for path in (batch1_cleanlab_path, batch2_cleanlab_path)
)
df_cleanlab_combined = pd.concat([df_cleanlab1, df_cleanlab2], ignore_index=True)

# Persist the merged table next to the per-batch files.
df_cleanlab_combined.to_csv(combined_cleanlab_path, index=False)

print("Combined and saved Cleanlab details.")
print(f"Total cleanlab evaluations: {len(df_cleanlab_combined)}")

Combined and saved Cleanlab details.
Total cleanlab evaluations: 9


#### Analysis and Visualization of CleanLab Results

In [None]:
# Output folder for the CleanLab figures.
CLEANLAB_DIR = os.path.join(FIGURES_DIR, "cleanlab_analysis")
os.makedirs(CLEANLAB_DIR, exist_ok=True)

# Figure 1: Confidence Separation
plt.figure(figsize=(8, 6))

# Long format: one row per (experiment, confidence type) with readable type names.
confidence_labels = {
    'avg_conf_clean': 'Clean Samples',
    'avg_conf_noisy': 'Noisy/Issue Samples'
}
conf_melted = pd.melt(
    df_cleanlab_combined,
    id_vars=['experiment'],
    value_vars=['avg_conf_clean', 'avg_conf_noisy'],
    var_name='Confidence Type',
    value_name='Score',
)
conf_melted['Confidence Type'] = conf_melted['Confidence Type'].replace(confidence_labels)

# Box per confidence type with the individual experiments overlaid.
sns.boxplot(data=conf_melted, x='Confidence Type', y='Score', palette='Set2')
sns.stripplot(data=conf_melted, x='Confidence Type', y='Score', color=".3", alpha=0.5)
plt.title('Cleanlab Issue Detection: Predicted Confidence Separation', fontsize=14, fontweight='bold')
plt.xlabel('')
plt.ylabel('Mean Predicted Confidence')
save_plot("cleanlab", "Confidence Separation", "01_confidence_separation.png")

# Figure 2: Class-wise Noise Analysis
# Columns class_noise_1 .. class_noise_5, indexed by experiment and relabelled
# 'Class 1' .. 'Class 5' for display.
class_ids = range(1, 6)
noise_cols = [f'class_noise_{i}' for i in class_ids]
heatmap_data = (
    df_cleanlab_combined
    .set_index('experiment')[noise_cols]
    .set_axis([f'Class {i}' for i in class_ids], axis=1)
)

plt.figure(figsize=(10, 8))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.3f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Estimated Noise Proportion'},
)
plt.title('Class-Specific Noise Estimated by Cleanlab', fontsize=14, fontweight='bold')
plt.xlabel('Review Rating (Class)')
plt.ylabel('Corruption Experiment')
save_plot("cleanlab", "Class-wise Noise Heatmap", "02_class_wise_noise_heatmap.png")

# Figure 3: Reweighting Behavior Analysis
# One point per experiment: detected issue rate against the average sample weight.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_cleanlab_combined,
    x='issue_rate',
    y='avg_weight',
    hue='experiment',
    s=150,
    palette='tab10',
    alpha=0.8,
)

# Least-squares straight line through the points, drawn over the observed x range.
issue_rate = df_cleanlab_combined['issue_rate']
z = np.polyfit(issue_rate, df_cleanlab_combined['avg_weight'], 1)
p = np.poly1d(z)
x_range = np.linspace(issue_rate.min(), issue_rate.max(), 100)
plt.plot(x_range, p(x_range), "r--", alpha=0.5, label='General Trend')

plt.title('Sample Reweighting vs. Detected Noise (Issue Rate)', fontsize=14, fontweight='bold')
plt.xlabel('Detected Issue Rate (proportion)')
plt.ylabel('Average Sample Weight')
plt.grid(True, linestyle='--', alpha=0.6)
# Legend is anchored outside the axes, to the right.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Experiment')
save_plot("cleanlab", "Reweighting Behavior Analysis", "03_reweighting_behavior_analysis.png")

print("All CleanLab analysis figures saved.")

Generating plot: Confidence Separation
Plot saved.
Generating plot: Class-wise Noise Heatmap
Plot saved.
Generating plot: Reweighting Behavior Analysis
Plot saved.
All CleanLab analysis figures saved.


## Computing and Saving Relevant Statistics

### Performance Related

In [None]:
# Derived copy of the results: adds the end-to-end runtime and turns
# cleaning_name into an ordered categorical (ALL_CLEANING_ORDER).
df = results_df.assign(
    total_time=lambda d: d['corrupt_time'] + d['cleaning_time'] + d['eval_time'],
    cleaning_name=lambda d: pd.Categorical(
        d['cleaning_name'], categories=ALL_CLEANING_ORDER, ordered=True
    ),
)

# Table 1: Corruption Severity (Baseline vs Corrupted)
print("Generating Table 1: Corruption Severity...")
# Un-cleaned runs only, with the accuracy lost versus the baseline, worst first.
t1 = (
    df.loc[df['cleaning_name'] == '0_Corrupted', ['experiment', 'baseline_acc', 'accuracy']]
    .assign(accuracy_drop=lambda d: d['baseline_acc'] - d['accuracy'])
    .rename(columns={'accuracy': 'corrupted_acc'})
    .sort_values('accuracy_drop', ascending=False)
    .round(3)
)
file_name = os.path.join(STAT_TABLES_DIR, 'table_01_corruption_severity.csv')
t1.to_csv(file_name, index=False)

# Table 2: Average Cleaning Method Effectiveness
print("Generating Table 2: Method Effectiveness...")
# Mean accuracy, F1 and recovery percentage per cleaning strategy.
effectiveness_cols = ['accuracy', 'f1', 'recovery_pct']
t2 = df.groupby('cleaning_name')[effectiveness_cols].mean().round(3)
file_name = os.path.join(STAT_TABLES_DIR, 'table_02_method_effectiveness.csv')
t2.to_csv(file_name)

# 3. Table 3: Cleaning x Corruption Interaction
print("Generating Table 3: Interaction Analysis...")
# One wide table per metric (rows = experiments, columns = strategies). The
# column names are flattened to '<strategy>_<metric>' so the two tables can sit
# side by side in a single CSV.
t3_acc = df.pivot(index='experiment', columns='cleaning_name', values='accuracy')
t3_acc = t3_acc.set_axis([f"{col}_accuracy" for col in t3_acc.columns], axis=1)

t3_rec = df.pivot(index='experiment', columns='cleaning_name', values='recovery_pct')
t3_rec = t3_rec.set_axis([f"{col}_recovery" for col in t3_rec.columns], axis=1)

t3 = pd.concat([t3_acc, t3_rec], axis=1).round(3)
file_name = os.path.join(STAT_TABLES_DIR, 'table_03_cleaning_interaction.csv')
t3.to_csv(file_name)

# 4. Table 4: Computational Cost Summary
print("Generating Table 4: Computational Cost...")
# Mean time spent in each pipeline stage, and in total, per cleaning strategy.
timing_cols = ['corrupt_time', 'cleaning_time', 'eval_time', 'total_time']
t4 = df.groupby('cleaning_name')[timing_cols].mean().round(3)
file_name = os.path.join(STAT_TABLES_DIR, 'table_04_computational_cost.csv')
t4.to_csv(file_name)

# 5. Table 5: Data Impact Summary
print("Generating Table 5: Data Impact Summary...")
# Mean dataset size before and after cleaning, per strategy.
t5 = df.groupby('cleaning_name')[['n_before', 'n_after']].mean()
# Percentage of rows lost, computed from the unrounded means and kept at 2 decimals.
t5['pct_data_affected'] = (100 * (t5['n_before'] - t5['n_after']) / t5['n_before']).round(2)
# Round ONLY the two count columns to whole rows. Rounding the whole frame would
# also flatten pct_data_affected to 0 decimals and undo the 2-decimal rounding above.
t5 = t5.round({'n_before': 0, 'n_after': 0}).astype({'n_before': int, 'n_after': int})
file_name = os.path.join(STAT_TABLES_DIR, 'table_05_data_impact.csv')
t5.to_csv(file_name)

print(f"\nAnalysis Tables generated and saved.")

# Drop the intermediate frames created in this cell, then release memory.
del df, t1, t2, t3, t3_acc, t3_rec, t4, t5
clear_memory()

Generating Table 1: Corruption Severity...
Generating Table 2: Method Effectiveness...
Generating Table 3: Interaction Analysis...
Generating Table 4: Computational Cost...
Generating Table 5: Data Impact Summary...

Analysis Tables generated and saved.


### Data Related

In [None]:
# Work on a private copy so the helper columns added below never touch results_df.
df = results_df.copy()

def get_n_issues(stats_str):
    """Extract the ``n_issues`` count from one entry of the ``stats`` column.

    Parameters
    ----------
    stats_str : str, dict, or missing value
        Text form of a dict literal. Scalars inside it may be wrapped as
        ``np.int64(...)`` / ``np.float64(...)``; those wrappers are removed
        before parsing. An actual ``dict`` is read directly.

    Returns
    -------
    The value stored under ``'n_issues'``, or ``0`` when the input is missing,
    cannot be parsed as a dict literal, or has no such key.
    """
    import re  # local import: only this helper needs it

    if isinstance(stats_str, dict):
        return stats_str.get('n_issues', 0)
    if pd.isna(stats_str):
        return 0
    try:
        # Unwrap only the numpy scalar wrappers, e.g. "np.int64(5)" -> "5".
        # Other parentheses (such as tuples) are left intact so the literal still parses.
        unwrapped = re.sub(r'np\.(?:u?int|float)\d*\(([^()]*)\)', r'\1', stats_str)
        parsed = ast.literal_eval(unwrapped)
    except (TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
        # Best effort: an unparseable entry counts as "no issues recorded".
        return 0
    return parsed.get('n_issues', 0) if isinstance(parsed, dict) else 0

# Helper columns for the data-impact tables:
#   n_issues_cl   - issue count parsed from the 'stats' column
#   rows_removed  - rows dropped between n_before and n_after
#   rows_affected - the issue count for '4_ModelAware' rows, rows_removed otherwise
#   pct_affected  - rows_affected as a percentage of n_before
#   retention_pct - n_after as a percentage of n_before
df = df.assign(
    n_issues_cl=lambda d: d['stats'].apply(get_n_issues),
    rows_removed=lambda d: d['n_before'] - d['n_after'],
    rows_affected=lambda d: np.where(
        d['cleaning_name'] == '4_ModelAware',
        d['n_issues_cl'],
        d['rows_removed'],
    ),
    pct_affected=lambda d: (d['rows_affected'] / d['n_before']) * 100,
    retention_pct=lambda d: (d['n_after'] / d['n_before']) * 100,
)

# Show floats with two decimals whenever pandas displays them (global option).
pd.set_option('display.float_format', '{:.2f}'.format)

# Table 6: Dataset Size Before vs After Cleaning
print("Generating Table 6: Dataset Size Before vs After Cleaning ...")
# Mean row counts and mean affected rows per cleaning strategy.
t_d1 = (
    df.groupby('cleaning_name')
    .agg({'n_before': 'mean', 'n_after': 'mean', 'rows_affected': 'mean'})
    .reset_index()
    .rename(columns={'rows_affected': 'rows_affected_or_removed'})
)
file_name = os.path.join(STAT_TABLES_DIR, 'table_06_size_impact.csv')
t_d1.to_csv(file_name, index=False)

# Table 7: Percentage of Data Affected
print("Generating Table 7: Percentage of Data Affected ...")
# Mean percentage of affected rows per cleaning strategy.
t_d2 = df.groupby('cleaning_name')[['pct_affected']].mean().reset_index()
file_name = os.path.join(STAT_TABLES_DIR, 'table_07_pct_affected.csv')
t_d2.to_csv(file_name, index=False)

# Table 8: Corruption Severity vs Data Loss vs Accuracy
print("Generating Table 8: Corruption Severity vs Data Loss vs Accuracy ...")
# Per experiment: accuracy and percentage of affected rows of the un-cleaned run.
corruption_impact = (
    df.loc[df['cleaning_name'] == '0_Corrupted', ['experiment', 'accuracy', 'pct_affected']]
    .rename(columns={'accuracy': 'corrupted_accuracy', 'pct_affected': 'intrinsic_data_loss_pct'})
)

# Per experiment: mean accuracy over all cleaned runs.
avg_cleaned_acc = (
    df[df['cleaning_name'] != '0_Corrupted']
    .groupby('experiment')['accuracy']
    .mean()
    .reset_index()
)

# Join the two views on the experiment name.
t_d3 = (
    corruption_impact
    .merge(avg_cleaned_acc, on='experiment')
    .rename(columns={'accuracy': 'avg_cleaned_accuracy'})
)
file_name = os.path.join(STAT_TABLES_DIR, 'table_08_severity_vs_loss.csv')
t_d3.to_csv(file_name, index=False)

# Table 9: Data Retention vs Accuracy (Per Strategy)
print("Generating Table 9: Data Retention vs Accuracy (Per Strategy) ...")
# Mean retention and mean accuracy per strategy, most accurate strategy first.
t_d4 = (
    df.groupby('cleaning_name')
    .agg({'retention_pct': 'mean', 'accuracy': 'mean'})
    .reset_index()
    .sort_values(by='accuracy', ascending=False)
)
file_name = os.path.join(STAT_TABLES_DIR, 'table_09_retention_vs_accuracy.csv')
t_d4.to_csv(file_name, index=False)

print("\nData Analysis Tables generated and saved.")

# Drop the intermediate frames created in this cell, then release memory.
del df, t_d1, t_d2, t_d3, t_d4
clear_memory()

Generating Table 6: Dataset Size Before vs After Cleaning ...
Generating Table 7: Percentage of Data Affected ...
Generating Table 8: Corruption Severity vs Data Loss vs Accuracy ...
Generating Table 9: Data Retention vs Accuracy (Per Strategy) ...

Data Analysis Tables generated and saved.
