## 1Ô∏è‚É£ Setup

In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas numpy

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è Device: {device}")
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

üñ•Ô∏è Device: cuda
   GPU: NVIDIA A100-SXM4-40GB


In [3]:
MODEL_NAME = "distilbert-base-uncased"
SEED = 42
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NOISE_RATE = 0.10  # 10% label noise

np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## 2Ô∏è‚É£ Load Clean Data + Inject Noise

In [4]:
from google.colab import files
print("üìÅ Upload sst2_ci_demo_curated.csv:")
uploaded = files.upload()

üìÅ Upload sst2_ci_demo_curated.csv:


Saving sst2_ci_demo_curated.csv to sst2_ci_demo_curated.csv


In [5]:
df = pd.read_csv("sst2_ci_demo_curated.csv")
print(f"üìä Total samples: {len(df)}")

# Use base variants only for cleaner experiment
df = df[df["variant_id"] == "base"].reset_index(drop=True)
print(f"üìä Base samples: {len(df)}")

üìä Total samples: 2000
üìä Base samples: 500


In [6]:
# Label mapping
label2id = {"negative": 0, "positive": 1, "0": 0, "1": 1, 0: 0, 1: 1}
id2label = {0: "negative", 1: "positive"}
df["clean_label"] = df["true_label"].map(lambda x: label2id.get(x, label2id.get(str(x).lower(), 0)))

print(f"Label distribution (clean):")
print(df["clean_label"].value_counts())

Label distribution (clean):
clean_label
1    261
0    239
Name: count, dtype: int64


In [7]:
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# INJECT SYNTHETIC NOISE
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

n_to_flip = int(len(df) * NOISE_RATE)
flip_indices = np.random.choice(df.index, size=n_to_flip, replace=False)

df["noisy_label"] = df["clean_label"].copy()
df["is_flipped"] = False
df.loc[flip_indices, "noisy_label"] = 1 - df.loc[flip_indices, "clean_label"]
df.loc[flip_indices, "is_flipped"] = True

print(f"üî¥ Injected {NOISE_RATE*100:.0f}% label noise")
print(f"   Flipped: {n_to_flip} samples")
print(f"   Clean:   {len(df) - n_to_flip} samples")
print(f"\nNoisy label distribution:")
print(df["noisy_label"].value_counts())

üî¥ Injected 10% label noise
   Flipped: 50 samples
   Clean:   450 samples

Noisy label distribution:
noisy_label
1    265
0    235
Name: count, dtype: int64


## 3Ô∏è‚É£ Simulate CI Analysis on Noisy Data

In real usage, you'd run `ci analyze` then `ci curate relabel`.
Here we simulate by checking if CI's "dangerous" flag correlates with flipped labels.

In [8]:
# Check if original CI analysis caught the noisy samples
# (The CI analysis was done on CLEAN data, so this is a proxy)

print("CI 'dangerous' vs actual flipped labels:")
print(pd.crosstab(df["difficulty"] == "dangerous", df["is_flipped"],
                  rownames=["CI=dangerous"], colnames=["is_flipped"]))

CI 'dangerous' vs actual flipped labels:
is_flipped    False  True 
CI=dangerous              
False           439     49
True             11      1


In [9]:
# Since original CI was on clean data, let's use a simple heuristic:
# Train a quick model on noisy data, flag high-loss samples as "suspicious"

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Quick TF-IDF + LogReg to get prediction confidence
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["text"])
y_noisy = df["noisy_label"].values

# Cross-val predictions to avoid data leakage
lr = LogisticRegression(max_iter=1000, random_state=SEED)
probs = cross_val_predict(lr, X, y_noisy, cv=5, method="predict_proba")

# Confidence = max prob, low confidence = suspicious
df["probe_confidence"] = probs.max(axis=1)
df["probe_pred"] = probs.argmax(axis=1)

print("Probe model confidence distribution:")
print(df["probe_confidence"].describe())

Probe model confidence distribution:
count    500.000000
mean       0.545977
std        0.034336
min        0.500103
25%        0.517437
50%        0.539361
75%        0.567759
max        0.653731
Name: probe_confidence, dtype: float64


In [10]:
# Flag bottom 15% confidence as "suspicious" (CI-like heuristic)
SUSPICIOUS_THRESHOLD = df["probe_confidence"].quantile(0.15)
df["is_suspicious"] = df["probe_confidence"] < SUSPICIOUS_THRESHOLD

print(f"Suspicious threshold: {SUSPICIOUS_THRESHOLD:.3f}")
print(f"Flagged as suspicious: {df['is_suspicious'].sum()} ({100*df['is_suspicious'].mean():.1f}%)")

# How well does this catch flipped labels?
print("\nSuspicious vs actually flipped:")
ct = pd.crosstab(df["is_suspicious"], df["is_flipped"],
                 rownames=["suspicious"], colnames=["flipped"])
print(ct)

# Precision/Recall
if df["is_suspicious"].sum() > 0:
    precision = ct.loc[True, True] / ct.loc[True].sum() if True in ct.columns else 0
    recall = ct.loc[True, True] / ct[True].sum() if True in ct.columns else 0
    print(f"\nPrecision (of suspicious, % actually flipped): {precision:.1%}")
    print(f"Recall (of flipped, % caught): {recall:.1%}")

Suspicious threshold: 0.511
Flagged as suspicious: 75 (15.0%)

Suspicious vs actually flipped:
flipped     False  True 
suspicious              
False         382     43
True           68      7

Precision (of suspicious, % actually flipped): 9.3%
Recall (of flipped, % caught): 14.0%


## 4Ô∏è‚É£ Prepare Training Splits

In [11]:
# Split - use CLEAN labels for eval (we want to measure true performance)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df["clean_label"])

print(f"üìä Train: {len(train_df)}, Eval: {len(eval_df)}")
print(f"   Train flipped: {train_df['is_flipped'].sum()} ({100*train_df['is_flipped'].mean():.1f}%)")
print(f"   Train suspicious: {train_df['is_suspicious'].sum()} ({100*train_df['is_suspicious'].mean():.1f}%)")

üìä Train: 400, Eval: 100
   Train flipped: 40 (10.0%)
   Train suspicious: 60 (15.0%)


In [12]:
# BASELINE: Train on ALL noisy data
baseline_train = train_df.copy()
baseline_train["label"] = baseline_train["noisy_label"]  # Use noisy labels
print(f"üîµ Baseline: {len(baseline_train)} samples (all noisy data)")

üîµ Baseline: 400 samples (all noisy data)


In [13]:
# CURATED: Remove suspicious samples
curated_train = train_df[~train_df["is_suspicious"]].copy()
curated_train["label"] = curated_train["noisy_label"]  # Still noisy, but cleaner
n_removed = len(baseline_train) - len(curated_train)
n_flipped_removed = train_df[train_df["is_suspicious"]]["is_flipped"].sum()
print(f"üü¢ Curated: {len(curated_train)} samples ({n_removed} suspicious removed)")
print(f"   Of removed: {n_flipped_removed} were actually flipped ({100*n_flipped_removed/n_removed:.1f}% precision)")

üü¢ Curated: 340 samples (60 suspicious removed)
   Of removed: 5 were actually flipped (8.3% precision)


In [14]:
# ORACLE: Train on clean labels (upper bound)
oracle_train = train_df.copy()
oracle_train["label"] = oracle_train["clean_label"]  # Perfect labels
print(f"üü° Oracle: {len(oracle_train)} samples (clean labels - upper bound)")

üü° Oracle: 400 samples (clean labels - upper bound)


In [15]:
# Eval always uses clean labels
eval_df = eval_df.copy()
eval_df["label"] = eval_df["clean_label"]
print(f"üìä Eval: {len(eval_df)} samples (clean labels)")

üìä Eval: 100 samples (clean labels)


## 5Ô∏è‚É£ Tokenize

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_df(df):
    ds = Dataset.from_pandas(df[["text", "label"]].reset_index(drop=True))
    ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, max_length=128), batched=True)
    ds = ds.remove_columns(["text"])
    return ds

baseline_train_ds = tokenize_df(baseline_train)
curated_train_ds = tokenize_df(curated_train)
oracle_train_ds = tokenize_df(oracle_train)
eval_ds = tokenize_df(eval_df)

print("‚úÖ Tokenized")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

‚úÖ Tokenized


In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()[:, 1]
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="binary"),
        "auc": roc_auc_score(labels, probs) if len(np.unique(labels)) > 1 else 0.0
    }

def train_model(name, train_ds, eval_ds, output_dir):
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2, id2label=id2label, label2id={v:k for k,v in id2label.items()}
    )

    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        seed=SEED,
        logging_steps=50,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer, model

## 6Ô∏è‚É£ Train All Models

In [18]:
baseline_trainer, baseline_model = train_model(
    "üîµ BASELINE (noisy)", baseline_train_ds, eval_ds, "./baseline_model"
)


Training üîµ BASELINE (noisy)...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,No log,0.673477,0.52,0.684211,0.708734
2,0.664600,0.634208,0.6,0.71831,0.800881
3,0.664600,0.610538,0.68,0.757576,0.81851


In [19]:
curated_trainer, curated_model = train_model(
    "üü¢ CURATED (suspicious removed)", curated_train_ds, eval_ds, "./curated_model"
)


Training üü¢ CURATED (suspicious removed)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,No log,0.675084,0.52,0.68,0.709936
2,No log,0.643127,0.61,0.715328,0.765224
3,0.648900,0.616993,0.72,0.762712,0.788462


In [20]:
oracle_trainer, oracle_model = train_model(
    "üü° ORACLE (clean labels)", oracle_train_ds, eval_ds, "./oracle_model"
)


Training üü° ORACLE (clean labels)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,No log,0.667546,0.55,0.693878,0.723157
2,0.634800,0.57415,0.74,0.775862,0.833333
3,0.634800,0.523924,0.78,0.796296,0.859776


## 7Ô∏è‚É£ Final Results

In [21]:
print("üìä Final Evaluation")
print("="*50)

baseline_results = baseline_trainer.evaluate()
curated_results = curated_trainer.evaluate()
oracle_results = oracle_trainer.evaluate()

b_acc, c_acc, o_acc = baseline_results['eval_accuracy'], curated_results['eval_accuracy'], oracle_results['eval_accuracy']
b_f1, c_f1, o_f1 = baseline_results['eval_f1'], curated_results['eval_f1'], oracle_results['eval_f1']
b_auc, c_auc, o_auc = baseline_results['eval_auc'], curated_results['eval_auc'], oracle_results['eval_auc']

üìä Final Evaluation


In [22]:
print("\n" + "="*70)
print("üèÜ FINAL RESULTS: CI-Guided Curation on Noisy Data")
print("="*70)
print(f"Noise rate: {NOISE_RATE*100:.0f}% | Suspicious removed: {n_removed}")
print()

print(f"{'Metric':<15} {'Baseline':>12} {'Curated':>12} {'Oracle':>12} {'Cure vs Base':>12}")
print("-"*70)
print(f"{'Accuracy':<15} {b_acc:>12.4f} {c_acc:>12.4f} {o_acc:>12.4f} {c_acc-b_acc:>+12.4f} {'‚úÖ' if c_acc > b_acc else '‚ùå'}")
print(f"{'F1 Score':<15} {b_f1:>12.4f} {c_f1:>12.4f} {o_f1:>12.4f} {c_f1-b_f1:>+12.4f} {'‚úÖ' if c_f1 > b_f1 else '‚ùå'}")
print(f"{'AUC':<15} {b_auc:>12.4f} {c_auc:>12.4f} {o_auc:>12.4f} {c_auc-b_auc:>+12.4f} {'‚úÖ' if c_auc > b_auc else '‚ùå'}")
print("-"*70)

# Recovery rate: how much of the gap to oracle did curation close?
gap_closed_acc = (c_acc - b_acc) / (o_acc - b_acc) * 100 if o_acc != b_acc else 0
gap_closed_f1 = (c_f1 - b_f1) / (o_f1 - b_f1) * 100 if o_f1 != b_f1 else 0
gap_closed_auc = (c_auc - b_auc) / (o_auc - b_auc) * 100 if o_auc != b_auc else 0

print(f"\nüìà Gap to Oracle closed by curation:")
print(f"   Accuracy: {gap_closed_acc:+.1f}%")
print(f"   F1:       {gap_closed_f1:+.1f}%")
print(f"   AUC:      {gap_closed_auc:+.1f}%")

wins = sum([c_acc > b_acc, c_f1 > b_f1, c_auc > b_auc])
if wins == 3:
    print(f"\nüéâ VERDICT: CI CURATION WORKS ON NOISY DATA! ({wins}/3 improved)")
elif wins >= 2:
    print(f"\nüü° VERDICT: Partial improvement ({wins}/3 improved)")
else:
    print(f"\n‚ùå VERDICT: No clear improvement ({wins}/3 improved)")


üèÜ FINAL RESULTS: CI-Guided Curation on Noisy Data
Noise rate: 10% | Suspicious removed: 60

Metric              Baseline      Curated       Oracle Cure vs Base
----------------------------------------------------------------------
Accuracy              0.6800       0.7200       0.7800      +0.0400 ‚úÖ
F1 Score              0.7576       0.7627       0.7963      +0.0051 ‚úÖ
AUC                   0.8185       0.7885       0.8598      -0.0300 ‚ùå
----------------------------------------------------------------------

üìà Gap to Oracle closed by curation:
   Accuracy: +40.0%
   F1:       +13.3%
   AUC:      -72.8%

üü° VERDICT: Partial improvement (2/3 improved)


In [23]:
# Save results
import json
results = {
    "experiment": "synthetic_noise_v4",
    "noise_rate": NOISE_RATE,
    "suspicious_removed": n_removed,
    "baseline": {"accuracy": float(b_acc), "f1": float(b_f1), "auc": float(b_auc), "train_size": len(baseline_train)},
    "curated": {"accuracy": float(c_acc), "f1": float(c_f1), "auc": float(c_auc), "train_size": len(curated_train)},
    "oracle": {"accuracy": float(o_acc), "f1": float(o_f1), "auc": float(o_auc), "train_size": len(oracle_train)},
    "gap_closed": {"accuracy": gap_closed_acc, "f1": gap_closed_f1, "auc": gap_closed_auc}
}
with open("curation_noisy_results.json", "w") as f:
    json.dump(results, f, indent=2)
print("üíæ Saved results")
files.download("curation_noisy_results.json")

üíæ Saved results


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>