## 1️⃣ Setup

In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas numpy

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Device: {device}")
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

🖥️ Device: cuda
   GPU: NVIDIA A100-SXM4-40GB


In [3]:
MODEL_NAME = "distilbert-base-uncased"
SEED = 42
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NOISE_RATE = 0.10  # 10% label noise

np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## 2️⃣ Load Clean Data + Inject Noise

In [4]:
from google.colab import files
print("📁 Upload sst2_ci_demo_curated.csv:")
uploaded = files.upload()

📁 Upload sst2_ci_demo_curated.csv:


Saving sst2_ci_demo_curated.csv to sst2_ci_demo_curated.csv


In [5]:
# Load full dataset with CI scores
df_full = pd.read_csv("sst2_ci_demo_curated.csv")

# Get unique samples (base variants only)
df = df_full[df_full['variant_id'] == 'base'].copy().reset_index(drop=True)

# Create ci_dangerous flag from difficulty column
df['ci_dangerous'] = (df['difficulty'] == 'dangerous').astype(int)

print(f"📊 Dataset: {len(df)} samples")
print(f"   Labels: {df['true_label'].value_counts().to_dict()}")
print(f"   CI dangerous: {df['ci_dangerous'].sum()} ({100*df['ci_dangerous'].mean():.1f}%)")

📊 Dataset: 500 samples
   Labels: {'positive': 261, 'negative': 239}
   CI dangerous: 12 (2.4%)


In [6]:
# Label mapping
label2id = {"negative": 0, "positive": 1, "0": 0, "1": 1, 0: 0, 1: 1}
id2label = {0: "negative", 1: "positive"}
df['label'] = df['true_label'].map(lambda x: label2id.get(x, label2id.get(str(x).lower(), 0)))

# Split train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])
print(f"✅ Train: {len(train_df)}, Test: {len(test_df)}")

✅ Train: 400, Test: 100


In [7]:
# Create noisy version of training data
train_noisy = train_df.copy()

# Select random 10% to flip labels (binary flip: 0↔1)
n_flip = int(len(train_noisy) * NOISE_RATE)
flip_idx = np.random.choice(train_noisy.index, size=n_flip, replace=False)

train_noisy.loc[flip_idx, 'label'] = 1 - train_noisy.loc[flip_idx, 'label']

train_noisy['noisy_label'] = train_noisy['label']
train_noisy['original_label'] = train_df['label']
train_noisy['was_flipped'] = train_noisy['label'] != train_df['label']

print(f"💥 Noise injected:")
print(f"   Flipped: {train_noisy['was_flipped'].sum()} samples ({100*train_noisy['was_flipped'].mean():.1f}%)")
print(f"   CI dangerous in flipped: {train_noisy[train_noisy['was_flipped']]['ci_dangerous'].sum()}")
print(f"   CI dangerous total: {train_noisy['ci_dangerous'].sum()}")

💥 Noise injected:
   Flipped: 40 samples (10.0%)
   CI dangerous in flipped: 1
   CI dangerous total: 9


## 3️⃣ Use CI Dangerous Flags from CLI Analysis

The CSV already contains `difficulty` column from Collapse Index CLI analysis.
We use `difficulty == "dangerous"` directly - no proxies or simulations.

In [8]:
# The CSV already has ci_dangerous column from CLI analysis
# Let's see the distribution

print(f"📊 CI Dangerous Flags:")
print(f"   Dangerous samples: {train_noisy['ci_dangerous'].sum()} ({100*train_noisy['ci_dangerous'].mean():.1f}%)")
print(f"   Safe samples: {(train_noisy['ci_dangerous'] == 0).sum()} ({100*(train_noisy['ci_dangerous'] == 0).mean():.1f}%)")

# Check how many flipped labels are marked dangerous
dangerous_mask = train_noisy['ci_dangerous'] == 1
print(f"\n🎯 Overlap with actual flips:")
print(f"   Flipped samples marked dangerous: {train_noisy[train_noisy['was_flipped']]['ci_dangerous'].sum()}/{train_noisy['was_flipped'].sum()}")
if dangerous_mask.sum() > 0:
    precision = train_noisy[dangerous_mask]['was_flipped'].sum() / dangerous_mask.sum()
    print(f"   Precision: {precision:.1%}")

📊 CI Dangerous Flags:
   Dangerous samples: 9 (2.2%)
   Safe samples: 391 (97.8%)

🎯 Overlap with actual flips:
   Flipped samples marked dangerous: 1/40
   Precision: 11.1%


## 4️⃣ Experiment: Noisy Data (Validation)

**Hypothesis:** Removing CI-dangerous samples from noisy data SHOULD help (removes actual noise)

In [9]:
# Baseline: Train on all noisy data
train_baseline_noisy = train_noisy.copy()

# Curated: Remove CI-dangerous samples (using actual CLI flags)
train_curated_noisy = train_noisy[train_noisy['ci_dangerous'] == 0].copy()

# Oracle: Train on perfect labels (no noise)
train_oracle = train_df.copy()

print(f"✅ Experiment splits:")
print(f"   Baseline (noisy): {len(train_baseline_noisy)} samples")
print(f"   Curated (noisy): {len(train_curated_noisy)} samples (removed {len(train_baseline_noisy) - len(train_curated_noisy)})")
print(f"   Oracle (clean): {len(train_oracle)} samples")
print(f"\n📊 Removed samples analysis:")
removed_mask = train_noisy['ci_dangerous'] == 1
n_removed = removed_mask.sum()
if n_removed > 0:
    print(f"   Total removed: {n_removed}")
    print(f"   Were flipped: {train_noisy[removed_mask]['was_flipped'].sum()} ({100*train_noisy[removed_mask]['was_flipped'].mean():.1f}%)")
    print(f"   Precision: {100*train_noisy[removed_mask]['was_flipped'].mean():.1f}%")
else:
    print(f"   No dangerous samples to remove")

✅ Experiment splits:
   Baseline (noisy): 400 samples
   Curated (noisy): 391 samples (removed 9)
   Oracle (clean): 400 samples

📊 Removed samples analysis:
   Total removed: 9
   Were flipped: 1 (11.1%)
   Precision: 11.1%


## 5️⃣ Prepare Datasets

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=128)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()[:, 1]
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='binary')
    auc = roc_auc_score(labels, probs) if len(np.unique(labels)) > 1 else 0.0
    return {"accuracy": acc, "f1": f1, "auc": auc}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Prepare datasets
def prepare_dataset(df):
    ds = Dataset.from_pandas(df[['text', 'label']].reset_index(drop=True))
    ds = ds.map(tokenize_function, batched=True)
    return ds

ds_baseline_noisy = prepare_dataset(train_baseline_noisy)
ds_curated_noisy = prepare_dataset(train_curated_noisy)
ds_oracle = prepare_dataset(train_oracle)
ds_test = prepare_dataset(test_df)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("✅ Datasets prepared")

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/391 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

✅ Datasets prepared


## 6️⃣ Train Models

In [12]:
# Train Baseline (Noisy)
print("🚀 Training Baseline (Noisy)...")

model_baseline_noisy = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id={v:k for k,v in id2label.items()}
).to(device)

training_args_baseline = TrainingArguments(
    output_dir="./baseline_noisy",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_baseline_noisy = Trainer(
    model=model_baseline_noisy,
    args=training_args_baseline,
    train_dataset=ds_baseline_noisy,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_baseline_noisy.train()
results_baseline_noisy = trainer_baseline_noisy.evaluate()
print(f"✅ Baseline (Noisy): Acc={results_baseline_noisy['eval_accuracy']:.4f}, F1={results_baseline_noisy['eval_f1']:.4f}, AUC={results_baseline_noisy['eval_auc']:.4f}")

🚀 Training Baseline (Noisy)...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,0.6663


✅ Baseline (Noisy): Acc=0.6600, F1=0.7344, AUC=0.8237


In [13]:
# Train Curated (Noisy)
print("🚀 Training Curated (Noisy)...")

model_curated_noisy = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id={v:k for k,v in id2label.items()}
).to(device)

training_args_curated = TrainingArguments(
    output_dir="./curated_noisy",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_curated_noisy = Trainer(
    model=model_curated_noisy,
    args=training_args_curated,
    train_dataset=ds_curated_noisy,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_curated_noisy.train()
results_curated_noisy = trainer_curated_noisy.evaluate()
print(f"✅ Curated (Noisy): Acc={results_curated_noisy['eval_accuracy']:.4f}, F1={results_curated_noisy['eval_f1']:.4f}, AUC={results_curated_noisy['eval_auc']:.4f}")

🚀 Training Curated (Noisy)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,0.6612


✅ Curated (Noisy): Acc=0.8100, F1=0.8348, AUC=0.8918


In [14]:
# Train Oracle (Perfect Labels)
print("🚀 Training Oracle (Perfect Labels)...")

model_oracle = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id={v:k for k,v in id2label.items()}
).to(device)

training_args_oracle = TrainingArguments(
    output_dir="./oracle",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_oracle = Trainer(
    model=model_oracle,
    args=training_args_oracle,
    train_dataset=ds_oracle,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_oracle.train()
results_oracle = trainer_oracle.evaluate()
print(f"✅ Oracle: Acc={results_oracle['eval_accuracy']:.4f}, F1={results_oracle['eval_f1']:.4f}, AUC={results_oracle['eval_auc']:.4f}")

🚀 Training Oracle (Perfect Labels)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,0.6264


✅ Oracle: Acc=0.8000, F1=0.8182, AUC=0.8986


## 7️⃣ Results Summary

In [15]:
# Compile results
results_summary = pd.DataFrame({
    "Experiment": [
        "Baseline (Noisy)",
        "Curated (Noisy)",
        "Oracle (Perfect)"
    ],
    "Accuracy": [
        results_baseline_noisy['eval_accuracy'],
        results_curated_noisy['eval_accuracy'],
        results_oracle['eval_accuracy']
    ],
    "F1 Score": [
        results_baseline_noisy['eval_f1'],
        results_curated_noisy['eval_f1'],
        results_oracle['eval_f1']
    ],
    "AUC": [
        results_baseline_noisy['eval_auc'],
        results_curated_noisy['eval_auc'],
        results_oracle['eval_auc']
    ]
})

print("\n" + "="*60)
print("📊 EXPERIMENT RESULTS - SST-2")
print("="*60)
print(results_summary.to_string(index=False))


📊 EXPERIMENT RESULTS - SST-2
      Experiment  Accuracy  F1 Score      AUC
Baseline (Noisy)      0.66  0.734375 0.823718
 Curated (Noisy)      0.81  0.834783 0.891827
Oracle (Perfect)      0.80  0.818182 0.898638


In [16]:
# Calculate gap closure
baseline_acc = results_baseline_noisy['eval_accuracy']
curated_acc = results_curated_noisy['eval_accuracy']
oracle_acc = results_oracle['eval_accuracy']

baseline_f1 = results_baseline_noisy['eval_f1']
curated_f1 = results_curated_noisy['eval_f1']
oracle_f1 = results_oracle['eval_f1']

baseline_auc = results_baseline_noisy['eval_auc']
curated_auc = results_curated_noisy['eval_auc']
oracle_auc = results_oracle['eval_auc']

# Gap closure calculations
gap_acc = oracle_acc - baseline_acc
gap_f1 = oracle_f1 - baseline_f1
gap_auc = oracle_auc - baseline_auc

gap_closed_acc = ((curated_acc - baseline_acc) / gap_acc * 100) if gap_acc != 0 else 0
gap_closed_f1 = ((curated_f1 - baseline_f1) / gap_f1 * 100) if gap_f1 != 0 else 0
gap_closed_auc = ((curated_auc - baseline_auc) / gap_auc * 100) if gap_auc != 0 else 0

print("\n" + "="*60)
print("🎯 GAP ANALYSIS")
print("="*60)
print(f"Baseline (Noisy):  Acc={baseline_acc:.4f}, F1={baseline_f1:.4f}, AUC={baseline_auc:.4f}")
print(f"Curated (Noisy):   Acc={curated_acc:.4f}, F1={curated_f1:.4f}, AUC={curated_auc:.4f}")
print(f"Oracle (Perfect):  Acc={oracle_acc:.4f}, F1={oracle_f1:.4f}, AUC={oracle_auc:.4f}")
print(f"\n📈 Gap to Oracle closed by CI curation:")
print(f"   Accuracy: {gap_closed_acc:+.1f}%")
print(f"   F1:       {gap_closed_f1:+.1f}%")
print(f"   AUC:      {gap_closed_auc:+.1f}%")

wins = sum([curated_acc > baseline_acc, curated_f1 > baseline_f1, curated_auc > baseline_auc])
if wins == 3:
    print(f"\n🎉 VERDICT: CI CURATION WORKS! ({wins}/3 improved)")
elif wins >= 2:
    print(f"\n🟡 VERDICT: Partial improvement ({wins}/3 improved)")
else:
    print(f"\n❌ VERDICT: No clear improvement ({wins}/3 improved)")
print("="*60)


🎯 GAP ANALYSIS
Baseline (Noisy):  Acc=0.6600, F1=0.7344, AUC=0.8237
Curated (Noisy):   Acc=0.8100, F1=0.8348, AUC=0.8918
Oracle (Perfect):  Acc=0.8000, F1=0.8182, AUC=0.8986

📈 Gap to Oracle closed by CI curation:
   Accuracy: +107.1%
   F1:       +119.8%
   AUC:      +90.9%

🎉 VERDICT: CI CURATION WORKS! (3/3 improved)


In [17]:
# Save results
results_summary.to_csv("sst2_curation_results.csv", index=False)
print("✅ Results saved to sst2_curation_results.csv")

✅ Results saved to sst2_curation_results.csv


## 8️⃣ Interpretation

**What this experiment tests:**
- Removing samples flagged as `difficulty == "dangerous"` by Collapse Index CLI
- Uses ACTUAL CLI output - no proxies or simulations

**Key Insight:**
> CI-guided curation uses the `difficulty` column from Collapse Index CLI analysis. Labs receive the pre-flagged CSV and simply exclude dangerous samples during training - no CI system required on their end.