## 1Ô∏è‚É£ Setup

In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas numpy

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è Device: {device}")
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

üñ•Ô∏è Device: cuda
   GPU: NVIDIA A100-SXM4-40GB


In [3]:
MODEL_NAME = "distilbert-base-uncased"
SEED = 42
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NOISE_RATE = 0.10  # 10% label noise

# AG News label mapping
LABEL_MAP = {"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3}
LABEL_NAMES = ["World", "Sports", "Business", "Sci/Tech"]

np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## 2Ô∏è‚É£ Load Data + Inject Noise

In [4]:
from google.colab import files
print("üìÅ Upload agnews_ci_demo_curated.csv:")
uploaded = files.upload()

üìÅ Upload agnews_ci_demo_curated.csv:


Saving agnews_ci_demo_curated.csv to agnews_ci_demo_curated.csv


In [5]:
# Load full dataset with CI scores
df_full = pd.read_csv("agnews_ci_demo_curated.csv")

# Get unique samples (base variants only)
df = df_full[df_full['variant_id'] == 'base'].copy().reset_index(drop=True)

print(f"üìä Dataset: {len(df)} samples")
print(f"   Labels: {df['true_label'].value_counts().to_dict()}")
print(f"   CI dangerous: {df['ci_dangerous'].sum()} ({100*df['ci_dangerous'].mean():.1f}%)")

üìä Dataset: 500 samples
   Labels: {'Business': 134, 'Sci/Tech': 125, 'Sports': 121, 'World': 120}
   CI dangerous: 42 (8.4%)


In [6]:
# Map string labels to integers
df['label'] = df['true_label'].map(LABEL_MAP)

# Split train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])
print(f"‚úÖ Train: {len(train_df)}, Test: {len(test_df)}")

‚úÖ Train: 400, Test: 100


### üß™ Inject 10% Label Noise (Random Flips)

In [7]:
# Create noisy version of training data
train_noisy = train_df.copy()

# Select random 10% to flip labels
n_flip = int(len(train_noisy) * NOISE_RATE)
flip_idx = np.random.choice(train_noisy.index, size=n_flip, replace=False)

# For each flipped sample, pick a different random label
for idx in flip_idx:
    current_label = train_noisy.loc[idx, 'label']
    # Pick from labels that are NOT the current label
    other_labels = [l for l in range(4) if l != current_label]
    train_noisy.loc[idx, 'label'] = np.random.choice(other_labels)

train_noisy['noisy_label'] = train_noisy['label']
train_noisy['original_label'] = train_df['label']
train_noisy['was_flipped'] = train_noisy['label'] != train_df['label']

print(f"üí• Noise injected:")
print(f"   Flipped: {train_noisy['was_flipped'].sum()} samples ({100*train_noisy['was_flipped'].mean():.1f}%)")
print(f"   CI dangerous in flipped: {train_noisy[train_noisy['was_flipped']]['ci_dangerous'].sum()}")
print(f"   CI dangerous total: {train_noisy['ci_dangerous'].sum()}")

üí• Noise injected:
   Flipped: 40 samples (10.0%)
   CI dangerous in flipped: 3
   CI dangerous total: 32


## 3Ô∏è‚É£ Use CI Dangerous Flags from CLI Analysis

The CSV already contains `ci_dangerous` column from Collapse Index CLI analysis.
We'll use this directly - no need to simulate or probe.

In [8]:
# The CSV already has ci_dangerous column from CLI analysis
# Let's see the distribution

print(f"üìä CI Dangerous Flags:")
print(f"   Dangerous samples: {train_noisy['ci_dangerous'].sum()} ({100*train_noisy['ci_dangerous'].mean():.1f}%)")
print(f"   Safe samples: {(train_noisy['ci_dangerous'] == 0).sum()} ({100*(train_noisy['ci_dangerous'] == 0).mean():.1f}%)")

# Check how many flipped labels are marked dangerous
dangerous_mask = train_noisy['ci_dangerous'] == 1
print(f"\nüéØ Overlap with actual flips:")
print(f"   Flipped samples marked dangerous: {train_noisy[train_noisy['was_flipped']]['ci_dangerous'].sum()}/{train_noisy['was_flipped'].sum()}")
if dangerous_mask.sum() > 0:
    precision = train_noisy[dangerous_mask]['was_flipped'].sum() / dangerous_mask.sum()
    print(f"   Precision: {precision:.1%}")

üìä CI Dangerous Flags:
   Dangerous samples: 32 (8.0%)
   Safe samples: 368 (92.0%)

üéØ Overlap with actual flips:
   Flipped samples marked dangerous: 3/40
   Precision: 9.4%


## 4Ô∏è‚É£ AG News Experiment 1: Clean Data (Control)

**Hypothesis:** Removing CI-dangerous samples from clean data should NOT help (nothing to fix)

In [9]:
# Use clean training data (before noise injection)
train_clean = train_df.copy()

print(f"üìã AG News Exp 1 Setup (Clean):")
print(f"   Total train: {len(train_clean)}")
print(f"   CI dangerous: {train_clean['ci_dangerous'].sum()} ({100*train_clean['ci_dangerous'].mean():.1f}%)")

üìã AG News Exp 1 Setup (Clean):
   Total train: 400
   CI dangerous: 32 (8.0%)


In [10]:
# Baseline: Train on all clean data
train_baseline_clean = train_clean.copy()

# Curated: Remove CI-dangerous samples
train_curated_clean = train_clean[train_clean['ci_dangerous'] == 0].copy()

print(f"‚úÖ AG News Exp 1 splits:")
print(f"   Baseline: {len(train_baseline_clean)} samples")
print(f"   Curated: {len(train_curated_clean)} samples (removed {len(train_baseline_clean) - len(train_curated_clean)})")

‚úÖ AG News Exp 1 splits:
   Baseline: 400 samples
   Curated: 368 samples (removed 32)


## 5Ô∏è‚É£ Train Models - AG News Exp 1 (Clean)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=512)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    return {"accuracy": acc, "f1": f1}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
# Prepare datasets
def prepare_dataset(df):
    ds = Dataset.from_pandas(df[['text', 'label']].reset_index(drop=True))
    ds = ds.map(tokenize_function, batched=True)
    return ds

ds_baseline_clean = prepare_dataset(train_baseline_clean)
ds_curated_clean = prepare_dataset(train_curated_clean)
ds_test = prepare_dataset(test_df)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
# Train Baseline (Clean)
print("üöÄ Training Baseline (Clean)...")

model_baseline_clean = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4
).to(device)

training_args = TrainingArguments(
    output_dir="./agnews_exp1_baseline",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_baseline_clean = Trainer(
    model=model_baseline_clean,
    args=training_args,
    train_dataset=ds_baseline_clean,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_baseline_clean.train()
results_baseline_clean = trainer_baseline_clean.evaluate()
print(f"‚úÖ Baseline (Clean): Acc={results_baseline_clean['eval_accuracy']:.4f}, F1={results_baseline_clean['eval_f1']:.4f}")

üöÄ Training Baseline (Clean)...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,1.0755


‚úÖ Baseline (Clean): Acc=0.7800, F1=0.7823


In [14]:
# Train Curated (Clean)
print("üöÄ Training Curated (Clean)...")

model_curated_clean = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4
).to(device)

training_args_curated = TrainingArguments(
    output_dir="./agnews_exp1_curated",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_curated_clean = Trainer(
    model=model_curated_clean,
    args=training_args_curated,
    train_dataset=ds_curated_clean,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_curated_clean.train()
results_curated_clean = trainer_curated_clean.evaluate()
print(f"‚úÖ Curated (Clean): Acc={results_curated_clean['eval_accuracy']:.4f}, F1={results_curated_clean['eval_f1']:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ Training Curated (Clean)...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,1.0465


‚úÖ Curated (Clean): Acc=0.7900, F1=0.7893


## 6Ô∏è‚É£ AG News Experiment 2: Noisy Data (Validation)

**Hypothesis:** Removing CI-dangerous samples from noisy data SHOULD help (removes actual noise)

In [15]:
# Baseline: Train on all noisy data
train_baseline_noisy = train_noisy.copy()

# Curated: Remove CI-dangerous samples (using actual CLI flags)
train_curated_noisy = train_noisy[train_noisy['ci_dangerous'] == 0].copy()

# Oracle: Train on perfect labels (no noise)
train_oracle = train_df.copy()

print(f"‚úÖ AG News Exp 2 splits:")
print(f"   Baseline (noisy): {len(train_baseline_noisy)} samples")
print(f"   Curated (noisy): {len(train_curated_noisy)} samples (removed {len(train_baseline_noisy) - len(train_curated_noisy)})")
print(f"   Oracle (clean): {len(train_oracle)} samples")
print(f"\nüìä Removed samples analysis:")
removed_mask = train_noisy['ci_dangerous'] == 1
print(f"   Total removed: {removed_mask.sum()}")
print(f"   Were flipped: {train_noisy[removed_mask]['was_flipped'].sum()} ({100*train_noisy[removed_mask]['was_flipped'].mean():.1f}%)")
print(f"   Precision: {100*train_noisy[removed_mask]['was_flipped'].mean():.1f}%")

‚úÖ AG News Exp 2 splits:
   Baseline (noisy): 400 samples
   Curated (noisy): 368 samples (removed 32)
   Oracle (clean): 400 samples

üìä Removed samples analysis:
   Total removed: 32
   Were flipped: 3 (9.4%)
   Precision: 9.4%


In [16]:
# Prepare datasets
ds_baseline_noisy = prepare_dataset(train_baseline_noisy)
ds_curated_noisy = prepare_dataset(train_curated_noisy)
ds_oracle = prepare_dataset(train_oracle)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

## 7Ô∏è‚É£ Train Models - AG News Exp 2 (Noisy)

In [17]:
# Train Baseline (Noisy)
print("üöÄ Training Baseline (Noisy)...")

model_baseline_noisy = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4
).to(device)

training_args_baseline = TrainingArguments(
    output_dir="./agnews_exp2_baseline",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_baseline_noisy = Trainer(
    model=model_baseline_noisy,
    args=training_args_baseline,
    train_dataset=ds_baseline_noisy,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_baseline_noisy.train()
results_baseline_noisy = trainer_baseline_noisy.evaluate()
print(f"‚úÖ Baseline (Noisy): Acc={results_baseline_noisy['eval_accuracy']:.4f}, F1={results_baseline_noisy['eval_f1']:.4f}")

üöÄ Training Baseline (Noisy)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,1.1773


‚úÖ Baseline (Noisy): Acc=0.7900, F1=0.7874


In [18]:
# Train Curated (Noisy)
print("üöÄ Training Curated (Noisy)...")

model_curated_noisy = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4
).to(device)

training_args_curated = TrainingArguments(
    output_dir="./agnews_exp2_curated",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_curated_noisy = Trainer(
    model=model_curated_noisy,
    args=training_args_curated,
    train_dataset=ds_curated_noisy,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_curated_noisy.train()
results_curated_noisy = trainer_curated_noisy.evaluate()
print(f"‚úÖ Curated (Noisy): Acc={results_curated_noisy['eval_accuracy']:.4f}, F1={results_curated_noisy['eval_f1']:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


üöÄ Training Curated (Noisy)...


Step,Training Loss
50,1.1511


‚úÖ Curated (Noisy): Acc=0.8100, F1=0.8121


In [19]:
# Train Oracle (Perfect Labels)
print("üöÄ Training Oracle (Perfect Labels)...")

model_oracle = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4
).to(device)

training_args_oracle = TrainingArguments(
    output_dir="./agnews_exp2_oracle",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="no",
    seed=SEED,
)

trainer_oracle = Trainer(
    model=model_oracle,
    args=training_args_oracle,
    train_dataset=ds_oracle,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_oracle.train()
results_oracle = trainer_oracle.evaluate()
print(f"‚úÖ Oracle: Acc={results_oracle['eval_accuracy']:.4f}, F1={results_oracle['eval_f1']:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


üöÄ Training Oracle (Perfect Labels)...


Step,Training Loss
50,1.0876


‚úÖ Oracle: Acc=0.7900, F1=0.7886


## 8Ô∏è‚É£ Results Summary

In [20]:
# Compile results
results_summary = pd.DataFrame({
    "Experiment": [
        "AG News Exp 1: Baseline (Clean)",
        "AG News Exp 1: Curated (Clean)",
        "AG News Exp 2: Baseline (Noisy)",
        "AG News Exp 2: Curated (Noisy)",
        "AG News Exp 2: Oracle (Perfect)"
    ],
    "Accuracy": [
        results_baseline_clean['eval_accuracy'],
        results_curated_clean['eval_accuracy'],
        results_baseline_noisy['eval_accuracy'],
        results_curated_noisy['eval_accuracy'],
        results_oracle['eval_accuracy']
    ],
    "F1 Score": [
        results_baseline_clean['eval_f1'],
        results_curated_clean['eval_f1'],
        results_baseline_noisy['eval_f1'],
        results_curated_noisy['eval_f1'],
        results_oracle['eval_f1']
    ]
})

print("\n" + "="*60)
print("üìä EXPERIMENT RESULTS - AG NEWS")
print("="*60)
print(results_summary.to_string(index=False))

# Calculate gap closure for AG News Exp 2
baseline_acc = results_baseline_noisy['eval_accuracy']
curated_acc = results_curated_noisy['eval_accuracy']
oracle_acc = results_oracle['eval_accuracy']

delta_acc = curated_acc - baseline_acc

print("\n" + "="*60)
print("üéØ AG NEWS EXPERIMENT 2 ANALYSIS (Noisy Data)")
print("="*60)
print(f"Baseline (Noisy):  {baseline_acc:.4f}")
print(f"Curated (Noisy):   {curated_acc:.4f}  ({delta_acc:+.4f})")
print(f"Oracle (Perfect):  {oracle_acc:.4f}")

# Handle different scenarios
if curated_acc > oracle_acc:
    beat_oracle_by = (curated_acc - oracle_acc) * 100
    print(f"\nüèÜ CURATED BEAT ORACLE by +{beat_oracle_by:.1f}%!")
    print(f"   CI found genuinely problematic samples.")
elif oracle_acc > baseline_acc:
    gap_baseline_oracle = oracle_acc - baseline_acc
    gap_curated_oracle = oracle_acc - curated_acc
    gap_closed = (gap_baseline_oracle - gap_curated_oracle) / gap_baseline_oracle * 100
    print(f"\nGap to Oracle closed: {gap_closed:+.1f}%")
else:
    print(f"\nNote: Baseline already matched Oracle (no gap to close)")
print("="*60)


üìä EXPERIMENT RESULTS - AG NEWS
                     Experiment  Accuracy  F1 Score
AG News Exp 1: Baseline (Clean)      0.78  0.782314
 AG News Exp 1: Curated (Clean)      0.79  0.789290
AG News Exp 2: Baseline (Noisy)      0.79  0.787391
 AG News Exp 2: Curated (Noisy)      0.81  0.812130
AG News Exp 2: Oracle (Perfect)      0.79  0.788605

üéØ AG NEWS EXPERIMENT 2 ANALYSIS (Noisy Data)
Baseline (Noisy):  0.7900
Curated (Noisy):   0.8100  (+0.0200)
Oracle (Perfect):  0.7900

üèÜ CURATED BEAT ORACLE by +2.0%!
   CI found genuinely problematic samples.


In [21]:
# Save results
results_summary.to_csv("agnews_curation_results.csv", index=False)
print("‚úÖ Results saved to agnews_curation_results.csv")

‚úÖ Results saved to agnews_curation_results.csv


## 9Ô∏è‚É£ Interpretation

**AG News Experiment 1 (Clean Data):**
- Removing CI-dangerous samples from clean data should show **no improvement or slight decrease**
- This validates that CI doesn't hallucinate problems

**AG News Experiment 2 (Noisy Data):**
- Removing CI-dangerous samples from noisy data should show **accuracy improvement**
- Gap closure percentage shows how much error was recovered
- This validates that CI finds real label noise

**Key Insight:**
> CI-guided curation uses the `ci_dangerous` column from Collapse Index CLI analysis. Labs receive the pre-flagged CSV and simply exclude dangerous samples during training - no CI system required on their end.