In [None]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from cleanlab.count import compute_confident_joint
import seaborn as sns
import matplotlib.pyplot as plt

# Directory that holds the pickled predictions read by this notebook.
PREDICTIONS_PATH = Path("/workspace/cv_folds_5fold/predictions")

# Class names indexed by class id; the last entry is the background class.
CLASS_NAMES = [
    'knife',
    'gun',
    'rifle',
    'baseball_bat',
    'background',
]

# Derived from CLASS_NAMES so the two can never drift apart.
NUM_CLASSES = len(CLASS_NAMES)

In [None]:
# Load the pickled mapping of image path -> prediction record.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this at a pickle produced by a trusted pipeline.
predictions_file = PREDICTIONS_PATH / "all_predictions.pkl"
with predictions_file.open("rb") as handle:
    all_predictions = pickle.load(handle)

num_images = len(all_predictions)
print(f"Total images: {num_images}")

In [None]:
def get_primary_class(gt_boxes, background_class=4):
    """Return the most frequent ground-truth class id among an image's boxes.

    Args:
        gt_boxes: Sequence of dicts, each carrying a 'class_id' key. May be
            empty or None.
        background_class: Id returned when there are no boxes. Defaults to 4.

    Returns:
        The class id that occurs most often in ``gt_boxes``. Ties are resolved
        in favour of the smallest class id. ``background_class`` is returned
        when ``gt_boxes`` is empty or None.
    """
    if not gt_boxes:
        return background_class

    # Single counting pass instead of calling list.count() once per distinct id.
    counts = {}
    for box in gt_boxes:
        class_id = box['class_id']
        counts[class_id] = counts.get(class_id, 0) + 1

    # max() keeps the first maximum it sees, so iterating ids in sorted order
    # makes tie-breaking explicit (smallest id wins) rather than dependent on
    # set iteration order.
    return max(sorted(counts), key=counts.__getitem__)

def get_pred_probs(pred_boxes, num_classes=5):
    """Collapse one image's detections into a normalized per-class score vector.

    The last index (``num_classes - 1``) is treated as the background class.
    Each foreground entry is the highest confidence of any box predicted for
    that class. The background entry is ``1 - max(foreground)`` clipped at 0,
    or 1.0 when there is no foreground evidence at all. The vector is then
    divided by its sum so it adds up to 1.

    Args:
        pred_boxes: Sequence of dicts with 'class_id' and 'confidence' keys.
            May be empty or None.
        num_classes: Length of the returned vector, background included.

    Returns:
        A 1-D numpy array of length ``num_classes`` that sums to 1.
    """
    # Derive the background slot from num_classes instead of hard-coding 4,
    # so the parameter actually works for values other than 5.
    background = num_classes - 1
    probs = np.zeros(num_classes)

    if not pred_boxes:
        probs[background] = 1.0
        return probs

    for box in pred_boxes:
        cls_id = box['class_id']
        probs[cls_id] = max(probs[cls_id], box['confidence'])

    foreground = probs[:background]
    # The background entry is always recomputed here, so a score written to
    # the background index by a box above is overwritten.
    if foreground.sum() == 0:
        probs[background] = 1.0
    else:
        probs[background] = max(0.0, 1.0 - foreground.max())

    return probs / probs.sum()

In [None]:
# Fix the iteration order once so labels, pred_probs and image_paths stay
# index-aligned with each other.
image_paths = list(all_predictions)

# One image-level label per image, taken from its ground-truth boxes.
labels = np.array([
    get_primary_class(all_predictions[path]['ground_truth'])
    for path in image_paths
])

# One score vector per image, taken from its predicted boxes.
pred_probs = np.array([
    get_pred_probs(all_predictions[path]['predictions'], NUM_CLASSES)
    for path in image_paths
])

print(f"Labels shape: {labels.shape}")
print(f"Pred probs shape: {pred_probs.shape}")
print(f"\nLabel distribution:")
for class_idx, class_name in enumerate(CLASS_NAMES):
    class_count = (labels == class_idx).sum()
    print(f"  {class_name}: {class_count}")

In [None]:
# Ask cleanlab for the calibrated confident joint and, alongside it, the
# indices of the examples that fall off its diagonal.
joint_kwargs = {
    'labels': labels,
    'pred_probs': pred_probs,
    'calibrate': True,
    'return_indices_of_off_diagonals': True,
}
confident_joint, off_diag_indices = compute_confident_joint(**joint_kwargs)

num_off_diagonal = len(off_diag_indices)
print("Confident Joint Matrix:")
print(confident_joint)
print(f"\nOff-diagonal examples (potential label errors): {num_off_diagonal}")

In [None]:
# Draw the confident joint as an annotated heatmap (rows: given label,
# columns: predicted class) and write it next to the predictions.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    confident_joint,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES,
    ax=ax,
)
ax.set_xlabel('Predicted Class')
ax.set_ylabel('Given Label')
ax.set_title('Confident Joint Matrix')
fig.tight_layout()

heatmap_file = PREDICTIONS_PATH / 'confident_joint.png'
fig.savefig(heatmap_file, dpi=150)
plt.show()

In [None]:
# Column schema is declared up front so that an empty candidate list still
# produces a frame with these columns; without it, sort_values() below raises
# KeyError on a column-less frame when there are no off-diagonal examples.
error_columns = [
    'image_path',
    'given_label',
    'predicted_label',
    'predicted_conf',
    'given_label_id',
    'predicted_label_id',
]

# Build one record per flagged example.
# NOTE(review): predicted_label here is the plain argmax of the score vector.
# cleanlab's off-diagonal assignment appears to be threshold-based, so for some
# rows the argmax may coincide with the given label - confirm against cleanlab.
error_candidates = []
for idx in off_diag_indices:
    example_probs = pred_probs[idx]
    given_label = labels[idx]
    pred_label = np.argmax(example_probs)
    error_candidates.append({
        'image_path': image_paths[idx],
        'given_label': CLASS_NAMES[given_label],
        'predicted_label': CLASS_NAMES[pred_label],
        'predicted_conf': example_probs[pred_label],
        'given_label_id': given_label,
        'predicted_label_id': pred_label,
    })

# Most confident disagreements first.
errors_df = (
    pd.DataFrame(error_candidates, columns=error_columns)
    .sort_values('predicted_conf', ascending=False)
)
print(f"Total potential label errors: {len(errors_df)}")
errors_df.head(20)

In [None]:
# Persist the candidate table and the confident joint next to the predictions.
errors_csv = PREDICTIONS_PATH / 'label_errors.csv'
joint_npy = PREDICTIONS_PATH / 'confident_joint.npy'

errors_df.to_csv(errors_csv, index=False)
np.save(joint_npy, confident_joint)

print(f"Saved: {errors_csv}")
print(f"Saved: {joint_npy}")

# Count candidates per (given label, predicted label) pair, largest group first.
print("\nError breakdown by type:")
pair_counts = errors_df.groupby(['given_label', 'predicted_label']).size()
print(pair_counts.sort_values(ascending=False))

In [None]:
import shutil

# Destination tree: one sub-folder per (given label, predicted label) pair.
INSPECTION_PATH = Path("/workspace/cv_folds_5fold/label_errors_inspection")

# Recreate the directory from scratch so files from an earlier run never mix
# with the current results. This deletes everything under INSPECTION_PATH.
if INSPECTION_PATH.exists():
    shutil.rmtree(INSPECTION_PATH)
INSPECTION_PATH.mkdir(parents=True)

for row in errors_df.itertuples(index=False):
    img_path = Path(row.image_path)
    folder = INSPECTION_PATH / f"labeled_{row.given_label}_pred_{row.predicted_label}"
    folder.mkdir(exist_ok=True)

    # NOTE(review): the label file is looked up two directories above the
    # image, under a hard-coded 'labels/val'. If that does not match the
    # dataset layout, label files are silently skipped - confirm the layout.
    label_path = img_path.parent.parent / 'labels' / 'val' / f"{img_path.stem}.txt"

    shutil.copy2(img_path, folder / img_path.name)
    if label_path.exists():
        shutil.copy2(label_path, folder / label_path.name)

print(f"Organized {len(errors_df)} images into: {INSPECTION_PATH}")
for folder in sorted(INSPECTION_PATH.iterdir()):
    # Only images and their .txt label files are copied above, so every
    # non-.txt entry is an image. Counting this way also covers extensions
    # such as .jpeg or .JPG that a '*.jpg' / '*.png' glob would miss.
    count = sum(1 for item in folder.iterdir() if item.suffix.lower() != '.txt')
    print(f"  {folder.name}: {count} images")