# Confusion Matrix untuk Deteksi Dipstick
Notebook ini mengevaluasi model pada tingkat gambar: apakah sebuah gambar diprediksi mengandung dipstick (valid) atau hanya background.

## 1. Setup & Import Libraries

In [1]:
import os
import cv2
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from ultralytics import YOLO
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme plus a larger default figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(10, 8))

ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.

ImportError: numpy._core.multiarray failed to import

## 2. Load Model & Data Configuration

In [None]:
# Path to the best trained weights (adjust to where your model lives)
MODEL_PATH = r'c:\Users\ACER\Documents\5 Magang\Telkom\Project-dipstick\eksperimen-v11-3class\best.pt'

# Paths to the dataset config file and the dataset root folder
DATA_YAML = r'c:\Users\ACER\Documents\5 Magang\Telkom\Project-dipstick\data\data.yaml'
DATASET_PATH = Path(r'c:\Users\ACER\Documents\5 Magang\Telkom\Project-dipstick\data')

# Load model
print("Loading model...")
model = YOLO(MODEL_PATH)
print(f"✓ Model loaded from: {MODEL_PATH}")

# Load the dataset config. The encoding is pinned because open() otherwise
# uses the platform default, which on Windows is a legacy code page that can
# fail on (or mis-decode) non-ASCII characters in the YAML.
with open(DATA_YAML, 'r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

class_names = data_config['names']
# 'nc' may be absent from a data.yaml that only lists 'names'; fall back to
# counting the names instead of raising KeyError.
num_classes = data_config.get('nc', len(class_names))

print("\nDataset Info:")
print(f"  Classes: {class_names}")
print(f"  Number of classes: {num_classes}")

## 3. Prepare Ground Truth Labels
Membaca file label (.txt) untuk menentukan ground truth setiap gambar:
- **Background**: file txt kosong atau tidak ada (tidak ada objek yang dianotasi)
- **Dipstick (Valid)**: file txt ada isinya (ada objek yang dianotasi)

In [None]:
def get_ground_truth_labels(images_dir, labels_dir):
    """
    Build image-level ground truth from per-image label text files.

    Every regular file directly inside ``images_dir`` whose suffix is
    .jpg/.jpeg/.png (case-insensitive) is looked up as ``<stem>.txt`` in
    ``labels_dir``:

    - label file missing, empty, or containing only whitespace -> 'background'
    - label file with at least one non-blank character         -> 'dipstick'

    Args:
        images_dir: directory holding the images (``str`` or ``Path``).
        labels_dir: directory holding the label files (``str`` or ``Path``).

    Returns:
        dict {image_name: 'dipstick' or 'background'}, inserted in sorted
        file-name order so repeated runs produce the same ordering.
    """
    images_dir = Path(images_dir)
    labels_dir = Path(labels_dir)
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    ground_truth = {}
    for img_file in sorted(images_dir.glob('*')):
        if not img_file.is_file() or img_file.suffix.lower() not in image_suffixes:
            continue

        # Find the label file that belongs to this image
        label_file = labels_dir / f"{img_file.stem}.txt"

        has_annotation = False
        if label_file.is_file():
            # Check the content rather than the byte size: a file holding only
            # a newline or spaces has a non-zero size but no annotation.
            content = label_file.read_text(encoding='utf-8', errors='ignore')
            has_annotation = bool(content.strip())

        # A missing label file is treated as background as well
        ground_truth[img_file.name] = 'dipstick' if has_annotation else 'background'

    return ground_truth

# Locate the test split and derive its image-level ground truth
test_images_dir = DATASET_PATH / 'test' / 'images'
test_labels_dir = DATASET_PATH / 'test' / 'labels'

print("Reading ground truth labels...")
ground_truth = get_ground_truth_labels(test_images_dir, test_labels_dir)

# Tabulate the labels and report the class balance
gt_df = pd.DataFrame({
    'image': list(ground_truth),
    'true_label': list(ground_truth.values()),
})
print("\nGround Truth Statistics:")
print(gt_df['true_label'].value_counts())
print(f"\nTotal images: {len(ground_truth)}")

## 4. Run Inference & Get Predictions
Jalankan model pada semua gambar test untuk mendapatkan prediksi

In [None]:
def get_predictions(model, images_dir, conf_threshold=0.25):
    """
    Classify every image in ``images_dir`` as 'dipstick' or 'background'.

    The model is called once per .jpg/.jpeg/.png file with
    ``conf=conf_threshold``. An image is labelled 'dipstick' when the first
    returned result holds at least one box, otherwise 'background'.

    Returns: dict {image_name: 'dipstick' or 'background'}
    """
    allowed_suffixes = ['.jpg', '.jpeg', '.png']
    image_files = [
        path for path in images_dir.glob('*')
        if path.suffix.lower() in allowed_suffixes
    ]

    print(f"Running inference on {len(image_files)} images...")

    predictions = {}
    for image_path in tqdm(image_files):
        # Only the presence of boxes matters here, not their class or position
        detections = model(str(image_path), conf=conf_threshold, verbose=False)[0].boxes
        predictions[image_path.name] = 'dipstick' if len(detections) > 0 else 'background'

    return predictions

# Confidence cut-off handed to the detector (tune as needed)
CONF_THRESHOLD = 0.25

print(f"Confidence threshold: {CONF_THRESHOLD}")
predictions = get_predictions(model, test_images_dir, conf_threshold=CONF_THRESHOLD)

# Tabulate the predictions and report how many images fall in each class
pred_df = pd.DataFrame({
    'image': list(predictions),
    'predicted_label': list(predictions.values()),
})
print("\nPrediction Statistics:")
print(pred_df['predicted_label'].value_counts())

## 5. Create Confusion Matrix

In [None]:
# Combine ground truth and predictions, one row per ground-truth image
image_names = list(ground_truth)

# Images without a prediction are still scored as 'background', but say so
# explicitly: a silent default would hide a mismatch between the two dicts.
missing_predictions = [img for img in image_names if img not in predictions]
if missing_predictions:
    print(f"WARNING: {len(missing_predictions)} image(s) have no prediction "
          f"and are counted as 'background'")

results_df = pd.DataFrame({
    'image': image_names,
    'true_label': [ground_truth[img] for img in image_names],
    'predicted_label': [predictions.get(img, 'background') for img in image_names]
})

# Create confusion matrix inputs
y_true = results_df['true_label']
y_pred = results_df['predicted_label']

# Fixed label order: rows/columns are always background first, dipstick second
labels = ['background', 'dipstick']

# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, cmap='Blues', values_format='d')
# The seaborn 'whitegrid' style enables axes grid lines, which would be drawn
# across the matrix cells; switch them off for this axes only.
ax.grid(False)
ax.set_title('Confusion Matrix - Dipstick Detection\n(Image-level Classification)', fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Confusion matrix saved as 'confusion_matrix.png'")

## 6. Calculate Metrics

In [None]:
# Per-class report from scikit-learn
print("="*70)
print("CLASSIFICATION REPORT")
print("="*70)
print(classification_report(y_true, y_pred, labels=labels, target_names=labels))

# Manual calculation. With labels=['background', 'dipstick'] the flattened
# 2x2 matrix is ordered TN, FP, FN, TP ('dipstick' is the positive class).
# Plain ints keep the arithmetic below free of NumPy's silent NaN on 0/0.
tn, fp, fn, tp = (int(v) for v in cm.ravel())
total = tp + tn + fp + fn

# Every ratio is guarded the same way: an empty denominator yields 0
accuracy = (tp + tn) / total if total > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

print("="*70)
print("DETAILED METRICS")
print("="*70)
print(f"True Positives (TP):  {tp:4d}  - Correctly detected dipstick")
print(f"True Negatives (TN):  {tn:4d}  - Correctly detected background")
print(f"False Positives (FP): {fp:4d}  - Background predicted as dipstick")
print(f"False Negatives (FN): {fn:4d}  - Dipstick predicted as background")
print("="*70)
print(f"Accuracy:    {accuracy:.4f}  ({accuracy*100:.2f}%)")
print(f"Precision:   {precision:.4f}  ({precision*100:.2f}%)")
print(f"Recall:      {recall:.4f}  ({recall*100:.2f}%)")
print(f"F1-Score:    {f1_score:.4f}  ({f1_score*100:.2f}%)")
print(f"Specificity: {specificity:.4f}  ({specificity*100:.2f}%)")
print("="*70)

## 7. Analyze Errors
Lihat gambar mana saja yang salah diprediksi

In [None]:
# Split the results into the two kinds of misclassification
truth_col = results_df['true_label']
pred_col = results_df['predicted_label']
false_positives = results_df[(truth_col == 'background') & (pred_col == 'dipstick')]
false_negatives = results_df[(truth_col == 'dipstick') & (pred_col == 'background')]

print("="*70)
print("ERROR ANALYSIS")
print("="*70)

# List at most ten example file names per error type
print(f"\nFalse Positives ({len(false_positives)}): Background diprediksi sebagai Dipstick")
if not false_positives.empty:
    print("Sample images:")
    for image_name in false_positives['image'].head(10):
        print(f"  - {image_name}")

print(f"\nFalse Negatives ({len(false_negatives)}): Dipstick diprediksi sebagai Background")
if not false_negatives.empty:
    print("Sample images:")
    for image_name in false_negatives['image'].head(10):
        print(f"  - {image_name}")

# Persist the full table and both error subsets as CSV
for csv_name, frame in (
    ('prediction_results.csv', results_df),
    ('false_positives.csv', false_positives),
    ('false_negatives.csv', false_negatives),
):
    frame.to_csv(csv_name, index=False)

print("\n✓ Results saved to CSV files")

## 8. Visualize Sample Predictions
Visualisasi beberapa contoh prediksi (benar dan salah)

In [None]:
def visualize_samples(df, category, n_samples=4, images_dir=None):
    """
    Show and save a row of sample images from one result category.

    Args:
        df: DataFrame with 'image', 'true_label' and 'predicted_label' columns.
        category: human-readable category name; used in the figure title and,
            lower-cased with spaces replaced by underscores, in the output
            file name ``<category>_samples.png``.
        n_samples: maximum number of images to show (first rows of ``df``).
        images_dir: directory containing the images (``str`` or ``Path``).
            Defaults to the notebook-level ``test_images_dir``.

    Returns:
        None. Prints a message and returns early when there is nothing to show.
    """
    if images_dir is None:
        images_dir = test_images_dir
    images_dir = Path(images_dir)

    samples = df.head(n_samples)
    n = len(samples)
    # Covers both an empty DataFrame and n_samples == 0; plt.subplots(1, 0)
    # would otherwise raise.
    if n == 0:
        print(f"No samples found for {category}")
        return

    # squeeze=False always returns a 2-D axes array, so n == 1 needs no special case
    fig, axes = plt.subplots(1, n, figsize=(5*n, 5), squeeze=False)

    for ax, (_, row) in zip(axes[0], samples.iterrows()):
        img_path = images_dir / row['image']
        img = cv2.imread(str(img_path))

        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; show a placeholder instead of crashing
            # inside cvtColor.
            ax.text(0.5, 0.5, f"Cannot read\n{row['image']}",
                    ha='center', va='center', transform=ax.transAxes)
        else:
            # OpenCV loads BGR; matplotlib expects RGB
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        ax.set_title(f"True: {row['true_label']}\nPred: {row['predicted_label']}",
                     fontsize=10)
        ax.axis('off')

    fig.suptitle(f'{category} Samples', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(f'{category.lower().replace(" ", "_")}_samples.png', dpi=150, bbox_inches='tight')
    plt.show()

# Misclassified images first
print("Visualizing False Positives...")
visualize_samples(false_positives, "False Positives", n_samples=4)

print("\nVisualizing False Negatives...")
visualize_samples(false_negatives, "False Negatives", n_samples=4)

# Correctly classified rows are those where prediction and truth agree
is_correct = results_df['true_label'] == results_df['predicted_label']

# True Positives: correct rows whose true label is 'dipstick'
true_positives = results_df.loc[is_correct & (results_df['true_label'] == 'dipstick')]
print("\nVisualizing True Positives...")
visualize_samples(true_positives, "True Positives (Correct)", n_samples=4)

# True Negatives: correct rows whose true label is 'background'
true_negatives = results_df.loc[is_correct & (results_df['true_label'] == 'background')]
print("\nVisualizing True Negatives...")
visualize_samples(true_negatives, "True Negatives (Correct)", n_samples=4)

## 9. Test Different Confidence Thresholds (Optional)
Coba berbagai confidence threshold untuk melihat dampaknya

In [None]:
# Candidate confidence thresholds to compare
thresholds = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5]
threshold_results = []

print("Testing different confidence thresholds...\n")

# Run inference ONCE at the lowest threshold and remember each image's best
# box confidence, instead of repeating the full inference per threshold.
# NOTE(review): this assumes the detector's `conf` argument is a plain
# per-box cut-off (a box is kept when its score exceeds `conf`), so an image
# has a detection at threshold t exactly when its best score exceeds t.
# Confirm against the Ultralytics version in use.
base_conf = min(thresholds)
print(f"Running inference once at conf={base_conf} on {len(ground_truth)} images...")

max_confidence = {}
for img_name in tqdm(list(ground_truth)):
    boxes = model(str(test_images_dir / img_name), conf=base_conf, verbose=False)[0].boxes
    # 0.0 marks "no detection at all"; it can never exceed a positive threshold
    max_confidence[img_name] = float(boxes.conf.max()) if len(boxes) > 0 else 0.0

for thresh in thresholds:
    print(f"Testing threshold: {thresh}")

    y_pred_temp = [
        'dipstick' if max_confidence[img] > thresh else 'background'
        for img in ground_truth
    ]
    cm_temp = confusion_matrix(y_true, y_pred_temp, labels=labels)
    # Suffixed names so the tn/fp/fn/tp computed for the main confusion
    # matrix are not overwritten by the last threshold of this sweep.
    tn_t, fp_t, fn_t, tp_t = (int(v) for v in cm_temp.ravel())
    total_t = tp_t + tn_t + fp_t + fn_t

    acc = (tp_t + tn_t) / total_t if total_t > 0 else 0
    prec = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    rec = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

    threshold_results.append({
        'threshold': thresh,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'TP': tp_t,
        'TN': tn_t,
        'FP': fp_t,
        'FN': fn_t
    })

# Create comparison dataframe
threshold_df = pd.DataFrame(threshold_results)
print("\n" + "="*70)
print("THRESHOLD COMPARISON")
print("="*70)
print(threshold_df.to_string(index=False))

# Plot comparison: one panel per metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['accuracy', 'precision', 'recall', 'f1_score']
titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

for ax, metric, title in zip(axes.flat, metrics, titles):
    ax.plot(threshold_df['threshold'], threshold_df[metric],
            marker='o', linewidth=2, markersize=8)
    ax.set_xlabel('Confidence Threshold', fontsize=11)
    ax.set_ylabel(title, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    # Shared 0..1 scale (with headroom) keeps the panels comparable
    ax.set_ylim([0, 1.05])

fig.suptitle('Metrics vs Confidence Threshold', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('threshold_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Save to CSV
threshold_df.to_csv('threshold_comparison.csv', index=False)
print("\n✓ Threshold comparison saved")

## Summary

Notebook ini menghasilkan:
1. **Confusion Matrix** - Visualisasi performa klasifikasi
2. **Metrics** - Accuracy, Precision, Recall, F1-Score, Specificity
3. **Error Analysis** - Daftar gambar yang salah diprediksi (FP & FN)
4. **Sample Visualizations** - Contoh gambar dari setiap kategori
5. **Threshold Analysis** - Perbandingan performa di berbagai confidence threshold

File output yang dihasilkan:
- `confusion_matrix.png`
- `prediction_results.csv`
- `false_positives.csv`
- `false_negatives.csv`
- `threshold_comparison.png`
- `threshold_comparison.csv`
- `*_samples.png` (visualisasi sample)