# Testing Classification Metrics

This notebook exercises our metrics implementation on two confusion matrices:
1. A 2x2 binary matrix (Sick vs Healthy)
2. A 3x3 multi-class matrix (Severe, Moderate, Mild)

In [1]:
# Import our metrics module
from metrics import (
    evaluate_binary,
    calculate_per_class_metrics,
    calculate_macro_average,
    calculate_weighted_average
)

## 1. Binary Classification: Sick vs Healthy


A 2x2 confusion matrix laid out as `[[TN, FP], [FN, TP]]`.

In [2]:
# Sick-vs-Healthy confusion matrix in [[TN, FP], [FN, TP]] layout.
binary_cm = [[760, 190], [10, 40]]

# evaluate_binary hands back a name -> score mapping; show every score
# rounded to three decimals.
binary_metrics = evaluate_binary(binary_cm)
for name in binary_metrics:
    score = binary_metrics[name]
    print(f"{name}: {score:.3f}")

accuracy: 0.800
precision: 0.174
recall: 0.800
f1_score: 0.286


## 2. Multi-class Classification: Disease Severity

A 3x3 confusion matrix over the severity levels Severe, Moderate, and Mild.

In [3]:
# Disease-severity confusion matrix; class order is Severe, Moderate, Mild.
# NOTE(review): the recorded scores (Severe recall 0.714 = 25 / 35, precision
# 0.758 = 25 / 33) imply rows are ACTUAL classes and columns are predicted
# classes — confirm against the metrics module.
multiclass_cm = [
    [25, 8, 2],    # actual Severe
    [5, 40, 10],   # actual Moderate
    [3, 7, 100],   # actual Mild
]

# One block of scores per severity level, addressed by class index
print("Metrics for each severity level:")
for class_idx, label in enumerate(('Severe', 'Moderate', 'Mild')):
    class_scores = calculate_per_class_metrics(multiclass_cm, class_idx)
    print(f"\n{label}:")
    for name in class_scores:
        print(f"{name}: {class_scores[name]:.3f}")

# Macro-averaged scores across the three severity levels
print("\nMacro-averaged metrics:")
macro_metrics = calculate_macro_average(multiclass_cm)
for name in macro_metrics:
    print(f"{name}: {macro_metrics[name]:.3f}")

Metrics for each severity level:

Severe:
precision: 0.758
recall: 0.714
f1_score: 0.735

Moderate:
precision: 0.727
recall: 0.727
f1_score: 0.727

Mild:
precision: 0.893
recall: 0.909
f1_score: 0.901

Macro-averaged metrics:
macro_precision: 0.793
macro_recall: 0.784
macro_f1: 0.788


# Testing Classification Metrics


1. Binary Classification Metrics
   - Accuracy, Precision, Recall, F1-score
   
2. Multi-class Classification Metrics
   - Per-class metrics
   - Macro averages
   - Weighted averages

In [4]:
# Import our metrics module
from metrics import (
    accuracy, precision, recall, f1,
    evaluate_binary,
    calculate_per_class_metrics,
    calculate_macro_average,
    calculate_weighted_average
)

## 1. Binary Classification Test


In [5]:
# Binary confusion matrix in [[TN, FP], [FN, TP]] layout
binary_cm = [[50, 10], [5, 35]]

# Unpack the four cell counts from the matrix itself so the standalone metric
# functions receive the same numbers that evaluate_binary is given below.
(tn, fp), (fn, tp) = binary_cm

print("Individual metrics:")
print(f"Accuracy: {accuracy(tp, tn, fp, fn):.3f}")
print(f"Precision: {precision(tp, fp):.3f}")
print(f"Recall: {recall(tp, fn):.3f}")
# f1 takes the precision and recall values, not the raw counts
print(f"F1 Score: {f1(precision(tp, fp), recall(tp, fn)):.3f}")

print("\nUsing evaluate_binary function:")
metrics = evaluate_binary(binary_cm)
for name in metrics:
    print(f"{name}: {metrics[name]:.3f}")

Individual metrics:
Accuracy: 0.850
Precision: 0.778
Recall: 0.875
F1 Score: 0.824

Using evaluate_binary function:
accuracy: 0.850
precision: 0.778
recall: 0.875
f1_score: 0.824


## 2. Multi-class Classification Test


In [6]:
# Generic 3-class confusion matrix.
# NOTE(review): the recorded scores (class 0 recall 0.769 = 10 / 13, precision
# 0.714 = 10 / 14) imply rows are ACTUAL classes and columns are predicted
# classes — confirm against the metrics module.
multiclass_cm = [
    [10, 2, 1],  # actual class 0
    [3, 15, 2],  # actual class 1
    [1, 2, 12],  # actual class 2
]

# Per-class scores, one block per class index
print("Per-class metrics:")
for class_idx in range(len(multiclass_cm)):
    metrics = calculate_per_class_metrics(multiclass_cm, class_idx)
    print(f"\nClass {class_idx}:")
    for name in metrics:
        print(f"{name}: {metrics[name]:.3f}")

# Macro-averaged scores
print("\nMacro-averaged metrics:")
macro_metrics = calculate_macro_average(multiclass_cm)
for name in macro_metrics:
    print(f"{name}: {macro_metrics[name]:.3f}")

# Weighted-averaged scores
print("\nWeighted-averaged metrics:")
weighted_metrics = calculate_weighted_average(multiclass_cm)
for name in weighted_metrics:
    print(f"{name}: {weighted_metrics[name]:.3f}")

Per-class metrics:

Class 0:
precision: 0.714
recall: 0.769
f1_score: 0.741

Class 1:
precision: 0.789
recall: 0.750
f1_score: 0.769

Class 2:
precision: 0.800
recall: 0.800
f1_score: 0.800

Macro-averaged metrics:
macro_precision: 0.768
macro_recall: 0.773
macro_f1: 0.770

Weighted-averaged metrics:
weighted_precision: 0.772
weighted_recall: 0.771
weighted_f1: 0.771


## 3. Comparison with sklearn

In [7]:
# Import sklearn metrics for comparison
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Convert confusion matrices to true and predicted labels
def get_labels_from_confusion_matrix(cm):
    """Expand a confusion matrix into equivalent true/predicted label arrays.

    Cell ``cm[i][j]`` is treated as the number of samples whose true label is
    ``i`` (row index) and whose predicted label is ``j`` (column index). Each
    cell contributes that many ``(i, j)`` pairs, emitted in row-major order.

    Args:
        cm: Square sequence of rows holding non-negative integer counts.

    Returns:
        Tuple ``(y_true, y_pred)`` of 1-D integer NumPy arrays of equal
        length (the sum of all counts). Both are empty for an empty matrix.

    Raises:
        ValueError: If any row's length differs from the number of rows.
    """
    n_classes = len(cm)
    y_true = []
    y_pred = []
    for true_label, row in enumerate(cm):
        # Indexing columns by the row count would silently drop extra columns
        # (or raise IndexError on short rows), so reject non-square input.
        if len(row) != n_classes:
            raise ValueError(
                f"confusion matrix must be square: row {true_label} has "
                f"{len(row)} entries, expected {n_classes}"
            )
        for pred_label, count in enumerate(row):
            y_true.extend([true_label] * count)
            y_pred.extend([pred_label] * count)
    # Explicit integer dtype keeps empty results as labels, not float64.
    return np.array(y_true, dtype=int), np.array(y_pred, dtype=int)

## Comparing with sklearn metrics

Let's compare our implementation with sklearn's implementation for both binary and multi-class cases.

In [10]:
# Side-by-side check of the binary metrics against sklearn
print("Binary Classification Comparison:")
print("---------------------------------")

# sklearn scores per-sample labels, so expand the matrix into label arrays
y_true_bin, y_pred_bin = get_labels_from_confusion_matrix(binary_cm)

# Reference values from sklearn; label 1 is the positive class
sk_accuracy = accuracy_score(y_true_bin, y_pred_bin)
sk_precision, sk_recall, sk_f1 = (
    scorer(y_true_bin, y_pred_bin, pos_label=1)
    for scorer in (precision_score, recall_score, f1_score)
)

# Values from our own implementation, computed from the matrix directly
our_metrics = evaluate_binary(binary_cm)

# Print ours on the left, sklearn on the right
print("\nOur Implementation vs Sklearn:")
print(f"Accuracy:  {our_metrics['accuracy']:.3f} vs {sk_accuracy:.3f}")
print(f"Precision: {our_metrics['precision']:.3f} vs {sk_precision:.3f}")
print(f"Recall:    {our_metrics['recall']:.3f} vs {sk_recall:.3f}")
print(f"F1 Score:  {our_metrics['f1_score']:.3f} vs {sk_f1:.3f}")

Binary Classification Comparison:
---------------------------------

Our Implementation vs Sklearn:
Accuracy:  0.850 vs 0.850
Precision: 0.778 vs 0.778
Recall:    0.875 vs 0.875
F1 Score:  0.824 vs 0.824


In [11]:
# Multi-class classification comparison: our macro and per-class metrics
# printed next to sklearn's for the same confusion matrix.
print("\nMulti-class Classification Comparison:")
print("---------------------------------")

# sklearn scores per-sample labels, so expand the matrix into label arrays
y_true_multi, y_pred_multi = get_labels_from_confusion_matrix(multiclass_cm)

# Our implementation
our_macro = calculate_macro_average(multiclass_cm)

# Sklearn metrics
sk_precision_macro = precision_score(y_true_multi, y_pred_multi, average='macro')
sk_recall_macro = recall_score(y_true_multi, y_pred_multi, average='macro')
sk_f1_macro = f1_score(y_true_multi, y_pred_multi, average='macro')

# Compare results (ours on the left, sklearn on the right)
print("\nMacro Averages - Our Implementation vs Sklearn:")
print(f"Precision: {our_macro['macro_precision']:.3f} vs {sk_precision_macro:.3f}")
print(f"Recall:    {our_macro['macro_recall']:.3f} vs {sk_recall_macro:.3f}")
print(f"F1 Score:  {our_macro['macro_f1']:.3f} vs {sk_f1_macro:.3f}")

# Compare per-class metrics
print("\nPer-class metrics comparison:")

# average=None yields one score per class. These do not depend on the loop
# variable, so compute each array once instead of on every iteration.
sk_precision_per_class = precision_score(y_true_multi, y_pred_multi, average=None)
sk_recall_per_class = recall_score(y_true_multi, y_pred_multi, average=None)
sk_f1_per_class = f1_score(y_true_multi, y_pred_multi, average=None)

# NOTE(review): multiclass_cm at this point is the generic 3-class matrix from
# the "Multi-class Classification Test" cell, not the disease-severity matrix,
# so the severity names below are only positional labels for classes 0-2.
# Confirm which matrix this comparison is meant to use.
for i, severity in enumerate(['Severe', 'Moderate', 'Mild']):
    print(f"\n{severity}:")
    our_metrics = calculate_per_class_metrics(multiclass_cm, i)

    # Sklearn per-class metrics for class i
    sk_precision = sk_precision_per_class[i]
    sk_recall = sk_recall_per_class[i]
    sk_f1 = sk_f1_per_class[i]

    print(f"Precision: {our_metrics['precision']:.3f} vs {sk_precision:.3f}")
    print(f"Recall:    {our_metrics['recall']:.3f} vs {sk_recall:.3f}")
    print(f"F1 Score:  {our_metrics['f1_score']:.3f} vs {sk_f1:.3f}")


Multi-class Classification Comparison:
---------------------------------

Macro Averages - Our Implementation vs Sklearn:
Precision: 0.768 vs 0.768
Recall:    0.773 vs 0.773
F1 Score:  0.770 vs 0.770

Per-class metrics comparison:

Severe:
Precision: 0.714 vs 0.714
Recall:    0.769 vs 0.769
F1 Score:  0.741 vs 0.741

Moderate:
Precision: 0.789 vs 0.789
Recall:    0.750 vs 0.750
F1 Score:  0.769 vs 0.769

Mild:
Precision: 0.800 vs 0.800
Recall:    0.800 vs 0.800
F1 Score:  0.800 vs 0.800
