In [None]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

np.random.seed(42)
print("Libraries loaded successfully.")

## The Three Multiclass Strategies

**One-vs-Rest (OvR):** Train K binary classifiers, one for each class vs all others.
- For K classes, train K classifiers
- Classifier k: "Is this class k or not?"
- Prediction: Pick class with highest confidence

**One-vs-One (OvO):** Train a classifier for every pair of classes.
- For K classes, train K(K-1)/2 classifiers
- Each classifier distinguishes between exactly 2 classes
- Prediction: Voting among all classifiers

**Softmax (Multinomial):** Single model that outputs K probabilities.
- Single classifier with K output nodes
- Softmax function ensures probabilities sum to 1
- Prediction: Class with highest probability

---

## Experiment 1: Iris Classification

Let's compare all three approaches on the classic Iris dataset (3 classes, 4 features).

In [None]:
# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target
class_names = iris.target_names

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance estimates (fitting on all rows would leak test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize features: statistics come from the training rows only and are
# then applied unchanged to every other row.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for the 2-D visualization
# cell. That cell re-splits with the same test_size/random_state, so it gets
# the same train/test row partition as above.
X_scaled = scaler.transform(X)

print("Iris Dataset:")
print(f"  Classes: {list(class_names)}")
print(f"  Samples: {X.shape[0]}, Features: {X.shape[1]}")
print(f"  Train: {len(X_train)}, Test: {len(X_test)}")

In [None]:
# Train all three multiclass strategies on the same split and time each fit.


def _fit_timed(model, X_fit, y_fit):
    """Fit ``model`` in place and return ``(model, elapsed_ms)``.

    Uses ``time.perf_counter``, a monotonic high-resolution clock, so the
    measurement is not affected by wall-clock adjustments the way
    ``time.time`` can be.
    """
    t0 = time.perf_counter()
    model.fit(X_fit, y_fit)
    return model, (time.perf_counter() - t0) * 1000


# One-vs-Rest
ovr, ovr_time = _fit_timed(OneVsRestClassifier(LogisticRegression(max_iter=1000)), X_train, y_train)
ovr_acc = ovr.score(X_test, y_test)

# One-vs-One
ovo, ovo_time = _fit_timed(OneVsOneClassifier(LogisticRegression(max_iter=1000)), X_train, y_train)
ovo_acc = ovo.score(X_test, y_test)

# Softmax (Multinomial)
# `multi_class` is intentionally not passed: with the default lbfgs solver
# LogisticRegression already fits a multinomial (softmax) model when there are
# 3+ classes, and the parameter is deprecated since scikit-learn 1.5.
softmax, softmax_time = _fit_timed(LogisticRegression(max_iter=1000), X_train, y_train)
softmax_acc = softmax.score(X_test, y_test)

# (label, test accuracy, fit time in ms, number of fitted binary/unified models)
summary_rows = [
    ("One-vs-Rest (OvR)", ovr_acc, ovr_time, len(ovr.estimators_)),
    ("One-vs-One (OvO)", ovo_acc, ovo_time, len(ovo.estimators_)),
    ("Softmax", softmax_acc, softmax_time, 1),
]

print("=" * 60)
print("MULTICLASS STRATEGY COMPARISON ON IRIS")
print("=" * 60)
print(f"\n{'Strategy':<20} {'Accuracy':<15} {'Time (ms)':<15} {'# Classifiers'}")
print("-" * 60)
# Rows use the same field widths as the header so columns stay aligned
# regardless of how many digits the timings have.
for label, acc, fit_ms, n_models in summary_rows:
    print(f"{label:<20} {acc:<15.4f} {fit_ms:<15.2f} {n_models}")
print("-" * 60)
print("-" * 60)

In [None]:
# Recap of the Iris experiment: how many models each strategy trains, what data
# each one sees, and why the accuracies end up close together.
iris_k = len(class_names)          # number of classes
iris_pairs = len(ovo.estimators_)  # pairwise models actually fitted by OvO

interpretation_lines = [
    "\nINTERPRETATION:",
    "\n1. Number of Classifiers:",
    f"   - OvR: K = {iris_k} classifiers (one per class)",
    f"   - OvO: K(K-1)/2 = {iris_k}*{iris_k - 1}/2 = {iris_pairs} classifiers",
    "   - Softmax: 1 unified model",
    "\n2. Training Complexity:",
    "   - OvR: Each classifier sees ALL training data",
    "   - OvO: Each classifier sees only data from 2 classes (smaller subsets)",
    "   - Softmax: Single optimization over all data",
    "\n3. For Iris (3 classes, 150 samples):",
    "   - All three perform similarly because classes are well-separated",
    "   - Differences become more pronounced with more classes or overlap",
]
print("\n".join(interpretation_lines))

## Experiment 2: Decision Boundary Visualization

Let's visualize how each strategy partitions the feature space. We'll use only 2 features for visualization.

In [None]:
# Use only 2 features so the decision regions can be drawn in a plane
X_2d = X_scaled[:, :2]  # sepal length, sepal width
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d, y, test_size=0.3, random_state=42
)

# Train models on 2D data
ovr_2d = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train_2d, y_train_2d)
ovo_2d = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X_train_2d, y_train_2d)
# `multi_class` is intentionally not passed: with the default lbfgs solver
# LogisticRegression already fits a multinomial (softmax) model when there are
# 3+ classes, and the parameter is deprecated since scikit-learn 1.5.
softmax_2d = LogisticRegression(max_iter=1000).fit(X_train_2d, y_train_2d)

# Create mesh grid for decision boundary: a regular lattice covering the data
# range plus a margin of 1 (in scaled units) on every side.
h = 0.02  # grid step, in scaled feature units
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

print("Models trained on 2D features for visualization.")

In [None]:
# Plot decision boundaries: one panel per strategy, predicted regions shaded
# and all samples overlaid.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
models = [ovr_2d, ovo_2d, softmax_2d]
titles = ['One-vs-Rest (OvR)', 'One-vs-One (OvO)', 'Softmax (Multinomial)']
# One colour per class, shared by the shaded regions and the scatter points so
# a point's colour can be compared directly with the region it falls in.
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Half-integer levels (-0.5, 0.5, 1.5, ...) put each integer class label in its
# own band, so band i is filled with colors[i].
region_levels = np.arange(len(class_names) + 1) - 0.5

for ax, model, title in zip(axes, models, titles):
    # Predict a class for every grid node, then restore the grid shape
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, levels=region_levels, colors=colors, alpha=0.3)
    # Interior levels only: these lie exactly between adjacent class labels
    ax.contour(xx, yy, Z, levels=region_levels[1:-1], colors='black', linewidths=0.5)

    for i, (color, name) in enumerate(zip(colors, class_names)):
        mask = y == i
        ax.scatter(X_2d[mask, 0], X_2d[mask, 1], c=color, label=name,
                   edgecolors='black', s=50, alpha=0.7)

    ax.set_xlabel('Sepal Length (scaled)', fontsize=11)
    ax.set_ylabel('Sepal Width (scaled)', fontsize=11)
    ax.set_title(f'{title}\nAccuracy: {model.score(X_test_2d, y_test_2d):.3f}', fontsize=12, fontweight='bold')
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("OBSERVATION:")
print("  - All three methods produce LINEAR decision boundaries (because base is Logistic Regression)")
print("  - OvR: Each boundary separates one class from the rest")
print("  - OvO: Boundaries emerge from voting among pairwise classifiers")
print("  - Softmax: Single unified model produces all boundaries simultaneously")

## Experiment 3: Scaling to More Classes

Let's see how the methods compare as we increase the number of classes.

In [None]:
# Compare scaling behavior: accuracy, model count and fit time as K grows
n_classes_list = [3, 5, 7, 10]
results = []

for n_classes in n_classes_list:
    # Generate synthetic data (one cluster per class, fixed seed)
    X_synth, y_synth = make_classification(n_samples=500, n_features=10, n_informative=8,
                                            n_classes=n_classes, n_clusters_per_class=1,
                                            random_state=42)
    # Split first, then fit the scaler on the training rows only, so the
    # held-out rows do not leak into the preprocessing statistics.
    X_tr, X_te, y_tr, y_te = train_test_split(X_synth, y_synth, test_size=0.3, random_state=42)
    synth_scaler = StandardScaler().fit(X_tr)
    X_tr, X_te = synth_scaler.transform(X_tr), synth_scaler.transform(X_te)

    # Fresh, unfitted models each iteration. They live in a dict rather than in
    # `ovr` / `ovo` so the Iris models fitted earlier are not overwritten.
    # `multi_class` is not passed to the softmax model: lbfgs already fits a
    # multinomial model for 3+ classes and the parameter is deprecated since
    # scikit-learn 1.5.
    strategies = {
        'OvR': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
        'OvO': OneVsOneClassifier(LogisticRegression(max_iter=1000)),
        'Softmax': LogisticRegression(max_iter=1000),
    }

    row = {
        'K': n_classes,
        'OvR_clf': n_classes,
        'OvO_clf': n_classes * (n_classes - 1) // 2,
    }
    for label, model in strategies.items():
        # perf_counter is monotonic and high-resolution; only .fit() is timed
        t0 = time.perf_counter()
        model.fit(X_tr, y_tr)
        row[f'{label}_time'] = (time.perf_counter() - t0) * 1000
        row[f'{label}_acc'] = model.score(X_te, y_te)
    results.append(row)

columns = (f"{'K':<5} {'OvR clf':<10} {'OvO clf':<10} "
           f"{'OvR acc':<10} {'OvO acc':<10} {'Softmax acc':<13} "
           f"{'OvR ms':<10} {'OvO ms':<10} {'Softmax ms'}")
rule_width = max(70, len(columns))

print("=" * rule_width)
print("SCALING COMPARISON: Number of Classes vs Performance")
print("=" * rule_width)
print(f"\n{columns}")
print("-" * rule_width)
for r in results:
    # Fit times were already being measured; report them next to the accuracies
    print(f"{r['K']:<5} {r['OvR_clf']:<10} {r['OvO_clf']:<10} "
          f"{r['OvR_acc']:<10.3f} {r['OvO_acc']:<10.3f} {r['Softmax_acc']:<13.3f} "
          f"{r['OvR_time']:<10.1f} {r['OvO_time']:<10.1f} {r['Softmax_time']:.1f}")

print("\nKEY INSIGHT:")
print("  OvO classifier count grows as K(K-1)/2 - quadratic!")
print("  For K=10: OvO needs 45 classifiers vs OvR's 10")
print("  Softmax scales best: always just 1 model regardless of K")

## When to Use Which Strategy?

| Scenario | Recommended | Why |
|----------|-------------|-----|
| Few classes (K < 5) | **Any** | All perform similarly |
| Many classes (K > 10) | **Softmax** or **OvR** | OvO has too many classifiers |
| Binary classifiers are expensive | **Softmax** | Single model |
| Classes are imbalanced | **OvO** | Pairwise subsets avoid the extra "one class vs. everything else" imbalance that OvR introduces |
| Need probability outputs | **Softmax** | Natural probability interpretation |
| Using non-probabilistic base (SVM) | **OvR** or **OvO** | Standard approaches for SVM |

---

## Summary

**One-vs-Rest (OvR):**
- Train K binary classifiers
- Each sees all data (can be slow for large datasets)
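- May have ambiguous regions (no classifier, or more than one, claims a point); these are resolved by picking the class with the highest score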

**One-vs-One (OvO):**
- Train K(K-1)/2 classifiers
- Each sees only a subset of the data (faster per classifier)
- Prediction is by majority vote; ties are broken using the pairwise classifiers' confidence scores

**Softmax:**
- Single unified model
- Scales best with number of classes: still one model (with K outputs), versus K models for OvR and K(K-1)/2 for OvO
- Outputs proper probabilities