# 🚀 Google Colab Setup

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogautier1980/sandbox-ml/blob/main/cours/06_reseaux_neurones_fondamentaux/06_demo_mlp_numpy.ipynb)

**Si vous exécutez ce notebook sur Google Colab**, exécutez la cellule suivante pour installer les dépendances.

In [None]:
# Dependency installation (Google Colab only).
#
# This cell must be real multi-line code: when it is collapsed onto a single
# line beginning with "#", the whole installer becomes one comment and
# nothing is ever installed. Packages are installed through the running
# interpreter's own pip so they land in the kernel's environment, and so the
# cell stays valid plain Python (no IPython "!" shell escapes).
import subprocess
import sys

IN_COLAB = 'google.colab' in sys.modules


def _pip_install(*packages):
    """Quietly install *packages* with the pip of the running interpreter.

    Failures are reported by pip on stderr but not raised (``check=False``),
    which keeps the best-effort behaviour of a ``!pip install -q`` line.
    """
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', *packages],
        check=False,
    )


if IN_COLAB:
    print('📦 Installation des packages...')

    # Core ML packages
    _pip_install('numpy', 'pandas', 'matplotlib', 'seaborn', 'scikit-learn')

    # Chapter detection: extra dependencies are chosen from the notebook name
    notebook_name = '06_demo_mlp_numpy.ipynb'  # Will be replaced automatically

    # Ch 06-08: Deep Learning
    if any(tag in notebook_name for tag in ('06_', '07_', '08_')):
        _pip_install('torch', 'torchvision', 'torchaudio')

    # Ch 08: NLP
    if '08_' in notebook_name:
        _pip_install('transformers', 'datasets', 'tokenizers')
        if 'rag' in notebook_name:
            _pip_install('sentence-transformers', 'faiss-cpu', 'rank-bm25')

    # Ch 09: Reinforcement Learning
    if '09_' in notebook_name:
        _pip_install('gymnasium[classic-control]')

    # Ch 04: Boosting
    if '04_' in notebook_name and 'boosting' in notebook_name:
        _pip_install('xgboost', 'lightgbm', 'catboost')

    # Ch 05: Advanced clustering
    if '05_' in notebook_name:
        _pip_install('umap-learn')

    # Ch 11: Time series
    if '11_' in notebook_name:
        _pip_install('statsmodels', 'prophet')

    # Ch 12: Advanced vision
    if '12_' in notebook_name:
        _pip_install('ultralytics', 'timm', 'segmentation-models-pytorch')

    # Ch 13: Recommendation
    if '13_' in notebook_name:
        _pip_install('scikit-surprise', 'implicit')

    # Ch 14: MLOps
    if '14_' in notebook_name:
        _pip_install('mlflow', 'fastapi', 'pydantic')

    print('✅ Installation terminée !')
else:
    print('ℹ️  Environnement local détecté, les packages sont déjà installés.')

# Chapitre 06 - Démonstration : MLP from Scratch avec NumPy

**Objectif** : Implémenter un réseau de neurones multicouche (MLP) complet en NumPy pur pour comprendre les mécanismes internes.

**Contenu** :
1. Forward pass (propagation avant)
2. Backward pass (rétropropagation)
3. Optimisation SGD avec momentum
4. Classification MNIST
5. Visualisation des poids et des frontières de décision

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

# Seed NumPy's global random generator so that repeated runs of the notebook
# draw the same random numbers.
np.random.seed(42)

## 1. Fonctions d'activation et leurs dérivées

In [None]:
class Activation:
    """Activation functions and their derivatives.

    The two derivative helpers take the *activation output* ``a`` (the value
    already returned by the matching activation), not the pre-activation
    ``z``.
    """

    @staticmethod
    def sigmoid(z):
        """Sigmoid: σ(z) = 1 / (1 + e^(-z))."""
        # Clipping keeps np.exp from overflowing for large |z|
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    @staticmethod
    def sigmoid_derivative(a):
        """Sigmoid derivative from its output: σ'(z) = a * (1 - a), a = σ(z)."""
        return a * (1 - a)

    @staticmethod
    def relu(z):
        """ReLU: max(0, z), element-wise."""
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(a):
        """ReLU derivative from its output: 1.0 where a > 0, else 0.0.

        Since a = relu(z) is positive exactly where z is positive, testing
        ``a`` gives the same mask as testing ``z``.
        """
        return (a > 0).astype(float)

    @staticmethod
    def softmax(z):
        """Softmax: exp(z_i) / sum_j exp(z_j).

        Parameters
        ----------
        z : array-like
            Either a single vector of logits (1-D), or a batch with one
            sample per row; batches are normalised along axis 1.

        Returns
        -------
        numpy.ndarray
            Probabilities with the same shape as ``z``.
        """
        z = np.asarray(z)
        if z.ndim == 1:
            # A lone logit vector is handled as a batch of one sample
            return Activation.softmax(z[np.newaxis, :])[0]
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Visual sanity check of the activation functions
z = np.linspace(-5, 5, 100)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Sigmoid and ReLU share one layout: the function plus its dashed derivative.
# Both derivative helpers expect the activation output, so they are fed the
# values returned by the activation itself.
curves = [
    ('Sigmoid', Activation.sigmoid, Activation.sigmoid_derivative),
    ('ReLU', Activation.relu, Activation.relu_derivative),
]
for ax, (name, func, dfunc) in zip(axes, curves):
    values = func(z)
    ax.plot(z, values, label=name)
    ax.plot(z, dfunc(values), label="Dérivée", linestyle='--')
    ax.set_title(name)
    ax.legend()
    ax.grid(True)

# Softmax
# One sample per row: two identical samples with logits [1, 2, 3]. The array
# must NOT be transposed: a (3, 2) input would be read as three 2-class
# samples, each giving [0.5, 0.5], which cannot fill the three bars below.
z_soft = np.array([[1, 2, 3], [1, 2, 3]])
softmax_out = Activation.softmax(z_soft)
axes[2].bar(range(3), softmax_out[0])
axes[2].set_title('Softmax (z=[1,2,3])')
axes[2].set_ylabel('Probabilité')
axes[2].grid(True)

plt.tight_layout()
plt.show()

print(f"Softmax([1, 2, 3]) = {softmax_out[0]}")
print(f"Somme = {softmax_out[0].sum():.6f}")

## 2. Classe MLP from Scratch

In [None]:
class MLP:
    """Multi-layer perceptron trained by backpropagation, in pure NumPy.

    Hidden layers use the activation chosen at construction time; the last
    layer always applies softmax. Training is mini-batch SGD with momentum
    against one-hot encoded targets.
    """

    def __init__(self, layer_sizes, activation='relu', learning_rate=0.01, momentum=0.9):
        """
        Build the network and initialise its parameters.

        Parameters
        ----------
        layer_sizes : list
            [input_size, hidden1, hidden2, ..., output_size]
        activation : str
            'relu' selects ReLU for the hidden layers; any other value
            selects sigmoid.
        learning_rate : float
            Step size of the SGD update.
        momentum : float
            Momentum coefficient (0.0 = vanilla SGD).
        """
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.activation = activation

        self.weights = []
        self.biases = []
        self.velocity_w = []  # Momentum buffers, one per weight matrix
        self.velocity_b = []  # Momentum buffers, one per bias row

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # He scaling (2 / fan_in) for ReLU, 1 / fan_in for sigmoid
            scale = np.sqrt((2.0 if activation == 'relu' else 1.0) / fan_in)

            w = np.random.randn(fan_in, fan_out) * scale
            b = np.zeros((1, fan_out))

            self.weights.append(w)
            self.biases.append(b)
            self.velocity_w.append(np.zeros_like(w))
            self.velocity_b.append(np.zeros_like(b))

        # Metrics recorded by fit(); 'epoch' holds the 1-based epoch number
        # at which the matching 'loss' / 'accuracy' entries were measured.
        self.history = {'loss': [], 'accuracy': [], 'epoch': []}

    def _activate(self, z, layer_idx):
        """Apply the layer's activation (softmax on the last layer)."""
        if layer_idx == len(self.weights) - 1:  # Output layer
            return Activation.softmax(z)
        elif self.activation == 'relu':
            return Activation.relu(z)
        else:
            return Activation.sigmoid(z)

    def _activate_derivative(self, a, layer_idx):
        """Derivative of the layer's activation, from its output ``a``."""
        if layer_idx == len(self.weights) - 1:
            # The softmax derivative is folded into the output delta in backward()
            return 1
        elif self.activation == 'relu':
            return Activation.relu_derivative(a)
        else:
            return Activation.sigmoid_derivative(a)

    def forward(self, X):
        """Run a forward pass and cache the intermediate values.

        Stores the layer inputs/outputs in ``self.activations`` (starting
        with ``X`` itself) and the pre-activations in ``self.z_values``;
        backward() relies on ``self.activations``.

        Returns
        -------
        numpy.ndarray
            Output of the last (softmax) layer.
        """
        # np.asarray lets array-likes such as DataFrames be used as input
        self.activations = [np.asarray(X)]
        self.z_values = []

        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = self.activations[-1] @ w + b
            self.z_values.append(z)
            self.activations.append(self._activate(z, i))

        return self.activations[-1]

    def backward(self, X, y_true):
        """Backpropagate the error of the last forward pass and update parameters.

        Must be called right after forward() on the same batch, since it
        reads the cached ``self.activations``.

        Parameters
        ----------
        X : array-like
            The batch passed to forward(); only its row count is used here.
        y_true : array-like
            One-hot targets, same shape as the network output.
        """
        m = X.shape[0]
        y_pred = self.activations[-1]

        # Softmax + cross-entropy: the output-layer delta is simply y_pred - y_true
        delta = y_pred - np.asarray(y_true)

        for i in reversed(range(len(self.weights))):
            grad_w = self.activations[i].T @ delta / m
            grad_b = np.sum(delta, axis=0, keepdims=True) / m

            # Propagate the error to the previous layer BEFORE touching
            # weights[i]: the chain rule needs the weights that produced
            # this forward pass, not the freshly updated ones.
            if i > 0:
                prev_delta = (delta @ self.weights[i].T) * self._activate_derivative(
                    self.activations[i], i - 1
                )

            # Momentum update
            self.velocity_w[i] = self.momentum * self.velocity_w[i] - self.learning_rate * grad_w
            self.velocity_b[i] = self.momentum * self.velocity_b[i] - self.learning_rate * grad_b

            self.weights[i] += self.velocity_w[i]
            self.biases[i] += self.velocity_b[i]

            if i > 0:
                delta = prev_delta

    def fit(self, X, y, epochs=100, batch_size=32, X_val=None, y_val=None, verbose=True):
        """Train the network with shuffled mini-batches.

        Metrics on the full training set are appended to ``self.history``
        after the first epoch and then after every 10th epoch.

        Parameters
        ----------
        X : array-like
            Training inputs, one sample per row.
        y : array-like
            One-hot training targets.
        epochs : int
            Number of passes over the training set.
        batch_size : int
            Mini-batch size (the last batch of an epoch may be smaller).
        X_val, y_val : array-like, optional
            Validation inputs / one-hot targets; when both are given and
            ``verbose`` is true, validation accuracy is printed as well.
        verbose : bool
            Print a progress line each time metrics are recorded.
        """
        # Positional indexing below requires real arrays
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        for epoch in range(epochs):
            # Fresh shuffle every epoch
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                self.forward(X_batch)
                self.backward(X_batch, y_batch)

            # Only evaluate on epoch 1 and on every 10th epoch
            if (epoch + 1) % 10 != 0 and epoch != 0:
                continue

            y_pred = self.forward(X)
            # Categorical cross-entropy: sum over classes, mean over samples.
            # Averaging over every matrix element instead would divide the
            # loss by the number of classes.
            loss = -np.mean(np.sum(y * np.log(y_pred + 1e-8), axis=1))
            acc = accuracy_score(np.argmax(y, axis=1), np.argmax(y_pred, axis=1))

            self.history['epoch'].append(epoch + 1)
            self.history['loss'].append(loss)
            self.history['accuracy'].append(acc)

            if not verbose:
                continue

            val_str = ""
            if X_val is not None and y_val is not None:
                y_val_pred = self.forward(X_val)
                val_acc = accuracy_score(
                    np.argmax(np.asarray(y_val), axis=1), np.argmax(y_val_pred, axis=1)
                )
                val_str = f" - Val Acc: {val_acc:.4f}"

            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f} - Acc: {acc:.4f}{val_str}")

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        y_pred = self.forward(X)
        return np.argmax(y_pred, axis=1)

# Confirmation message shown once the class definition cell has run
print("Classe MLP implémentée avec succès!")

## 3. Test sur dataset simple (Make Moons)

In [None]:
# Two-moons toy dataset, split 80/20 into train and test
X_moons, y_moons = make_moons(n_samples=1000, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_moons,
    y_moons,
    test_size=0.2,
    random_state=42,
)

# One-hot encoding: row k of the 2x2 identity matrix encodes class k
identity = np.eye(2)
y_train_onehot = identity[y_train]
y_test_onehot = identity[y_test]

# Standardisation: statistics are fitted on the training split only and then
# reused for the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_name, split_data in (("Train", X_train_scaled), ("Test", X_test_scaled)):
    print(f"{split_name} shape: {split_data.shape}")

In [None]:
# Train a 2-16-16-2 ReLU network on the standardised moons data, reporting
# accuracy on the test split as validation
mlp = MLP(
    layer_sizes=[2, 16, 16, 2],
    activation='relu',
    learning_rate=0.1,
    momentum=0.9,
)
mlp.fit(
    X_train_scaled,
    y_train_onehot,
    epochs=200,
    batch_size=32,
    X_val=X_test_scaled,
    y_val=y_test_onehot,
    verbose=True,
)

In [None]:
# Learning curves
# fit() records metrics after epoch 1 and then after every 10th epoch, so a
# 200-epoch run yields 21 points (1, 10, 20, ..., 200). The x axis is built
# from that same rule so that it always matches the history length.
n_epochs = 200
logged_epochs = [e for e in range(1, n_epochs + 1) if e == 1 or e % 10 == 0]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(logged_epochs, mlp.history['loss'], marker='o')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss (Cross-Entropy)')
axes[0].set_title("Loss pendant l'entraînement")
axes[0].grid(True)

axes[1].plot(logged_epochs, mlp.history['accuracy'], marker='o', color='green')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].set_title("Accuracy pendant l'entraînement")
axes[1].grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Decision boundary
def plot_decision_boundary(model, X, y, title="Decision Boundary"):
    """Plot the class regions predicted by ``model`` over a 2-D feature plane.

    A regular grid covering the data range (plus a margin of 1 on each side)
    is classified with ``model.predict``; the predicted regions are drawn as
    filled contours and the samples ``X`` are overlaid, coloured by ``y``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(points)`` that returns one label per row.
    X : numpy.ndarray
        Samples; columns 0 and 1 are used as the two plotted features.
    y : array-like
        Labels used to colour the scattered samples.
    title : str
        Figure title.
    """
    step = 0.02
    margin = 1
    f1_lo, f1_hi = X[:, 0].min() - margin, X[:, 0].max() + margin
    f2_lo, f2_hi = X[:, 1].min() - margin, X[:, 1].max() + margin
    grid_f1, grid_f2 = np.meshgrid(
        np.arange(f1_lo, f1_hi, step),
        np.arange(f2_lo, f2_hi, step),
    )

    # Classify every grid node, then fold the labels back into the grid shape
    grid_points = np.column_stack([grid_f1.ravel(), grid_f2.ravel()])
    regions = model.predict(grid_points).reshape(grid_f1.shape)

    plt.figure(figsize=(8, 6))
    plt.contourf(grid_f1, grid_f2, regions, alpha=0.3, cmap='coolwarm')
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30, edgecolors='k', cmap='coolwarm')
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.colorbar(label='Classe')
    plt.show()


plot_decision_boundary(
    mlp,
    X_test_scaled,
    y_test,
    title="MLP Decision Boundary (Make Moons)",
)

## 4. Classification MNIST (chiffres manuscrits)

In [None]:
# Load MNIST
print("Chargement MNIST...")
# as_frame=False makes fetch_openml return NumPy arrays instead of a pandas
# DataFrame/Series. The rest of the notebook indexes by position
# (X_train[i], X[indices], X_test[idx:idx+1]); on a DataFrame, X_train[i] is
# a column lookup and fails.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
X_mnist = mnist.data.astype('float32') / 255.0  # Scale pixel values to [0, 1]  # type: ignore
y_mnist = mnist.target.astype('int')  # type: ignore

# Subsample to speed things up (first 7000 samples)
X_mnist_small = X_mnist[:7000]
y_mnist_small = y_mnist[:7000]

X_train, X_test, y_train, y_test = train_test_split(
    X_mnist_small, y_mnist_small, test_size=0.2, random_state=42
)

# One-hot encoding: row k of the 10x10 identity matrix encodes digit k
y_train_onehot = np.eye(10)[y_train]
y_test_onehot = np.eye(10)[y_test]

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

In [None]:
# Show the first 10 training digits with their labels
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    # Each sample is a flat vector of 784 pixels; fold it back to 28x28
    ax.imshow(X_train[i].reshape(28, 28), cmap='gray')
    ax.set_title(f"Label: {y_train[i]}")
    ax.axis('off')
plt.suptitle('Échantillons MNIST')
plt.tight_layout()
plt.show()

In [None]:
# Train the MLP (784 -> 128 -> 64 -> 10)
mlp_mnist = MLP(
    layer_sizes=[784, 128, 64, 10],
    activation='relu',
    learning_rate=0.1,
    momentum=0.9,
)

print("\nEntraînement MLP sur MNIST...")
# The test split doubles as validation data for the progress lines
mlp_mnist.fit(
    X_train,
    y_train_onehot,
    epochs=50,
    batch_size=64,
    X_val=X_test,
    y_val=y_test_onehot,
    verbose=True,
)

In [None]:
# Final evaluation on the held-out split
y_pred = mlp_mnist.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy test finale: {test_acc:.4f}")

# Confusion matrix: rows are the true digits, columns the predicted ones,
# which is what the axis labels below state
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Matrice de Confusion - MNIST')
plt.xlabel('Prédiction')
plt.ylabel('Vérité')
plt.show()

## 5. Visualisation des poids de la première couche

In [None]:
# Show the incoming weights of the first 64 units of hidden layer 1
weights_layer1 = mlp_mnist.weights[0]  # Shape: (784, 128), one column per hidden unit

fig, axes = plt.subplots(8, 8, figsize=(12, 12))
for unit_idx, panel in enumerate(axes.flat):
    if unit_idx < 64:
        # A unit's 784 incoming weights, folded back to the 28x28 image grid
        unit_weights = weights_layer1[:, unit_idx]
        panel.imshow(unit_weights.reshape(28, 28), cmap='coolwarm', vmin=-1, vmax=1)
    # Axes are hidden on every panel, drawn or not
    panel.axis('off')

plt.suptitle('Poids des 64 premiers neurones (Layer 1)', fontsize=16)
plt.tight_layout()
plt.show()

## 6. Prédictions sur nouveaux échantillons

In [None]:
# Predict 10 test samples drawn at random (without replacement)
n_samples = 10
indices = np.random.choice(len(X_test), n_samples, replace=False)

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for panel, sample_idx in zip(axes.flat, indices):
    true_label = y_test[sample_idx]

    # Forward a batch of one sample (the slice keeps the input 2-D) and read
    # the most probable class together with its probability
    probs = mlp_mnist.forward(X_test[sample_idx:sample_idx + 1])[0]
    pred_label = np.argmax(probs)
    confidence = probs[pred_label]

    # Title is green for a correct prediction, red otherwise
    is_correct = pred_label == true_label
    title_color = 'green' if is_correct else 'red'

    panel.imshow(X_test[sample_idx].reshape(28, 28), cmap='gray')
    panel.set_title(
        f"True: {true_label} | Pred: {pred_label} ({confidence:.2f})",
        color=title_color,
    )
    panel.axis('off')

plt.tight_layout()
plt.show()

## Conclusion

**Points clés** :
1. **Forward pass** : Calcul séquentiel des activations
2. **Backward pass** : Rétropropagation du gradient avec la chain rule
3. **Optimisation SGD + Momentum** : Accélère la convergence
4. **Initialisation Xavier/He** : Stabilise l'entraînement
5. **Softmax + Cross-Entropy** : Simplifie le gradient (y_pred - y_true)

**Résultats MNIST** :
- Accuracy ~95% avec architecture simple (784-128-64-10)
- Les poids de la couche 1 apprennent des "features" (contours, formes)
- Mini-batch SGD accélère l'entraînement vs full-batch

**Prochaines étapes** :
- Ajouter **Dropout** pour régularisation
- Implémenter **Batch Normalization**
- Utiliser **Adam optimizer** au lieu de SGD
- Tester sur datasets plus complexes (CIFAR-10)