# 🚀 Google Colab Setup

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogautier1980/sandbox-ml/blob/main/cours/01_fondamentaux_mathematiques/01_demo_optimisation.ipynb)

**Si vous exécutez ce notebook sur Google Colab**, exécutez la cellule suivante pour installer les dépendances.

In [None]:
# Dependency installation (Google Colab only).
#
# Uses ``python -m pip`` through subprocess instead of the IPython ``!pip``
# magic so that this cell is valid Python both inside a notebook and when the
# notebook is exported to a plain script.
import subprocess
import sys

IN_COLAB = 'google.colab' in sys.modules


def _pip_install(*packages):
    """Quietly install *packages* with the pip of the running interpreter.

    Each requirement is passed as its own argument (no shell), so extras such
    as ``gymnasium[classic-control]`` are not subject to shell globbing.
    A failed install does not raise (``check=False``), mirroring the
    best-effort behavior of ``!pip install`` in a notebook.
    """
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', *packages],
        check=False,
    )


if IN_COLAB:
    print('📦 Installation des packages...')

    # Core ML packages
    _pip_install('numpy', 'pandas', 'matplotlib', 'seaborn', 'scikit-learn')

    # Chapter detection: extra dependencies are chosen from the notebook name.
    # The original comment says this value is replaced automatically.
    notebook_name = '01_demo_optimisation.ipynb'

    # Ch 06-08: Deep Learning
    if any(x in notebook_name for x in ['06_', '07_', '08_']):
        _pip_install('torch', 'torchvision', 'torchaudio')

    # Ch 08: NLP
    if '08_' in notebook_name:
        _pip_install('transformers', 'datasets', 'tokenizers')
        if 'rag' in notebook_name:
            _pip_install('sentence-transformers', 'faiss-cpu', 'rank-bm25')

    # Ch 09: Reinforcement Learning
    if '09_' in notebook_name:
        _pip_install('gymnasium[classic-control]')

    # Ch 04: Boosting
    if '04_' in notebook_name and 'boosting' in notebook_name:
        _pip_install('xgboost', 'lightgbm', 'catboost')

    # Ch 05: Advanced clustering
    if '05_' in notebook_name:
        _pip_install('umap-learn')

    # Ch 11: Time series
    if '11_' in notebook_name:
        _pip_install('statsmodels', 'prophet')

    # Ch 12: Advanced vision
    if '12_' in notebook_name:
        _pip_install('ultralytics', 'timm', 'segmentation-models-pytorch')

    # Ch 13: Recommendation
    if '13_' in notebook_name:
        _pip_install('scikit-surprise', 'implicit')

    # Ch 14: MLOps
    if '14_' in notebook_name:
        _pip_install('mlflow', 'fastapi', 'pydantic')

    print('✅ Installation terminée !')
else:
    print('ℹ️  Environnement local détecté, les packages sont déjà installés.')

# Chapitre 01 - Démonstration : Calcul Différentiel et Optimisation

Ce notebook illustre les concepts d'optimisation fondamentaux pour le ML :
- Gradients et dérivées partielles
- Descente de gradient
- Convexité
- Méthodes d'optimisation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import seaborn as sns
from scipy import optimize

# Fix the NumPy global seed, then apply the seaborn grid style and the
# default figure size used by every plot in the notebook.
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Gradient et Dérivées Partielles

In [None]:
# 2D function: f(x, y) = x^2 + 2y^2
def f(x, y):
    """Evaluate f(x, y) = x^2 + 2*y^2.

    Only arithmetic operators are used, so the function accepts scalars as
    well as NumPy arrays (evaluated element-wise).
    """
    x_term = x**2
    y_term = 2 * y**2
    return x_term + y_term

# Gradient: grad f = [df/dx, df/dy] = [2x, 4y]
def grad_f(x, y):
    """Return the gradient of f(x, y) = x^2 + 2*y^2 as ``np.array([2x, 4y])``."""
    df_dx = 2 * x
    df_dy = 4 * y
    return np.array([df_dx, df_dy])

# Build an evaluation grid over [-3, 3] x [-3, 3]
x_range = np.linspace(-3, 3, 50)
y_range = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x_range, y_range)
Z = f(X, Y)

# Figure with two panels: 3D surface (left), contours + gradients (right)
fig = plt.figure(figsize=(16, 7))

# 3D surface
ax1 = fig.add_subplot(121, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap=cm.viridis, alpha=0.7)
ax1.set_xlabel('x', fontsize=12)
ax1.set_ylabel('y', fontsize=12)
ax1.set_zlabel('f(x, y)', fontsize=12)
ax1.set_title('Surface f(x, y) = x² + 2y²', fontsize=14)
fig.colorbar(surf, ax=ax1, shrink=0.5)

# Contour lines
ax2 = fig.add_subplot(122)
contour = ax2.contour(X, Y, Z, levels=20, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# Draw the gradient direction at a few sample points
points = [(2, 1), (1, 2), (-1.5, 1), (0, -2)]
for px, py in points:
    grad = grad_f(px, py)
    norm = np.linalg.norm(grad)
    # A zero gradient has no direction; skip the arrow instead of dividing by 0.
    if norm > 0:
        # Rescale to a fixed length of 0.5 so only the direction is shown
        grad_norm = grad / norm * 0.5
        ax2.arrow(px, py, grad_norm[0], grad_norm[1],
                  head_width=0.15, head_length=0.2, fc='red', ec='red', linewidth=2)
    ax2.plot(px, py, 'ro', markersize=8)

ax2.set_xlabel('x', fontsize=12)
ax2.set_ylabel('y', fontsize=12)
ax2.set_title('Contours + Gradients (flèches rouges)', fontsize=14)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print the (unnormalized) gradient at the same sample points
print("Gradient en quelques points:")
for px, py in points:
    print(f"  ∇f({px:4.1f}, {py:4.1f}) = {grad_f(px, py)}")

## 2. Descente de Gradient – Visualisation

In [None]:
def gradient_descent(grad_func, x0, learning_rate=0.1, n_iterations=50):
    """
    Run fixed-step gradient descent and record every iterate.

    Args:
        grad_func: callable returning the gradient at a point. It is called
            with the coordinates of the current point as separate positional
            arguments (``grad_func(x, y)`` for a 2D problem).
        x0: starting point, as a NumPy array or any sequence of numbers.
            It is copied, never modified.
        learning_rate: step size multiplying the gradient at each update.
        n_iterations: number of update steps to perform.

    Returns:
        Array of shape ``(n_iterations + 1, len(x0))`` holding the starting
        point followed by the point reached after each update.
    """
    # np.array copies, so the caller's x0 is left untouched; float dtype keeps
    # integer starting points from truncating the history.
    x = np.array(x0, dtype=float)
    history = [x]

    for _ in range(n_iterations):
        # Unpack the coordinates so the function works in any dimension
        grad = np.asarray(grad_func(*x), dtype=float)
        # Rebinding (not in-place update) keeps earlier history entries intact
        x = x - learning_rate * grad
        history.append(x)

    return np.array(history)

# Starting point shared by every run
x0 = np.array([2.5, 2.0])

# Compare three learning rates, one subplot each
learning_rates = [0.05, 0.2, 0.5]
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for ax, lr in zip(axes, learning_rates):
    # Run gradient descent on f with this step size
    history = gradient_descent(grad_f, x0, learning_rate=lr, n_iterations=30)

    # Level curves of f as background.
    # NOTE: X, Y, Z are the notebook-level grid variables; they must still
    # hold the 50x50 grid of f when this cell runs.
    contour = ax.contour(X, Y, Z, levels=20, cmap='viridis', alpha=0.6)

    # Trajectory, with start and end points highlighted
    ax.plot(history[:, 0], history[:, 1], 'r-o', linewidth=2, markersize=6,
            label='Trajectoire')
    ax.plot(x0[0], x0[1], 'g*', markersize=20, label='Départ')
    ax.plot(history[-1, 0], history[-1, 1], 'r*', markersize=20, label='Arrivée')

    ax.set_xlabel('x', fontsize=12)
    ax.set_ylabel('y', fontsize=12)
    ax.set_title(f'Learning Rate α = {lr}', fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

    # Annotate the objective value reached at the last iterate
    final_val = f(history[-1, 0], history[-1, 1])
    ax.text(0.05, 0.95, f'f final = {final_val:.4f}',
            transform=ax.transAxes, fontsize=11,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('Descente de Gradient - Impact du Learning Rate', fontsize=16)
plt.tight_layout()
plt.show()

## 3. Convergence de la Descente de Gradient

In [None]:
# Compare convergence speed for several learning rates
x0 = np.array([2.5, 2.0])
learning_rates = [0.01, 0.1, 0.3, 0.5]
n_iterations = 50

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

for lr in learning_rates:
    history = gradient_descent(grad_f, x0, learning_rate=lr, n_iterations=n_iterations)
    xs, ys = history[:, 0], history[:, 1]

    # Objective value at every iterate (f works element-wise on arrays)
    f_values = f(xs, ys)

    # Gradient norm at every iterate: grad_f returns a (2, n) array here,
    # so the norm is taken over axis 0 (one norm per iterate).
    grad_norms = np.linalg.norm(grad_f(xs, ys), axis=0)

    ax1.plot(f_values, '-o', label=f'α = {lr}', markersize=4)
    ax2.semilogy(grad_norms, '-o', label=f'α = {lr}', markersize=4)

ax1.set_xlabel('Itération', fontsize=12)
ax1.set_ylabel('f(x)', fontsize=12)
ax1.set_title('Convergence de la Fonction Objectif', fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

ax2.set_xlabel('Itération', fontsize=12)
ax2.set_ylabel('||∇f||', fontsize=12)
ax2.set_title('Convergence de la Norme du Gradient', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Fonction Convexe vs Non-Convexe

In [None]:
# Convex function: f(x) = x^2
x_range = np.linspace(-3, 3, 100)
f_convex = x_range**2

# Non-convex function: f(x) = x^4 - 5x^2 + 4
f_nonconvex = x_range**4 - 5*x_range**2 + 4

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Convex panel
ax1.plot(x_range, f_convex, 'b-', linewidth=2)
ax1.plot(0, 0, 'ro', markersize=15, label='Minimum global unique')
ax1.set_xlabel('x', fontsize=12)
ax1.set_ylabel('f(x)', fontsize=12)
ax1.set_title('Fonction Convexe: f(x) = x²', fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Non-convex panel
ax2.plot(x_range, f_nonconvex, 'r-', linewidth=2)
# Approximate minima: f'(x) = 4x^3 - 10x = 0 gives x = +/-sqrt(2.5) ~ +/-1.58,
# where f = -2.25.
minima = [(-1.58, -2.25), (1.58, -2.25)]
# Plot both minima in a single call so the legend gets one entry, not two
minima_x, minima_y = zip(*minima)
ax2.plot(minima_x, minima_y, 'go', markersize=12, label='Minimum local')
ax2.plot(0, 4, 'yo', markersize=12, label='Maximum local')
ax2.set_xlabel('x', fontsize=12)
ax2.set_ylabel('f(x)', fontsize=12)
ax2.set_title('Fonction Non-Convexe: f(x) = x⁴ - 5x² + 4', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Propriétés:")
print("  Fonction convexe: Tout minimum local est global")
print("  Fonction non-convexe: Peut avoir plusieurs minima locaux")
print("                       La descente de gradient peut converger vers un minimum local")

## 5. Comparaison : Gradient Descent vs Newton

In [None]:
# 1D function: f(x) = 0.5 * x^2 - 2*x + 3
def f_1d(x):
    """Evaluate the 1D quadratic f(x) = 0.5*x^2 - 2*x + 3 (scalar or array)."""
    quadratic = 0.5 * x**2
    linear = 2 * x
    return quadratic - linear + 3

# First derivative: f'(x) = x - 2
def df_1d(x):
    """Return f'(x) = x - 2, the derivative of the 1D quadratic f_1d."""
    slope = x - 2
    return slope

# Second derivative: f''(x) = 1
def d2f_1d(x):
    """Return f''(x), which is the constant 1.0 whatever the value of ``x``."""
    curvature = 1.0
    return curvature

# 1D gradient descent
def gd_1d(x0, lr=0.3, n_iter=20):
    """Run ``n_iter`` gradient-descent steps on f_1d starting from ``x0``.

    Each step moves against the derivative ``df_1d`` scaled by ``lr``.
    Returns an array of ``n_iter + 1`` values: ``x0`` followed by the point
    reached after each step.
    """
    trajectory = [x0]
    for _ in range(n_iter):
        current = trajectory[-1]
        trajectory.append(current - lr * df_1d(current))
    return np.array(trajectory)

# 1D Newton's method
def newton_1d(x0, n_iter=20):
    """Run ``n_iter`` Newton steps on f_1d starting from ``x0``.

    Each step subtracts ``df_1d(x) / d2f_1d(x)`` from the current point.
    Returns an array of ``n_iter + 1`` values: ``x0`` followed by the point
    reached after each step.
    """
    trajectory = [x0]
    for _ in range(n_iter):
        current = trajectory[-1]
        newton_step = df_1d(current) / d2f_1d(current)
        trajectory.append(current - newton_step)
    return np.array(trajectory)

# Starting point
x0 = 5.0

# Run both optimizers (Newton is given far fewer steps)
gd_history = gd_1d(x0, lr=0.5, n_iter=15)
newton_history = newton_1d(x0, n_iter=5)

# Figure: trajectories on the curve (left), distance to the minimum (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Trajectories drawn on top of the function
x_range = np.linspace(0, 6, 200)
ax1.plot(x_range, f_1d(x_range), 'b-', linewidth=2, label='f(x)')
ax1.plot(gd_history, f_1d(gd_history), 'ro-', markersize=8, linewidth=2, label='Gradient Descent')
ax1.plot(newton_history, f_1d(newton_history), 'gs-', markersize=10, linewidth=2, label='Newton')
ax1.plot(2, f_1d(2), 'k*', markersize=20, label='Minimum x=2')
ax1.set_xlabel('x', fontsize=12)
ax1.set_ylabel('f(x)', fontsize=12)
ax1.set_title("Trajectoires d'Optimisation", fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Distance to the minimizer x* = 2 (root of df_1d) at every iterate
gd_errors = np.abs(gd_history - 2)
newton_errors = np.abs(newton_history - 2)

ax2.semilogy(range(len(gd_errors)), gd_errors, 'ro-', markersize=8, linewidth=2, label='Gradient Descent')
ax2.semilogy(range(len(newton_errors)), newton_errors, 'gs-', markersize=10, linewidth=2, label='Newton')
ax2.set_xlabel('Itération', fontsize=12)
ax2.set_ylabel('|x - x*|', fontsize=12)
ax2.set_title('Convergence (échelle log)', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The histories include the starting point, so the number of iterations
# actually performed is len(history) - 1.
print(f"Gradient Descent: {len(gd_errors) - 1} itérations, erreur finale = {gd_errors[-1]:.6f}")
print(f"Newton: {len(newton_errors) - 1} itérations, erreur finale = {newton_errors[-1]:.2e}")
print("\nNewton converge BEAUCOUP plus vite (convergence quadratique)")
print("Mais coûteux en calcul (nécessite la hessienne)")

## 6. Application : Régression Linéaire par Descente de Gradient

In [None]:
# Synthetic regression data: y = 4 + 3*X + Gaussian noise (std 0.5),
# with X drawn uniformly from [0, 2). Seeded for reproducibility.
n_samples = 100
np.random.seed(42)
X = np.random.rand(n_samples) * 2
y = (4 + 3 * X) + 0.5 * np.random.randn(n_samples)

# Cost function: MSE = 1/n * sum((y_pred - y)^2)
def mse_cost(X, y, w, b):
    """Return the mean squared error of the line ``w * X + b`` against ``y``."""
    residual = (w * X + b) - y
    return np.mean(residual**2)

# Gradient of the MSE
def mse_gradient(X, y, w, b):
    """Return ``(dMSE/dw, dMSE/db)`` for the linear model ``w * X + b``.

    With ``error = w * X + b - y``, the partial derivatives are
    ``2 * mean(error * X)`` and ``2 * mean(error)``.
    """
    residual = (w * X + b) - y
    d_w = 2 * np.mean(residual * X)
    d_b = 2 * np.mean(residual)
    return d_w, d_b

# Gradient descent on the MSE of the linear model y = w * X + b
w, b = 0.0, 0.0  # Both parameters start at zero
learning_rate = 0.1
n_iterations = 100

# Histories start with the initial parameters, so each ends up with
# n_iterations + 1 entries.
w_history = [w]
b_history = [b]
cost_history = [mse_cost(X, y, w, b)]

for _ in range(n_iterations):
    # Both gradients are computed before either parameter is updated
    grad_w, grad_b = mse_gradient(X, y, w, b)
    w = w - learning_rate * grad_w
    b = b - learning_rate * grad_b

    w_history.append(w)
    b_history.append(b)
    cost_history.append(mse_cost(X, y, w, b))

# 3.0 and 4.0 are the slope and intercept used to generate the data
print(f"Paramètres finaux après {n_iterations} itérations:")
print(f"  w (pente) = {w:.4f} (vrai: 3.0)")
print(f"  b (biais) = {b:.4f} (vrai: 4.0)")
print(f"  MSE final = {cost_history[-1]:.4f}")

In [None]:
# 2x2 summary figure of the regression run
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# 1. Data and the fitted regression line
ax1.scatter(X, y, alpha=0.6, label='Données')
# Two endpoints are enough to draw a straight line
X_line = np.array([0, 2])
y_line = w * X_line + b
ax1.plot(X_line, y_line, 'r-', linewidth=2, label=f'y = {w:.2f}x + {b:.2f}')
ax1.set_xlabel('X', fontsize=12)
ax1.set_ylabel('y', fontsize=12)
ax1.set_title('Régression Linéaire Finale', fontsize=14)
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# 2. Cost per iteration
ax2.plot(cost_history, 'b-', linewidth=2)
ax2.set_xlabel('Itération', fontsize=12)
ax2.set_ylabel('MSE', fontsize=12)
ax2.set_title('Convergence du Coût (MSE)', fontsize=14)
ax2.grid(True, alpha=0.3)

# 3. Slope w per iteration, against the slope used to generate the data
ax3.plot(w_history, 'g-', linewidth=2)
ax3.axhline(3, color='r', linestyle='--', linewidth=2, label='Valeur vraie')
ax3.set_xlabel('Itération', fontsize=12)
ax3.set_ylabel('w (pente)', fontsize=12)
ax3.set_title('Convergence de w', fontsize=14)
ax3.legend(fontsize=11)
ax3.grid(True, alpha=0.3)

# 4. Intercept b per iteration, against the intercept used to generate the data
ax4.plot(b_history, 'purple', linewidth=2)
ax4.axhline(4, color='r', linestyle='--', linewidth=2, label='Valeur vraie')
ax4.set_xlabel('Itération', fontsize=12)
ax4.set_ylabel('b (biais)', fontsize=12)
ax4.set_title('Convergence de b', fontsize=14)
ax4.legend(fontsize=11)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Paysage d'Optimisation (Surface de Coût)

In [None]:
# Grid of (w, b) values over which the cost surface is evaluated
w_range = np.linspace(0, 6, 100)
b_range = np.linspace(0, 8, 100)
W, B = np.meshgrid(w_range, b_range)

# MSE at every grid point, computed in one broadcast expression.
# Same formula as mse_cost: adding a trailing axis to W and B broadcasts them
# against the samples, and the mean is taken over that sample axis.
grid_residuals = W[..., np.newaxis] * X + B[..., np.newaxis] - y
Cost = np.mean(grid_residuals**2, axis=-1)

# Figure: 3D surface (left), contours (right), both with the GD trajectory
fig = plt.figure(figsize=(18, 7))

# 3D surface
ax1 = fig.add_subplot(121, projection='3d')
surf = ax1.plot_surface(W, B, Cost, cmap=cm.viridis, alpha=0.7)

# Gradient-descent trajectory recorded during training
ax1.plot(w_history, b_history, cost_history, 'r-o', linewidth=2, markersize=3)
ax1.scatter([w_history[0]], [b_history[0]], [cost_history[0]],
            color='green', s=200, marker='*', label='Départ')
ax1.scatter([w_history[-1]], [b_history[-1]], [cost_history[-1]],
            color='red', s=200, marker='*', label='Arrivée')

ax1.set_xlabel('w (pente)', fontsize=12)
ax1.set_ylabel('b (biais)', fontsize=12)
ax1.set_zlabel('MSE', fontsize=12)
ax1.set_title('Surface de Coût 3D', fontsize=14)
ax1.legend(fontsize=10)

# 2D contours + trajectory
ax2 = fig.add_subplot(122)
contour = ax2.contour(W, B, Cost, levels=30, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# Trajectory with start and end points highlighted
ax2.plot(w_history, b_history, 'r-o', linewidth=2, markersize=5, label='Descente de gradient')
ax2.plot(w_history[0], b_history[0], 'g*', markersize=20, label='Départ')
ax2.plot(w_history[-1], b_history[-1], 'r*', markersize=20, label='Arrivée')

ax2.set_xlabel('w (pente)', fontsize=12)
ax2.set_ylabel('b (biais)', fontsize=12)
ax2.set_title('Contours de Coût + Trajectoire GD', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Optimisation avec SciPy

In [None]:
# Function to minimize: Rosenbrock (a classic optimization test function)
def rosenbrock(x):
    """Evaluate (1 - x[0])^2 + 100 * (x[1] - x[0]^2)^2 for an indexable ``x``."""
    first, second = x[0], x[1]
    offset_term = (1 - first)**2
    valley_term = 100 * (second - first**2)**2
    return offset_term + valley_term

# Gradient of the Rosenbrock function
def rosenbrock_grad(x):
    """Return the gradient of the Rosenbrock function at ``x`` as a 2-element array.

    d/dx0 = -2*(1 - x0) - 400*x0*(x1 - x0^2)
    d/dx1 = 200*(x1 - x0^2)
    """
    # x1 - x0^2 appears in both partial derivatives
    valley_gap = x[1] - x[0]**2
    d_x0 = -2*(1 - x[0]) - 400*x[0]*valley_gap
    d_x1 = 200*valley_gap
    return np.array([d_x0, d_x1])

# Initial point
x0 = np.array([0.0, 0.0])

# Optimization methods to compare; results are kept by method name
methods = ['BFGS', 'CG', 'Nelder-Mead']
results = {}

for method in methods:
    # Only BFGS and CG are given the analytic gradient; Nelder-Mead is run
    # without one, as in the original call.
    extra_args = {'jac': rosenbrock_grad} if method in ('BFGS', 'CG') else {}
    result = optimize.minimize(rosenbrock, x0, method=method, **extra_args)

    results[method] = result
    print(f"\n{method}:")
    print(f"  Solution: {result.x}")
    print(f"  Valeur: {result.fun:.6e}")
    print(f"  Itérations: {result.nit}")
    print(f"  Succès: {result.success}")

# The global minimum of the Rosenbrock function is at (1, 1)
print("\nMinimum théorique: [1.0, 1.0], f = 0")

In [None]:
# Evaluate the Rosenbrock function on a grid for plotting
x_range = np.linspace(-1.5, 1.5, 200)
y_range = np.linspace(-0.5, 1.5, 200)
X_rose, Y_rose = np.meshgrid(x_range, y_range)
# Reuse rosenbrock() rather than repeating the formula: it only indexes
# x[0] and x[1] and uses arithmetic, so it evaluates element-wise on the grid.
Z_rose = rosenbrock((X_rose, Y_rose))

fig, ax = plt.subplots(figsize=(12, 10))

# Log-spaced contour levels make the narrow curved valley visible
levels = np.logspace(-1, 3.5, 25)
contour = ax.contour(X_rose, Y_rose, Z_rose, levels=levels,
                      cmap='viridis', linewidths=1)
ax.clabel(contour, inline=True, fontsize=8)

# Solution found by each method
colors = {'BFGS': 'red', 'CG': 'blue', 'Nelder-Mead': 'green'}
for method, result in results.items():
    ax.plot(result.x[0], result.x[1], 'o', color=colors[method],
            markersize=12, label=f'{method}: ({result.x[0]:.3f}, {result.x[1]:.3f})')

# Theoretical minimum
ax.plot(1, 1, 'k*', markersize=20, label='Minimum global (1, 1)')

# Starting point
ax.plot(x0[0], x0[1], 'wo', markersize=15, markeredgecolor='black',
        markeredgewidth=2, label='Départ (0, 0)')

ax.set_xlabel('x₀', fontsize=12)
ax.set_ylabel('x₁', fontsize=12)
ax.set_title('Fonction de Rosenbrock - Comparaison des Méthodes', fontsize=14)
ax.legend(fontsize=11, loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Résumé

Dans ce notebook, nous avons exploré :

1. **Gradient** : Direction de plus forte croissance, calcul de dérivées partielles
2. **Descente de gradient** : Algorithme d'optimisation de base en ML
3. **Learning rate** : Impact crucial sur la convergence
4. **Convexité** : Garantie de convergence vers le minimum global
5. **Méthode de Newton** : Convergence quadratique mais coûteuse
6. **Application pratique** : Régression linéaire par gradient descent
7. **Paysage d'optimisation** : Visualisation de la surface de coût
8. **SciPy optimize** : Méthodes d'optimisation avancées (BFGS, CG, etc.)

Ces techniques sont **fondamentales** pour :
- Entraînement de réseaux de neurones (backpropagation)
- Optimisation d'hyperparamètres
- Résolution de problèmes ML en général

**Next steps** : SGD, Momentum, Adam dans le Chapitre 06 (Deep Learning) !