In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Silence only deprecation-style chatter. A blanket filterwarnings('ignore')
# would also hide UserWarning-based diagnostics such as solver convergence
# warnings, which indicate that fitted coefficients may not be trustworthy.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Seed NumPy's global RNG so any code relying on it is reproducible.
np.random.seed(42)
print("Libraries loaded successfully.")

## The Three Regularization Philosophies

All three methods add a penalty to the ordinary least squares cost function:

**Ordinary Least Squares (no regularization):**
$$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2$$

**Ridge Regression (L2 penalty):**
$$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 + \lambda \sum_{j=1}^{n} \theta_j^2$$

**Lasso Regression (L1 penalty):**
$$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 + \lambda \sum_{j=1}^{n} |\theta_j|$$

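**Elastic Net (L1 + L2):**
$$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 + \lambda_1 \sum_{j=1}^{n} |\theta_j| + \lambda_2 \sum_{j=1}^{n} \theta_j^2$$

**Note on scikit-learn's parameterization:** the `alpha` argument plays the role of $\lambda$, but its scaling differs between estimators. `Lasso` and `ElasticNet` divide the squared-error term by $2m$ as written above, whereas `Ridge` minimizes the *unscaled* sum of squared errors plus $\alpha \sum_j \theta_j^2$. `ElasticNet` takes a single `alpha` and splits it with `l1_ratio`: $\lambda_1$ = `alpha * l1_ratio` and $\lambda_2$ = `0.5 * alpha * (1 - l1_ratio)`. The same numeric `alpha` therefore does not represent the same penalty strength across the three models.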

---

## Experiment 1: Coefficient Shrinkage Visualization

Let's create a dataset with 10 features (only 5 of them informative) and see how each regularization method shrinks the coefficients as we increase the penalty strength.

In [None]:
# Create dataset with 10 features, only 5 are truly informative.
# The sizes are named once so the summary printed below cannot drift away
# from the arguments actually passed to make_regression.
N_SAMPLES, N_FEATURES, N_INFORMATIVE = 100, 10, 5

X, y, true_coef = make_regression(n_samples=N_SAMPLES, n_features=N_FEATURES,
                                   n_informative=N_INFORMATIVE, noise=10,
                                   coef=True, random_state=42)

# Standardize features (important for regularization): the L1/L2 penalties act
# on coefficient magnitudes, so features must share a scale to be penalized evenly.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sanity check: the generating model should have exactly N_INFORMATIVE active features.
assert np.count_nonzero(true_coef) == N_INFORMATIVE, "unexpected number of informative features"

print("Dataset created:")
print(f"  Samples: {X.shape[0]}, Features: {X.shape[1]}")
print(f"  True informative features: {N_INFORMATIVE} out of {N_FEATURES}")
print(f"\nTrue coefficients: {np.round(true_coef, 2)}")
print(f"  Non-zero coefficients: {np.sum(true_coef != 0)}")

In [None]:
# Sweep the penalty strength over six orders of magnitude and record, for each
# model family, the fitted coefficient vector at every alpha.
alphas = np.logspace(-3, 3, 100)

# Each array ends up with one row per alpha and one column per feature.
ridge_coefs = np.array([
    Ridge(alpha=strength).fit(X_scaled, y).coef_
    for strength in alphas
])
lasso_coefs = np.array([
    Lasso(alpha=strength, max_iter=10000).fit(X_scaled, y).coef_
    for strength in alphas
])
elastic_coefs = np.array([
    ElasticNet(alpha=strength, l1_ratio=0.5, max_iter=10000).fit(X_scaled, y).coef_
    for strength in alphas
])

print("Coefficient paths computed for 100 alpha values.")

In [None]:
# Visualize coefficient paths: one panel per model, one line per feature.
log_alphas = np.log10(alphas)
panels = [
    ('Ridge (L2): Coefficients Shrink Smoothly', ridge_coefs),
    ('Lasso (L1): Coefficients Hit Zero', lasso_coefs),
    ('Elastic Net: Hybrid Behavior', elastic_coefs),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

for ax, (title, coef_path) in zip(axes, panels):
    # Take the feature count from the data instead of hard-coding it.
    for j in range(coef_path.shape[1]):
        ax.plot(log_alphas, coef_path[:, j], label=f'Feature {j+1}')

    ax.set_title(title, fontsize=12, fontweight='bold')
    # The x values are base-10 logs, so say so explicitly on the axis.
    ax.set_xlabel('log10(alpha)')
    ax.set_ylabel('Coefficient Value')
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)
    ax.grid(True, alpha=0.3)

# The per-feature labels are only useful if a legend is drawn. Colours are
# assigned in the same order in every panel, so a single legend is enough.
axes[-1].legend(loc='upper right', fontsize=8, ncol=2)

plt.tight_layout()
plt.show()

print("\nKEY OBSERVATIONS:")
print("  Ridge: Coefficients shrink gradually toward zero but NEVER reach exactly zero")
print("  Lasso: Coefficients are FORCED to zero - automatic feature selection!")
print("  Elastic Net: Some coefficients hit zero, but shrinkage is smoother than Lasso")

## Experiment 2: Feature Selection Comparison

The key difference between Ridge and Lasso is **sparsity**. Lasso can set coefficients exactly to zero, effectively selecting a subset of features. Let's quantify this.

In [None]:
# Train models with moderate regularization
alpha = 1.0
nonzero_tol = 0.01  # |coef| at or below this counts as "dropped" when tallying features

ols = LinearRegression().fit(X_scaled, y)
ridge = Ridge(alpha=alpha).fit(X_scaled, y)
lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_scaled, y)
elastic = ElasticNet(alpha=alpha, l1_ratio=0.5, max_iter=10000).fit(X_scaled, y)

# The models are fitted on standardized features, so their coefficients are
# per-standard-deviation effects. The generating coefficients refer to the raw
# features; multiplying by each feature's std puts the "True" column in the
# same units as the estimates it is compared against.
true_coef_scaled = true_coef * scaler.scale_

fitted = {
    'True': true_coef_scaled,
    'OLS': ols.coef_,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_,
    'Elastic': elastic.coef_,
}

# Build every row from the same column width so header and rows stay aligned.
col_w = 10
header = " ".join(f"{name:<{col_w}}" for name in ['Feature', *fitted])
table_w = len(header)

print("=" * table_w)
print(f"COEFFICIENT COMPARISON (alpha = {alpha})")
print("=" * table_w)
print(f"\n{header}")
print("-" * table_w)

for i in range(X_scaled.shape[1]):
    row_label = f"Feature {i+1}"
    values = " ".join(f"{coefs[i]:<{col_w}.2f}" for coefs in fitted.values())
    print(f"{row_label:<{col_w}} {values}")

print("-" * table_w)

# The true model is counted by exact non-zeros; the estimates use the tolerance.
n_true = np.sum(true_coef != 0)
n_selected = {
    name: np.sum(np.abs(coefs) > nonzero_tol)
    for name, coefs in fitted.items()
    if name != 'True'
}
counts = " ".join(f"{c:<{col_w}}" for c in [n_true, *n_selected.values()])
print(f"{'Non-zero':<{col_w}} {counts}")

print("\nINTERPRETATION:")
print(f"  - True model has {n_true} non-zero features")
print(f"  - OLS and Ridge keep all {X.shape[1]} features (no selection)")
print(f"  - Lasso selected {n_selected['Lasso']} features (sparse!)")
print(f"  - Elastic Net selected {n_selected['Elastic']} features")

## Experiment 3: Cross-Validation for Hyperparameter Selection

How do we choose the right alpha? With cross-validation: for each candidate alpha we estimate the out-of-sample mean squared error using 5-fold CV, then pick the alpha that minimizes it for each method.

In [None]:
# Find optimal alpha via cross-validation
alphas_cv = np.logspace(-4, 2, 50)


def cv_mse_curve(make_estimator, candidate_alphas, features, target, cv=5):
    """Return the mean cross-validated MSE for each candidate alpha.

    Parameters
    ----------
    make_estimator : callable
        Maps an alpha value to an unfitted regressor.
    candidate_alphas : iterable of float
        Regularization strengths to evaluate.
    features, target : array-like
        Raw (unscaled) design matrix and response.
    cv : int
        Number of folds passed to ``cross_val_score``.

    Returns
    -------
    list of float
        One mean MSE per alpha, in the order given.
    """
    curve = []
    for strength in candidate_alphas:
        # Scaling lives inside the pipeline so the scaler is re-fitted on each
        # training fold only; scaling the full dataset up front would leak
        # held-out statistics into training.
        model = make_pipeline(StandardScaler(), make_estimator(strength))
        fold_scores = cross_val_score(model, features, target, cv=cv,
                                      scoring='neg_mean_squared_error')
        # The scorer returns negated MSE; flip the sign back.
        curve.append(-fold_scores.mean())
    return curve


ridge_scores = cv_mse_curve(lambda a: Ridge(alpha=a), alphas_cv, X, y)
lasso_scores = cv_mse_curve(lambda a: Lasso(alpha=a, max_iter=10000), alphas_cv, X, y)
elastic_scores = cv_mse_curve(
    lambda a: ElasticNet(alpha=a, l1_ratio=0.5, max_iter=10000), alphas_cv, X, y
)

# Find best alphas
best_ridge_alpha = alphas_cv[np.argmin(ridge_scores)]
best_lasso_alpha = alphas_cv[np.argmin(lasso_scores)]
best_elastic_alpha = alphas_cv[np.argmin(elastic_scores)]

print("=" * 60)
print("OPTIMAL REGULARIZATION STRENGTH (5-Fold CV)")
print("=" * 60)
print(f"\nRidge:      alpha = {best_ridge_alpha:.4f}, MSE = {min(ridge_scores):.2f}")
print(f"Lasso:      alpha = {best_lasso_alpha:.4f}, MSE = {min(lasso_scores):.2f}")
print(f"Elastic Net: alpha = {best_elastic_alpha:.4f}, MSE = {min(elastic_scores):.2f}")

In [None]:
# Visualize CV results: one MSE curve per model, with a dashed vertical line
# at the alpha that minimizes each curve.
fig, ax = plt.subplots(figsize=(10, 6))
log_alphas_cv = np.log10(alphas_cv)

curves = [
    ('Ridge', 'b', ridge_scores, best_ridge_alpha),
    ('Lasso', 'r', lasso_scores, best_lasso_alpha),
    ('Elastic Net', 'g', elastic_scores, best_elastic_alpha),
]

for name, color, scores, _ in curves:
    ax.plot(log_alphas_cv, scores, color=color, linestyle='-', label=name, linewidth=2)

# Markers are drawn after all curves so they sit on top, matching the curve colours.
for _, color, _, best_alpha in curves:
    ax.axvline(x=np.log10(best_alpha), color=color, linestyle='--', alpha=0.5)

# The x values are base-10 logs, so say so explicitly on the axis.
ax.set_xlabel('log10(alpha)', fontsize=12)
ax.set_ylabel('Mean Squared Error (CV)', fontsize=12)
ax.set_title('Cross-Validation: Finding Optimal Regularization Strength', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.show()

print("INTERPRETATION:")
print("  - Too little regularization (left): Overfitting, high variance")
print("  - Too much regularization (right): Underfitting, high bias")
print("  - Sweet spot: Minimum of the CV curve (marked with dashed lines)")

## When to Use Which Regularization?

| Scenario | Recommended | Why |
|----------|-------------|-----|
| Many features, all potentially relevant | **Ridge** | Keeps all features, just shrinks them |
| Many features, only some are relevant | **Lasso** | Automatic feature selection (sparsity) |
| Correlated features + feature selection | **Elastic Net** | Tends to keep or drop correlated features as a group, where Lasso would pick one of them arbitrarily |
| Interpretability is critical | **Lasso** | Sparse models are easier to explain |
| Prediction accuracy is the only goal | **Ridge** or **Elastic Net** | More stable, less variance |

---

## Summary

**Ridge (L2):**
- Shrinks coefficients smoothly toward zero
- Never sets coefficients exactly to zero
- Best when all features contribute something

**Lasso (L1):**
- Forces some coefficients to exactly zero
- Automatic feature selection
- Best when you suspect many features are irrelevant

**Elastic Net:**
- Combines L1 and L2 penalties
- Handles correlated features better than pure Lasso
- Best of both worlds, but has two hyperparameters to tune (`alpha` and `l1_ratio`)