## 1. Import Necessary Libraries

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

## 2. Generate Synthetic Data (or Load Your Dataset)

In [5]:
# Build a reproducible toy regression problem to fit LASSO against.
# n_informative is left at make_regression's default (10 in scikit-learn),
# so only a subset of the 20 columns actually contributes to y.
X, y = make_regression(
    n_features=20,    # columns in the feature matrix
    n_samples=1000,   # rows (observations)
    random_state=42,  # fixed seed so reruns generate the same data
    noise=0.5,        # standard deviation of the Gaussian noise added to y
)

In [7]:
# Peek at the feature matrix; NumPy elides the middle of large arrays in its repr.
X

array([[ 0.22584183,  1.55137772, -0.10734682, ..., -1.35154745,
         0.3646018 ,  0.13316649],
       [ 0.11083638, -1.45461475,  0.26388849, ..., -1.42646391,
        -0.60451386, -0.16624207],
       [ 0.45860045, -0.08127955, -0.69847376, ...,  0.90822252,
         0.51141526, -2.09317802],
       ...,
       [-0.5125888 ,  1.12477672,  0.89835957, ...,  0.6832124 ,
         1.09848488,  1.27442843],
       [-2.96836843, -0.92984795,  0.05520801, ...,  1.34954631,
        -0.489467  ,  1.23228383],
       [-0.48716718,  2.80137299, -1.08863484, ..., -2.21760908,
         0.50011283,  0.83533344]])

In [9]:
# Peek at the target vector that make_regression returned alongside X.
y

array([ 3.41832812e+00, -2.54219266e+02,  1.88730797e+02, -1.79443856e+02,
       -4.77066691e+01,  2.39697529e+02, -1.51226845e+02,  5.00248849e+01,
       -4.38189513e+00, -1.54961312e+02,  5.33626104e+00,  1.59243519e+01,
       -3.11408338e+02, -2.15760915e+02,  1.89268259e+02,  1.34129242e+02,
        6.33870771e+01,  9.72652597e+01,  3.65517684e+01, -1.13996223e+01,
        3.44240836e+01,  5.63106694e-01, -1.99778815e+02,  1.56036100e+02,
       -1.92038946e+02,  5.08630037e+01,  1.97251654e+02,  1.02204962e+02,
        3.49296869e+02, -1.87741973e+02,  3.47048160e+02, -1.75313927e+02,
        7.37983742e+01, -2.10100574e+02,  1.69389369e+02,  9.68931631e+01,
       -1.88360613e+02, -1.89265371e+02,  2.33853661e+02,  1.74439396e+02,
        3.14249275e+02, -4.86212961e+02,  6.28214854e+01, -2.55028044e+02,
        6.76386718e+01,  1.91943459e+02,  2.08088386e+02, -2.38266176e+02,
       -1.69678921e+02, -1.75725266e+02,  1.93421213e+02,  3.78875352e+02,
        1.25659362e+02,  

## 3. Preprocess Data

In [12]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardize the features (the L1 penalty is sensitive to feature scale).
# The scaler learns its statistics from the training rows only and then
# reuses them on the test rows, so the test set does not influence
# preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Train LASSO Model

In [15]:
# Build an L1-penalized linear regressor; alpha sets the regularization strength.
lasso = Lasso(random_state=42, alpha=0.1)

# Train on the standardized training split.
lasso.fit(X=X_train_scaled, y=y_train)

## 5. Evaluate the Model

In [18]:
# Predict on the held-out, standardized test features.
y_pred = lasso.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimals.
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error: 0.38
R² Score: 1.00


## 6. Analyze Coefficients

In [21]:
# Pull the fitted weight vector and tally how many entries are not exactly zero.
coefficients = lasso.coef_
non_zero = (coefficients != 0).sum()

print(f"Number of non-zero coefficients: {non_zero}")
print("Coefficients:", coefficients)

Number of non-zero coefficients: 10
Coefficients: [79.61103883 93.06986053  5.56915636  0.         85.77318034 -0.
 70.38887943  0.         -0.         -0.         18.58444918 39.88497844
 -0.          2.99006626 -0.         25.67382061 -0.         86.51511
  0.          0.        ]


## 7. Hyperparameter Tuning with Cross-Validation

In [24]:
# Search a log-spaced grid of 50 candidate alphas between 1e-4 and 1e1,
# scoring each with 5-fold cross-validation on the training split only.
alphas = np.logspace(-4, 1, 50)  # Alpha values to test
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso_cv.fit(X_train_scaled, y_train)

print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")

# Refit a plain Lasso at the selected alpha on the full training split.
best_lasso = Lasso(alpha=lasso_cv.alpha_)
best_lasso.fit(X_train_scaled, y_train)

# Evaluate the tuned model on the held-out test set. Distinct variable names
# are used so the predictions and metrics of the alpha=0.1 model computed
# earlier (y_pred, mse, r2) are not overwritten.
y_pred_best = best_lasso.predict(X_test_scaled)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Tuned model Mean Squared Error: {mse_best:.2f}")
print(f"Tuned model R² Score: {r2_best:.2f}")

Optimal alpha: 0.0069


Key Notes:

    Regularization Strength (α):

        Higher α → More regularization → Sparse model (more zero coefficients)

        Find optimal α using cross-validation (LassoCV)

    Feature Scaling:

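        Always scale features before regularization (e.g., StandardScaler): the L1 penalty acts on coefficient magnitudes, which depend on each feature's scale. Fit the scaler on the training set only and reuse it to transform the test set.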

    Feature Selection:

        LASSO performs feature selection automatically: the L1 penalty drives the coefficients of uninformative features to exactly zero (here, 10 of the 20 coefficients)

When to Use LASSO:

    When you have high-dimensional data

    To prevent overfitting

    When feature selection is needed