In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Read the semicolon-delimited CSV, then separate the target column from the
# predictors ('Class' and 'Output' are both excluded from the predictors).
df = pd.read_csv('data.csv', sep=';')
y = df['Output']
X = df.drop(columns=['Class', 'Output'])

# Function to evaluate models using cross-validation (copied from original notebook)
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` and report its mean R² and mean RMSE.

    The data is split with a shuffled ``KFold`` (fixed ``random_state=42``),
    so repeated calls with the same inputs use the same folds.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor. If it exposes an ``alpha``
        attribute, that value is printed alongside the scores.
    X : array-like
        Predictor matrix passed through to the cross-validation routine.
    y : array-like
        Target values.
    cv : int, default 5
        Number of folds.

    Returns
    -------
    tuple of (float, float)
        Mean R² and mean RMSE across the folds. RMSE is computed per fold as
        the square root of that fold's MSE, then averaged.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Score both metrics in one pass so each fold is fitted once instead of
    # once per metric; the folds (and therefore the scores) are unchanged.
    cv_results = cross_validate(
        model, X, y, cv=kf, scoring=['r2', 'neg_mean_squared_error']
    )
    cv_r2_scores = cv_results['test_r2']

    # scikit-learn reports MSE negated (higher is better); flip the sign
    # before taking the root.
    cv_rmse_scores = np.sqrt(-cv_results['test_neg_mean_squared_error'])

    # Print results. Not every estimator has a regularization strength, so
    # only report alpha when the model actually defines one.
    alpha = getattr(model, 'alpha', None)
    if alpha is not None:
        print(f"Alpha: {alpha:.8f}")
    print(f"Mean CV R²: {cv_r2_scores.mean():.4f} ± {cv_r2_scores.std():.4f}")
    print(f"Mean CV RMSE: {cv_rmse_scores.mean():.4f} ± {cv_rmse_scores.std():.4f}")

    return cv_r2_scores.mean(), cv_rmse_scores.mean()

In [None]:
# Standardize the predictors with a default StandardScaler.
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# validation folds contribute to the scaling statistics. Consider scaling
# inside the CV loop (e.g. via a Pipeline) — confirm whether it matters here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Regularization strengths to sweep, from very weak (1e-4) to strong (10)
alpha_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
results = []

for alpha in alpha_values:
    print(f"\nTesting Lasso with alpha = {alpha}")
    lasso_model = Lasso(alpha=alpha, max_iter=10000)
    r2, rmse = evaluate_model(lasso_model, X_scaled, y)

    # Fit on the full scaled data to count how many coefficients the L1
    # penalty leaves non-zero versus exactly zero.
    lasso_model.fit(X_scaled, y)
    n_coefs = lasso_model.coef_.shape[0]
    n_nonzero = (lasso_model.coef_ != 0).sum()
    n_zero = n_coefs - n_nonzero
    print(f"Number of non-zero coefficients: {n_nonzero} out of {n_coefs}")
    print(f"Number of coefficients set to zero: {n_zero}")

    # One record per alpha; turned into a DataFrame later
    results.append(dict(
        alpha=alpha,
        r2=r2,
        rmse=rmse,
        nonzero_coefs=n_nonzero,
        zero_coefs=n_zero,
    ))

In [None]:
# Gather the per-alpha records into a table for plotting and lookup
results_df = pd.DataFrame(results)

# Side-by-side panels: R² on the left, RMSE on the right, both against alpha
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# (axes, results column, panel title, y-axis label) for each metric
panels = (
    (ax1, 'r2', 'Effect of Alpha on R² Score', 'R² Score'),
    (ax2, 'rmse', 'Effect of Alpha on RMSE', 'RMSE'),
)
for axis, metric, title, y_label in panels:
    axis.plot(results_df['alpha'], results_df[metric], 'o-')
    # The alphas span several orders of magnitude, hence the log x-axis
    axis.set_xscale('log')
    axis.set_title(title)
    axis.set_xlabel('Alpha (log scale)')
    axis.set_ylabel(y_label)
    axis.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Show how many coefficients stay non-zero at each alpha (log-scaled x-axis)
coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
coef_ax.plot(results_df['alpha'], results_df['nonzero_coefs'], 'o-')
coef_ax.set_xscale('log')
coef_ax.set_title('Effect of Alpha on Number of Non-Zero Coefficients')
coef_ax.set_xlabel('Alpha (log scale)')
coef_ax.set_ylabel('Number of Non-Zero Coefficients')
coef_ax.grid(True)
plt.show()

In [None]:
# Pick the alpha whose mean cross-validated R² is highest
best_alpha_idx = results_df['r2'].idxmax()

# Scalar lookups per column keep each value's own dtype (the coefficient
# count stays an integer instead of being upcast with the float metrics).
best_alpha = results_df.at[best_alpha_idx, 'alpha']
best_r2 = results_df.at[best_alpha_idx, 'r2']
best_rmse = results_df.at[best_alpha_idx, 'rmse']
best_nonzero = results_df.at[best_alpha_idx, 'nonzero_coefs']

print(f"Best alpha value: {best_alpha}")
print(f"Best R²: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")
print(f"Number of non-zero coefficients with best alpha: {best_nonzero}")

# For reference, show the alpha a Lasso gets when none is specified
default_lasso = Lasso()
print(f"\nDefault Lasso alpha: {default_lasso.alpha}")

# Refit on the full scaled data using the selected alpha
best_lasso = Lasso(alpha=best_alpha, max_iter=10000).fit(X_scaled, y)

# Rank features by the absolute size of their fitted coefficient
coef_df = (
    pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': best_lasso.coef_,
    })
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

print("\nTop 20 features by coefficient magnitude:")
print(coef_df.head(20))

## Why Lasso Performs Poorly in the Original Notebook

Based on the experiments above, we can identify several reasons why Lasso performs poorly in the original regression notebook:

1. **Default Alpha Value**: The default alpha value in scikit-learn's Lasso implementation is 1.0, which appears to be too high for this dataset. This causes excessive penalization, leading to many coefficients being reduced to exactly zero.

2. **Feature Sparsity**: With the default alpha, Lasso has eliminated most of the features (set their coefficients to zero), retaining only a small subset of features that may not capture the complexity of the relationship between predictors and the target variable.

3. **Regularization Strength**: While regularization is useful for preventing overfitting, the penalty applied by the default Lasso model is too aggressive for this dataset, resulting in an underfitted model with poor predictive performance.

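4. **Scaling Issues**: Even though the data was scaled before applying Lasso in the original notebook, the appropriate alpha still depends on the scale of the target variable. scikit-learn's Lasso minimizes $\frac{1}{2n}\lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$, so the same alpha of 1.0 is a relatively stronger penalty when the target's values (and hence the squared-error term) are small than when they are large.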

5. **Nature of the Relationship**: The target variable might depend on many features with small contributions rather than a few with large contributions, which is contrary to Lasso's assumption of sparsity.

To improve Lasso's performance, we need to tune the alpha parameter to find the optimal regularization strength. A smaller alpha value allows more features to contribute to the prediction, potentially leading to better performance for this specific dataset.