In [4]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
from sklearn.datasets import make_regression
X_cal, y_cal = make_regression(n_samples=20640, n_features=8, noise=0.5, random_state=42)
print(f"Created synthetic data: X shape {X_cal.shape}, y shape {y_cal.shape}")

Created synthetic data: X shape (20640, 8), y shape (20640,)


##PART 1: REGRESSION TASK

#Step 1: Baseline Model (No Regularization)

In [6]:
# Step 1: Baseline model - Linear Regression without regularization

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_cal, y_cal, test_size=0.2, random_state=42)

print("Data split completed:")
print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])
print()

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Check coefficients
print("Model coefficients:")
print("Number of coefficients:", len(model.coef_))
print("First 5 coefficients:", model.coef_[:5])
print("Intercept:", model.intercept_)
print()

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate MSE
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print("Mean Squared Error (MSE):")
print("Training MSE:", train_mse)
print("Test MSE:", test_mse)
print()

# Check if test error is higher (possible overfitting)
if test_mse > train_mse:
    print("Note: Test error is higher than training error.")
    print("This might indicate overfitting.")
else:
    print("Test error is similar to training error.")

Data split completed:
Training samples: 16512
Test samples: 4128

Model coefficients:
Number of coefficients: 8
First 5 coefficients: [16.85632917 30.32607011 60.5771295  29.44892142 98.30087311]
Intercept: -0.0016495937674477767

Mean Squared Error (MSE):
Training MSE: 0.2494619198239613
Test MSE: 0.25222024422494127

Note: Test error is higher than training error.
This might indicate overfitting.


#Step 2: Hyperparameter Tuning.

In [7]:
# Define parameter grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# For Ridge
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_grid.best_params_['alpha'])
print("Best Ridge score:", -ridge_grid.best_score_)

# For Lasso
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)

print("\nBest Lasso alpha:", lasso_grid.best_params_['alpha'])
print("Best Lasso score:", -lasso_grid.best_score_)

Best Ridge alpha: 0.01
Best Ridge score: 0.2497331240841608

Best Lasso alpha: 0.001
Best Lasso score: 0.24974113749573523


#Step 3: Regularization Experiments (L1 vs L2):

In [None]:
# Step 3: Compare L1 and L2 regularization

# Train with best parameters
best_ridge = Ridge(alpha=ridge_grid.best_params_['alpha'])
best_ridge.fit(X_train, y_train)

best_lasso = Lasso(alpha=lasso_grid.best_params_['alpha'], max_iter=10000)
best_lasso.fit(X_train, y_train)

# Get coefficients
ridge_coef = best_ridge.coef_
lasso_coef = best_lasso.coef_

print("Ridge coefficients (first 5):", ridge_coef[:5])
print("Lasso coefficients (first 5):", lasso_coef[:5])

# Count zero coefficients
print("\nLasso zero coefficients:", np.sum(lasso_coef == 0))
print("Ridge zero coefficients:", np.sum(ridge_coef == 0))

# Calculate MSE
y_pred_ridge = best_ridge.predict(X_test)
y_pred_lasso = best_lasso.predict(X_test)

ridge_mse = mean_squared_error(y_test, y_pred_ridge)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)

print("\nTest MSE:")
print("Ridge:", ridge_mse)
print("Lasso:", lasso_mse)

Ridge coefficients (first 5): [16.8563192  30.32605215 60.57709337 29.4489041  98.3008124 ]
Lasso coefficients (first 5): [16.85538738 30.32511163 60.5761137  29.44787143 98.29990753]

Lasso zero coefficients: 0
Ridge zero coefficients: 0

Test MSE:
Ridge: 0.2522223133087173
Lasso: 0.2523074782081457


##Part 2: Classification Task (Breast Cancer)

In [None]:
# Load Breast Cancer dataset
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# Split data (80% train, 20% test)
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer, y_cancer, test_size=0.2, random_state=42
)

print("Dataset loaded successfully!")
print(f"Training samples: {X_train_cancer.shape[0]}")
print(f"Test samples: {X_test_cancer.shape[0]}")
print(f"Number of features: {X_train_cancer.shape[1]}")
print(f"Classes: {np.unique(y_cancer)} (0 = malignant, 1 = benign)")

Dataset loaded successfully!
Training samples: 455
Test samples: 114
Number of features: 30
Classes: [0 1] (0 = malignant, 1 = benign)


#Step 1: Baseline Logistic Regression

In [None]:
# Create and train baseline model
baseline_log = LogisticRegression(penalty=None, max_iter=1000)
baseline_log.fit(X_train_cancer, y_train_cancer)

# Make predictions and calculate accuracy
y_train_pred = baseline_log.predict(X_train_cancer)
y_test_pred = baseline_log.predict(X_test_cancer)

train_acc = accuracy_score(y_train_cancer, y_train_pred)
test_acc = accuracy_score(y_test_cancer, y_test_pred)

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("Number of coefficients:", len(baseline_log.coef_[0]))

Training Accuracy: 0.9736263736263736
Test Accuracy: 0.956140350877193
Number of coefficients: 30


#Step 2: Hyperparameter Tuning

In [None]:
# Define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Create and run GridSearchCV
log_model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_cancer, y_train_cancer)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation score: 0.9670329670329672


#Step 3: Regularization Experiments

In [None]:
# Train L1 model with best C
l1_model = LogisticRegression(penalty='l1', C=100, solver='liblinear', max_iter=1000)
l1_model.fit(X_train_cancer, y_train_cancer)

# Train L2 model with best C (use same C for fair comparison)
l2_model = LogisticRegression(penalty='l2', C=100, max_iter=1000)
l2_model.fit(X_train_cancer, y_train_cancer)

# Get coefficients
l1_coef = l1_model.coef_[0]
l2_coef = l2_model.coef_[0]

print("L1 (Lasso-like) coefficients analysis:")
print("Non-zero coefficients:", np.sum(l1_coef != 0))
print("Zero coefficients:", np.sum(l1_coef == 0))

print("\nL2 (Ridge-like) coefficients analysis:")
print("Non-zero coefficients:", np.sum(l2_coef != 0))
print("Zero coefficients:", np.sum(l2_coef == 0))

# Calculate test accuracy
y_pred_l1 = l1_model.predict(X_test_cancer)
y_pred_l2 = l2_model.predict(X_test_cancer)

acc_l1 = accuracy_score(y_test_cancer, y_pred_l1)
acc_l2 = accuracy_score(y_test_cancer, y_pred_l2)

print("\nTest Accuracy:")
print("L1 model:", acc_l1)
print("L2 model:", acc_l2)
print("Baseline:", test_acc)

L1 (Lasso-like) coefficients analysis:
Non-zero coefficients: 21
Zero coefficients: 9

L2 (Ridge-like) coefficients analysis:
Non-zero coefficients: 30
Zero coefficients: 0

Test Accuracy:
L1 model: 0.9824561403508771
L2 model: 0.956140350877193
Baseline: 0.956140350877193
