
Part 1: Regression Task (California Housing)

In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices


In [8]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error

In [9]:
import pandas as pd
import os

#Load the datasets from the path download by kagglehub
#The 'path' variable is set in the first code cell.
housing_file_path = os.path.join(path, "housing.csv")
df = pd.read_csv(housing_file_path)

#Handle missing values in 'total_bedrooms' by imputing with the median
df['total_bedrooms'] =df['total_bedrooms'].fillna(df['total_bedrooms'].median())
print(df.head())

#Perform one-hot encoding on the 'ocean_proximity' column
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=False) #Drop_first=False to keep all categories


#Seperate features (X) and target (y)
#The correct target column name in this dataset is 'median_house_value'.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.2, random_state=42
)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [10]:
lin_reg=LinearRegression()
lin_reg.fit(X_train,y_train)

In [11]:
print("===Baseline Linear Regresssion===")
print("Coefficients:",lin_reg.coef_)
print("Intercept:",lin_reg.intercept_)

===Baseline Linear Regresssion===
Coefficients: [-2.68382734e+04 -2.54683520e+04  1.10218508e+03 -6.02150567e+00
  1.02789395e+02 -3.81729064e+01  4.82527528e+01  3.94739752e+04
 -1.89265829e+04 -5.87132390e+04  1.17198490e+05 -2.40632251e+04
 -1.54954428e+04]
Intercept: -2256620.7988545513


In [12]:
y_train_pred=lin_reg.predict(X_train)
y_test_pred=lin_reg.predict(X_test)

In [13]:
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))

Train MSE: 4683203783.504252
Test MSE: 4908476721.156623


In [14]:
alpha_grid={"alpha": np.logspace(-3,0,13)} #0.001 ...1

In [20]:
ridge=Ridge(random_state=42)
lasso=Lasso(random_state=42,max_iter=10000)

In [22]:
ridge_cv=GridSearchCV(
    ridge,alpha_grid,cv=5, scoring="neg_mean_squared_error",n_jobs=-1
)
lasso_cv=GridSearchCV(
    lasso,alpha_grid,cv=5,scoring="neg_mean_squared_error",n_jobs=-1
)

In [23]:
ridge_cv.fit(X_train,y_train)
lasso_cv.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [24]:
print("\n===Hyperparameter Tuning results===")
print("Best Ridge alpha:", ridge_cv.best_params_["alpha"])
print("Best Ridge CV MSE:", -ridge_cv.best_score_)
print("Best Lasso alpha:", lasso_cv.best_params_["alpha"])
print("Best Lasso CV MSE:", -lasso_cv.best_score_)


===Hyperparameter Tuning results===
Best Ridge alpha: 0.5623413251903491
Best Ridge CV MSE: 4711027514.741423
Best Lasso alpha: 0.001
Best Lasso CV MSE: 4711122401.897191


In [25]:
best_ridge =ridge_cv.best_estimator_
best_lasso=lasso_cv.best_estimator_

In [26]:
ridge_train_pred=best_ridge.predict(X_train)
ridge_test_pred=best_ridge.predict(X_test)
lasso_train_pred=best_lasso.predict(X_train)
lasso_test_pred=best_lasso.predict(X_test)


In [27]:
print("\n===Hyperparameter Tuning results===")
print("Best Ridge alpha:", ridge_cv.best_params_["alpha"])
print("Best Ridge CV MSE:", -ridge_cv.best_score_)
print("Best Lasso alpha:", lasso_cv.best_params_["alpha"])
print("Best Lasso CV MSE:", -lasso_cv.best_score_)


===Hyperparameter Tuning results===
Best Ridge alpha: 0.5623413251903491
Best Ridge CV MSE: 4711027514.741423
Best Lasso alpha: 0.001
Best Lasso CV MSE: 4711122401.897191


In [28]:
print("\n===Lasso(L1) with best alpha===")
print("Coefficient:",best_lasso.coef_)
print("Train MSE:", mean_squared_error(y_train, lasso_train_pred))
print("Test MSE:",mean_squared_error(y_test,lasso_test_pred))


===Lasso(L1) with best alpha===
Coefficient: [-2.68382793e+04 -2.54683579e+04  1.10218525e+03 -6.02150576e+00
  1.02789417e+02 -3.81729083e+01  4.82527346e+01  3.94739750e+04
  1.66420878e+04 -2.31445583e+04  1.52763022e+05  1.15054322e+04
  2.00732150e+04]
Train MSE: 4683203783.508417
Test MSE: 4908476947.002458


Part2:Classification Task (Breast Cancer)

In [29]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/breast-cancer-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'breast-cancer-dataset' dataset.
Path to dataset files: /kaggle/input/breast-cancer-dataset


In [30]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Load dataset from the path downloaded by kagglehub:
df = pd.read_csv(path + "/breast-cancer.csv")
df.head()

# Separate features (X) and target (Y)
# the correct target column name in this dataset is 'median_house_value':
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Feature scaling
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )

In [31]:
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

In [32]:
log_reg.coef_

array([[ 4.41470952e-10, -9.90739776e-03, -1.80569429e-02,
        -5.93877477e-02, -1.71133973e-02, -1.06687155e-04,
        -1.66538334e-05,  9.53940445e-05,  4.46453855e-05,
        -2.02172454e-04, -8.17804040e-05, -6.17338761e-05,
        -1.31350048e-03, -9.79085421e-05,  2.66714451e-02,
        -7.79204511e-06, -7.75379435e-06, -5.03687770e-06,
        -3.54003090e-06, -2.17295912e-05, -3.48653062e-06,
        -1.05694403e-02, -2.39209316e-02, -6.26932225e-02,
         2.84854511e-02, -1.43990853e-04, -1.25080205e-05,
         1.28309254e-04,  2.32276941e-05, -2.99598218e-04,
        -9.20759207e-05]])

In [33]:
train_acc = accuracy_score(y_train, log_reg.predict(X_train))
test_acc = accuracy_score(y_test, log_reg.predict(X_test))

print("Baseline Training Accuracy:", train_acc)
print("Baseline Test Accuracy:", test_acc)

Baseline Training Accuracy: 0.9120879120879121
Baseline Test Accuracy: 0.956140350877193


In [34]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}





In [35]:
grid = GridSearchCV(
    LogisticRegression(max_iter=5000),
    param_grid,
    cv=5,
    scoring="accuracy"
)

grid.fit(X_train, y_train)


In [36]:
print("Best Parameters:", grid.best_params_)

Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [37]:
best_model = grid.best_estimator_

train_acc = accuracy_score(y_train, best_model.predict(X_train))
test_acc = accuracy_score(y_test, best_model.predict(X_test))

print("Tuned Training Accuracy:", train_acc)
print("Tuned Test Accuracy:", test_acc)

Tuned Training Accuracy: 0.9868131868131869
Tuned Test Accuracy: 0.9824561403508771


Step 3: Regularization Experiments

For L1 Regularization:

In [38]:
log_l1 = LogisticRegression(
    penalty="l1",
    C=grid.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l1.fit(X_train, y_train)

For L2 Regularization:

In [39]:
log_l2 = LogisticRegression(
    penalty="l2",
    C=grid.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l2.fit(X_train, y_train)



In [40]:

print("L1 Coefficients:\n", log_l1.coef_)
print("\nL2 Coefficients:\n", log_l2.coef_)

L1 Coefficients:
 [[ 4.83410985e-09 -6.28004944e-01  1.13109414e-01 -3.64168848e-02
  -2.67609223e-03  0.00000000e+00 -5.40042919e+01  6.59520958e+00
   1.54994669e+02 -2.14794209e+01  0.00000000e+00  9.06940892e+00
  -1.58517365e+00 -6.72172630e-02  1.09644749e-01  0.00000000e+00
   0.00000000e+00 -7.43062897e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -3.44743798e-01  4.38996013e-01 -1.00663845e-01
   2.49847156e-02  8.90558278e+00 -9.80570015e+00  2.16777385e+01
   2.68629444e+01  2.73152885e+01  0.00000000e+00]]

L2 Coefficients:
 [[-2.12715847e-10 -4.73698845e-03 -8.57667361e-03 -2.84437840e-02
  -1.52333525e-02 -5.04849437e-05 -7.50239523e-06  4.45227214e-05
   2.14728348e-05 -9.57791799e-05 -3.91612506e-05 -3.09244197e-05
  -6.78907941e-04 -8.78248720e-05  1.11646787e-02 -4.10123930e-06
  -4.91532105e-06 -4.10095607e-06 -2.16000883e-06 -1.11155543e-05
  -1.87345006e-06 -4.85404239e-03 -1.09821084e-02 -2.87784619e-02
   1.85671621e-02 -6.64236049e-05  6.50727516e-06  7.

In [41]:
print("L1 Train Accuracy:", accuracy_score(y_train, log_l1.predict(X_train)))
print("L1 Test Accuracy:", accuracy_score(y_test, log_l1.predict(X_test)))

print("L2 Train Accuracy:", accuracy_score(y_train, log_l2.predict(X_train)))
print("L2 Test Accuracy:", accuracy_score(y_test, log_l2.predict(X_test)))



L1 Train Accuracy: 0.9868131868131869
L1 Test Accuracy: 0.9824561403508771
L2 Train Accuracy: 0.9098901098901099
L2 Test Accuracy: 0.956140350877193
