#**Part 1: Regression Task (California Housing)**

**Task 1: Load and Split Dataset**

In [29]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices


In [30]:
import numpy as np

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score



In [31]:
import os
import pandas as pd
# Load dataset from the path downloaded by kagglehub:
df = pd.read_csv(os.path.join(path, "housing.csv"))

# Handle missing values in 'total_bedrooms' by imputing with median:
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
print(df.head())

# Perform one-hot encoding on the "ocean_proximity" column
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first= False)

# Separate features (X) and target (Y)
# the correct target column name in this dataset is 'median_house_value':
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


**Task 2: Complete all the Task**

*• Regression Task (California Housing):*

In [32]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

print("===Baseline Linear Regression===")
print("coefficients:", lin_reg.coef_)
print("intercept:", lin_reg.intercept_)

===Baseline Linear Regression===
coefficients: [-2.68382734e+04 -2.54683520e+04  1.10218508e+03 -6.02150567e+00
  1.02789395e+02 -3.81729064e+01  4.82527528e+01  3.94739752e+04
 -1.89265829e+04 -5.87132390e+04  1.17198490e+05 -2.40632251e+04
 -1.54954428e+04]
intercept: -2256620.7988547315


In [33]:
train_pred = lin_reg.predict(X_train)
test_pred = lin_reg.predict(X_test)

print("===Baseline Linear Regression===")
print("Train MSE:", mean_squared_error(y_train, train_pred))
print("Test MSE:", mean_squared_error(y_test, test_pred))

===Baseline Linear Regression===
Train MSE: 4683203783.504252
Test MSE: 4908476721.156615


In [34]:
alpha_grid = {"alpha": np.logspace(-3, 0, 13)} # 0.001....1

ridge= Ridge(random_state=42)
lasso = Lasso(random_state=42, max_iter= 10000)

In [35]:
ridge_cv = GridSearchCV(ridge, alpha_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
lasso_cv = GridSearchCV(lasso, alpha_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)


In [37]:
ridge_cv.fit(X_train, y_train)
lasso_cv.fit(X_train, y_train)




  model = cd_fast.enet_coordinate_descent(


In [38]:
print("===Hperparameter Tuning===")
print("Best Ridge Alpha:", ridge_cv.best_params_["alpha"])
print("Best Ridge CV MSE:", -ridge_cv.best_score_)
print("Best Lasso Alpha:", lasso_cv.best_params_["alpha"])
print("Best Lasso CV MSE:", -lasso_cv.best_score_)

===Hperparameter Tuning===
Best Ridge Alpha: 0.5623413251903491
Best Ridge CV MSE: 4711027514.741423
Best Lasso Alpha: 0.001
Best Lasso CV MSE: 4711122401.897192


#**Part 2: Classification Task (Breast Cancer)**

**Task 1: Load and Split Dataset**

In [14]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/breast-cancer-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'breast-cancer-dataset' dataset.
Path to dataset files: /kaggle/input/breast-cancer-dataset


In [16]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Load dataset from the path downloaded by kagglehub:
df = pd.read_csv(path + "/breast-cancer.csv")
df.head()

# Separate features (X) and target (Y)
# the correct target column name in this dataset is 'median_house_value':
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Feature scaling
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42 )

**Task 2: Complete all the Task**

**• Classification Task (Diabetes):**


*– Step 1: Baseline Model (No Regularization)*

In [17]:
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)


In [19]:
log_reg.coef_


array([[-3.01637904e-10, -3.23160811e-02,  5.27405809e-02,
        -1.39827414e-01, -1.15402462e-02, -1.67793251e-05,
         1.54281110e-03,  2.46227821e-03,  9.69128312e-04,
        -2.24964017e-05, -1.03198493e-04, -3.69459455e-04,
         1.73860192e-03,  5.48837561e-03,  5.50329661e-02,
         1.98312448e-05,  4.14791342e-04,  5.33853725e-04,
         1.23529273e-04,  5.31516728e-05,  2.73863164e-05,
        -3.38152626e-02,  9.81815402e-02, -9.09567576e-02,
         3.11802682e-02,  1.98586223e-04,  5.65812068e-03,
         7.24768800e-03,  1.85711230e-03,  8.65418872e-04,
         2.94470144e-04]])

In [20]:
train_acc = accuracy_score(y_train, log_reg.predict(X_train))
test_acc = accuracy_score(y_test, log_reg.predict(X_test))

print("Baseline Training Accuracy:", train_acc)
print("Baseline Test Accuracy:", test_acc)


Baseline Training Accuracy: 0.9208791208791208
Baseline Test Accuracy: 0.956140350877193


*– Step 2: Hyperparameter Tuning*

In [21]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}


In [22]:
grid = GridSearchCV(
    LogisticRegression(max_iter=5000),
    param_grid,
    cv=5,
    scoring="accuracy"
)

grid.fit(X_train, y_train)


In [23]:
print("Best Parameters:", grid.best_params_)


Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [24]:
best_model = grid.best_estimator_

train_acc = accuracy_score(y_train, best_model.predict(X_train))
test_acc = accuracy_score(y_test, best_model.predict(X_test))

print("Tuned Training Accuracy:", train_acc)
print("Tuned Test Accuracy:", test_acc)


Tuned Training Accuracy: 0.9868131868131869
Tuned Test Accuracy: 0.9824561403508771


*– Step 3: Regularization Experiments*

*For L1 Regularization:*

In [25]:
log_l1 = LogisticRegression(
    penalty="l1",
    C=grid.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l1.fit(X_train, y_train)


*For L2 Regularization:*

In [26]:
log_l2 = LogisticRegression(
    penalty="l2",
    C=grid.best_params_["C"],
    solver="liblinear",
    max_iter=5000
)

log_l2.fit(X_train, y_train)


In [27]:
print("L1 Coefficients:\n", log_l1.coef_)
print("\nL2 Coefficients:\n", log_l2.coef_)


L1 Coefficients:
 [[ 4.32035918e-09 -5.45513300e-01  1.10176611e-01 -8.84878290e-02
   1.39390714e-03  0.00000000e+00 -5.63692367e+01  4.04084126e+00
   1.60779365e+02 -2.30404605e+01  0.00000000e+00  1.04900834e+00
  -1.45177111e+00 -5.75514531e-01  2.22373668e-01  0.00000000e+00
  -1.82886791e+01 -6.11440103e+01  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -1.69023961e-01  3.86018590e-01 -3.30575602e-02
   1.43885711e-02  5.32877377e-03 -4.11299438e+00  1.92935440e+01
   1.91455994e+01  2.54206594e+01  0.00000000e+00]]

L2 Coefficients:
 [[-8.07161893e-10 -5.28163697e-04 -9.60466573e-04 -3.22984181e-03
  -4.56820539e-03 -5.41454407e-06 -1.71702539e-06  3.02661197e-06
   1.60284122e-06 -1.02805371e-05 -4.11975083e-06 -2.97213407e-06
  -7.77325615e-05 -1.79794677e-05  1.05164425e-03 -4.73720995e-07
  -8.60438676e-07 -9.39192453e-07 -3.71296684e-07 -1.30901885e-06
  -2.24882925e-07 -4.98817405e-04 -1.20684227e-03 -3.03582434e-03
   4.28644772e-03 -7.00526314e-06 -1.58072733e-06  4.

In [28]:
print("L1 Train Accuracy:", accuracy_score(y_train, log_l1.predict(X_train)))
print("L1 Test Accuracy:", accuracy_score(y_test, log_l1.predict(X_test)))

print("L2 Train Accuracy:", accuracy_score(y_train, log_l2.predict(X_train)))
print("L2 Test Accuracy:", accuracy_score(y_test, log_l2.predict(X_test)))


L1 Train Accuracy: 0.9868131868131869
L1 Test Accuracy: 0.9824561403508771
L2 Train Accuracy: 0.8923076923076924
L2 Test Accuracy: 0.9473684210526315
