In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [2]:
# Fetch the California Housing dataset that ships with scikit-learn
housing = fetch_california_housing()

# Show the dataset description followed by the list of feature names
print(housing.DESCR, housing.feature_names, sep="\n")

# Wrap the feature matrix and the target vector in DataFrames:
# `data` has one column per feature, `target` has a single "Price" column
data = pd.DataFrame(data=housing.data, columns=housing.feature_names)
target = pd.DataFrame({"Price": housing.target})

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [3]:
# Preview the first rows of the feature DataFrame
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
# Preview the first rows of the target DataFrame (single "Price" column)
target.head()

Unnamed: 0,Price
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [5]:
# Binarize the target: 1 when the median house value is at most $100k, 0 otherwise.
# The raw target is expressed in units of $100,000, hence the threshold of 1.
#
# The labels are derived from the untouched `housing.target` rather than from
# `target['Price']` itself so that this cell is idempotent: thresholding the
# already-binarized column on a second run would turn every label into 1.
target['Price'] = (housing.target <= 1).astype(int)
target.head()

Unnamed: 0,Price
0,0
1,0
2,0
3,0
4,0


In [6]:
# Feature Selection: a subset of features for modeling
selected_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population']
X = data[selected_features]
y = target['Price']

# Data Splitting: hold out 20% of the rows as a test set BEFORE any fitting,
# so that nothing learned from the test rows can influence preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Data Preparation: standardize the features with StandardScaler.
# The scaler's mean/variance are estimated on the training rows only and then
# applied unchanged to the test rows; fitting on the full dataset would leak
# test-set statistics into the training pipeline.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that any
# code still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)


In [7]:
# Model Selection and Tuning

# kNN: 5-fold grid search over the number of neighbours (1 through 20)
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': np.arange(1, 21)}
knn_cv = GridSearchCV(knn, knn_params, cv=5)
knn_cv.fit(X_train, y_train)

knn_best_params = knn_cv.best_params_
print("KNN best parameters:", knn_best_params )

# GridSearchCV (refit=True by default) has already refit an estimator with the
# best parameters on the whole training set, so reuse it instead of building
# and fitting a second, identical model.
knn_best_model = knn_cv.best_estimator_
knn_best_model

KNN best parameters: {'n_neighbors': 15}


In [8]:
# Ridge classifier: 5-fold grid search over the regularization strength alpha
ridge = RidgeClassifier()
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_cv = GridSearchCV(ridge, ridge_params, cv=5)
ridge_cv.fit(X_train, y_train)
ridge_best_params = ridge_cv.best_params_

# GridSearchCV (refit=True by default) has already refit an estimator with the
# best alpha on the whole training set, so reuse it instead of fitting again.
ridge_best_model = ridge_cv.best_estimator_
ridge_best_model

In [9]:
# LASSO Regression: 5-fold grid search over the regularization strength alpha
# NOTE(review): Lasso is a regression estimator fitted here on the 0/1 labels,
# so its predictions are continuous scores rather than class labels - confirm
# this is the intended way to include an L1-penalized linear model.
lasso = Lasso()
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_cv = GridSearchCV(lasso, lasso_params, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_best_params = lasso_cv.best_params_

# GridSearchCV (refit=True by default) has already refit an estimator with the
# best alpha on the whole training set, so reuse it instead of fitting again.
lasso_best_model = lasso_cv.best_estimator_
lasso_best_model

In [10]:
# Logistic Regression with L1 regularization: 5-fold grid search over C
# (the liblinear solver is used because it supports the L1 penalty)
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear')
logreg_l1_params = {'C': [0.01, 0.1, 1, 10, 100]}
logreg_l1_cv = GridSearchCV(logreg_l1, logreg_l1_params, cv=5)
logreg_l1_cv.fit(X_train, y_train)
logreg_l1_best_params = logreg_l1_cv.best_params_

# GridSearchCV (refit=True by default) has already refit an estimator with the
# best C on the whole training set. Reusing it avoids a redundant fit and
# avoids repeating the penalty/solver settings by hand.
logreg_l1_best_model = logreg_l1_cv.best_estimator_
logreg_l1_best_model

In [11]:
# Logistic Regression with L2 regularization: 5-fold grid search over C
# (max_iter is raised to 1000 iterations for the solver)
logreg_l2 = LogisticRegression(penalty='l2',max_iter=1000)
logreg_l2_params = {'C': [0.01, 0.1, 1, 10, 100]}

logreg_l2_cv = GridSearchCV(logreg_l2, logreg_l2_params, cv=5)

logreg_l2_cv.fit(X_train, y_train)

logreg_l2_best_params = logreg_l2_cv.best_params_

# GridSearchCV (refit=True by default) has already refit an estimator with the
# best C on the whole training set. Reusing it avoids a redundant fit and
# avoids repeating the penalty/max_iter settings by hand.
logreg_l2_best_model = logreg_l2_cv.best_estimator_
logreg_l2_best_model

In [12]:
# Model Evaluation
# Labels and tuned estimators are kept in two parallel lists (same order).
model_names = ['kNN', 'Ridge', 'Lasso', 'Logistic L1', 'Logistic L2']
models = [
    knn_best_model,
    ridge_best_model,
    lasso_best_model,
    logreg_l1_best_model,
    logreg_l2_best_model,
]

# Report the mean 5-fold cross-validated ROC AUC of each model on the training split
for name, model in zip(model_names, models):
    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
    print(f'{name} - Mean ROC AUC: {scores.mean():.4f}')

kNN - Mean ROC AUC: 0.8899
Ridge - Mean ROC AUC: 0.8897
Lasso - Mean ROC AUC: 0.8709
Logistic L1 - Mean ROC AUC: 0.8899
Logistic L2 - Mean ROC AUC: 0.8899


In [13]:
# Score the held-out test set with the model chosen above: Logistic L2
y_pred = logreg_l2_best_model.predict(X_test)

# Compute every metric up front from the same predictions...
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ...then report them: confusion matrix, accuracy, F1 score
print(f'Confusion Matrix:\n{cm}')
print(f'Accuracy: {accuracy :.4f}')
print(f'F1 Score: {f1:.4f}')

Confusion Matrix:
[[3220  169]
 [ 390  349]]
Accuracy: 0.8646
F1 Score: 0.5553
