<a href="https://colab.research.google.com/github/mohamed-bahaa/APIs/blob/master/fetch_california_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# =========================================
# 1. IMPORT LIBRARIES
# =========================================
# We import the core libraries for:
# - Data handling (pandas, numpy)
# - Dataset loading (fetch_california_housing)
# - Model building and evaluation (scikit-learn)

In [17]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# =========================================
# 2. LOAD AND INSPECT DATA
# =========================================
# We load the California Housing dataset as a DataFrame.
# This dataset contains 8 features describing California districts
# and a target column 'MedHouseVal' (median house value, expressed
# in units of $100,000).

In [18]:
# Fetch the dataset; as_frame=True makes the pandas representation available.
cal = fetch_california_housing(as_frame=True)
# The returned object supports key access; "frame" holds features and target together.
df = cal["frame"]

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# =========================================
# 3. CHECK FOR MISSING VALUES
# =========================================
# We confirm that no column contains NaN values, since LinearRegression cannot be fit on data with missing entries.

In [19]:
# Count missing entries per column (isna is the same method as isnull).
df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


# =========================================
# 4. FEATURE-TARGET CORRELATION
# =========================================
# We calculate the correlation of each feature with the target.
# This helps us understand which features have the strongest
# linear relationship with house value.

In [20]:
# Pairwise correlations of all numeric columns, reduced to the target's column
# and ordered from the most positive to the most negative value.
correlations_with_target = (
    df.corr(numeric_only=True)
    .loc[:, 'MedHouseVal']
    .sort_values(ascending=False)
)
print(correlations_with_target)

MedHouseVal    1.000000
MedInc         0.688075
AveRooms       0.151948
HouseAge       0.105623
AveOccup      -0.023737
Population    -0.024650
Longitude     -0.045967
AveBedrms     -0.046701
Latitude      -0.144160
Name: MedHouseVal, dtype: float64


# =========================================
# 5. SPLIT FEATURES AND TARGET
# =========================================
# X = all features except the target
# y = target column (median house value)

In [21]:
# Target vector: the median house value column.
y = df.loc[:, 'MedHouseVal']
# Feature matrix: every remaining column (df itself is left unmodified).
X = df.drop('MedHouseVal', axis=1)

# =========================================
# 6. TRAIN-TEST SPLIT
# =========================================
# We split the data into:
# - 80% training set (for fitting the model)
# - 20% test set (for evaluating performance)

In [33]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report feature-matrix sizes and target sizes separately so that any
# mismatch between the two would be visible in the output.
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size:     {X_test.shape[0]}")
# The feature total must use X_test (the original mixed X_train with y_test,
# which hid any discrepancy between the feature and target splits).
print(f"Total size:        {X_train.shape[0] + X_test.shape[0]}")
print(f"Training Target size: {y_train.shape[0]}")
print(f"Test Target size:     {y_test.shape[0]}")
print(f"Total Target size:        {y_train.shape[0] + y_test.shape[0]}")

Training set size: 16512
Test set size:     4128
Total size:        20640
Training Target size: 16512
Test Target size:     4128
Total Target size:        20640


# =========================================
# 7. BASELINE LINEAR REGRESSION
# =========================================
# We train a simple Linear Regression model on the original features
# to establish a baseline performance for comparison.

In [34]:
# Fit a LinearRegression model directly on the unscaled training features.
lin = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred, y_test_pred = lin.predict(X_train), lin.predict(X_test)

# Mean squared error and R² for each split.
mse_train, r2_train = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
mse_test, r2_test = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

# One print call, one metric per line.
print(
    f"Baseline — MSE (Train): {mse_train:.4f}",
    f"Baseline — MSE (Test):  {mse_test:.4f}",
    f"Baseline — R² (Train):  {r2_train:.4f}",
    f"Baseline — R² (Test):   {r2_test:.4f}",
    sep="\n",
)


Baseline — MSE (Train): 0.5179
Baseline — MSE (Test):  0.5559
Baseline — R² (Train):  0.6126
Baseline — R² (Test):   0.5758


# =========================================
# 8. PCA WITH SCALING (5 COMPONENTS)
# =========================================
# PCA is scale-sensitive, so we standardize features first.
# Then we reduce the dataset to 5 principal components.
# Finally, we train Linear Regression on these components.

In [24]:
# Standardize -> keep 5 principal components -> linear regression,
# fitted on the training split in the same statement (fit returns the pipeline).
pipe_pca5 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('lin', LinearRegression()),
]).fit(X_train, y_train)

# Predictions for the held-out rows.
y_test_pred_pca5 = pipe_pca5.predict(X_test)

# Test-set metrics for the reduced-dimension model.
mse_pca5, r2_pca5 = (
    mean_squared_error(y_test, y_test_pred_pca5),
    r2_score(y_test, y_test_pred_pca5),
)

print(
    f"PCA(5) — MSE (Test): {mse_pca5:.4f}",
    f"PCA(5) — R² (Test):  {r2_pca5:.4f}",
    sep="\n",
)

PCA(5) — MSE (Test): 0.7431
PCA(5) — R² (Test):  0.4329


# =========================================
# 9. GRID SEARCH FOR BEST PCA COMPONENTS
# =========================================
# We test all possible numbers of PCA components (1 to 8)
# using 5-fold cross-validation to find the best R² score.

In [25]:
# Same three-stage pipeline as before, but the PCA component count is
# left unset so the grid search can supply it.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('lin', LinearRegression()),
])

# Candidate values: every integer from 1 up to the number of feature columns.
param_grid = {'pca__n_components': [*range(1, X_train.shape[1] + 1)]}

# 5-fold cross-validation scored by R²; n_jobs=-1 requests all available cores.
grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print(f"Best CV R²: {grid.best_score_:.4f}")

Best parameters: {'pca__n_components': 8}
Best CV R²: 0.6115


# =========================================
# 10. EVALUATE BEST PCA MODEL ON TEST SET
# =========================================
# GridSearchCV (with its default refit=True) has already refit the best
# PCA + Linear Regression pipeline on the full training set, so we take
# that fitted estimator and evaluate it on the test set.

In [26]:
# Pull the winning pipeline and its component count out of the finished search.
best_model = grid.best_estimator_
best_pca_k = grid.best_params_['pca__n_components']

# Score that pipeline on the held-out test split.
y_test_pred_best = best_model.predict(X_test)
mse_best, r2_best = (
    mean_squared_error(y_test, y_test_pred_best),
    r2_score(y_test, y_test_pred_best),
)

print(
    f"Best PCA(k={best_pca_k}) — MSE (Test): {mse_best:.4f}",
    f"Best PCA(k={best_pca_k}) — R² (Test):  {r2_best:.4f}",
    sep="\n",
)

Best PCA(k=8) — MSE (Test): 0.5559
Best PCA(k=8) — R² (Test):  0.5758


# =========================================
# 11. PERFORMANCE COMPARISON TABLE
# =========================================
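# We summarize all results in one table for easy comparison.
# Note: when the selected k equals the number of original features, PCA only
# rotates the standardized features, so that row reproduces the baseline metrics.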

In [27]:
# Build the comparison table column by column; each list is ordered
# baseline, fixed 5-component PCA, grid-search winner.
summary = pd.DataFrame({
    'Stage': [
        'Baseline (no PCA)',
        'PCA (5 components)',
        f'Best PCA (k={best_pca_k})',
    ],
    'Features': [X_train.shape[1], 5, best_pca_k],
    'MSE_Test': [mse_test, mse_pca5, mse_best],
    'R2_Test': [r2_test, r2_pca5, r2_best],
})

print(summary)

                Stage  Features  MSE_Test   R2_Test
0   Baseline (no PCA)         8  0.555892  0.575788
1  PCA (5 components)         5  0.743103  0.432923
2      Best PCA (k=8)         8  0.555892  0.575788
