In [18]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [2]:
# Fetch the California Housing data as pandas objects; scikit-learn uses
# ./scikit_learn_data as its download/cache folder.
Data = fetch_california_housing(
    data_home='./scikit_learn_data',
    as_frame=True,
)

# With as_frame=True, `data` is already a DataFrame and `target` a Series,
# so copying them is enough to get objects independent of the returned Bunch.
X, Y = Data.data.copy(), Data.target.copy()

print("Loaded California Housing dataset.")
print('X shape', X.shape, ' \n', 'Y shape', Y.shape)

# Preview the first rows of the features, then of the target.
display(X.head())
display(pd.DataFrame({'MedianHousevalueTarget': Y}).head())

Loaded California Housing dataset.
X shape (20640, 8)  
 Y shape (20640,)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


Unnamed: 0,MedianHousevalueTarget
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [3]:
# Count missing values per feature column (`isna` is the same check as `isnull`).
X.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [4]:
# Single seed reused by the split and by the seeded estimators, for reproducibility.
RANDOM_STATE = 42

# Hold out 20% of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Show the training features as the cell output.
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14196,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03
8267,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16
17445,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48
14265,1.9425,36.0,4.002817,1.033803,1418.0,3.994366,32.69,-117.11
2271,3.5542,43.0,6.268421,1.134211,874.0,2.300000,36.78,-119.80
...,...,...,...,...,...,...,...,...
11284,6.3700,35.0,6.129032,0.926267,658.0,3.032258,33.78,-117.96
11964,3.0500,33.0,6.868597,1.269488,1753.0,3.904232,34.02,-117.43
5390,2.9344,36.0,3.986717,1.079696,1756.0,3.332068,34.03,-118.38
860,5.7192,15.0,6.395349,1.067979,1777.0,3.178891,37.58,-121.96


In [5]:
# Show the training target produced by the train/test split as the cell output.
Y_train

14196    1.030
8267     3.821
17445    1.726
14265    0.934
2271     0.965
         ...  
11284    2.292
11964    0.978
5390     2.221
860      2.835
15795    3.250
Name: MedHouseVal, Length: 16512, dtype: float64

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [7]:
def _make_pipeline(model, scale=False):
    """Build a Pipeline: median imputation, optional standardization, then `model`.

    A new SimpleImputer (and a new StandardScaler when `scale` is true) is
    created on every call, so no transformer instance is shared between
    pipelines.
    """
    steps = [('imputer', SimpleImputer(strategy='median'))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    return Pipeline(steps=steps)


# Only the linear model and the SVR get a scaling step; the tree-based models
# are fitted on the imputed features directly.
pipelines = {
    'Linear Regression': _make_pipeline(LinearRegression(), scale=True),
    'Decision Tree': _make_pipeline(
        DecisionTreeRegressor(random_state=RANDOM_STATE)
    ),
    'Random Forest': _make_pipeline(
        RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
    ),
    'Gradient Boosting': _make_pipeline(
        GradientBoostingRegressor(random_state=RANDOM_STATE)
    ),
    'SVR (RBF kernel)': _make_pipeline(
        SVR(kernel='rbf', C=10.0, epsilon=0.1), scale=True
    ),
}

print('Defined models:', ', '.join(pipelines.keys()))

Defined models: Linear Regression, Decision Tree, Random Forest, Gradient Boosting, SVR (RBF kernel)


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [13]:
def _evaluate(label, estimator):
    """Fit `estimator` on the training split and score it on the test split.

    Returns one result row: [label, MSE, MAE, R2].
    """
    estimator.fit(X_train, Y_train)
    y_hat = estimator.predict(X_test)
    return [
        label,
        mean_squared_error(Y_test, y_hat),
        mean_absolute_error(Y_test, y_hat),
        r2_score(Y_test, y_hat),
    ]


# One row per pipeline, in the order the pipelines were defined.
Results = [_evaluate(label, estimator) for label, estimator in pipelines.items()]

# Rank the models from highest to lowest R2 and renumber the rows 0..n-1.
Results_DF = (
    pd.DataFrame(Results, columns=['Model', 'MSE', 'MAE', 'R2'])
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
)
display(Results_DF)

Unnamed: 0,Model,MSE,MAE,R2
0,Random Forest,0.253434,0.326607,0.8066
1,Gradient Boosting,0.293997,0.371643,0.775645
2,SVR (RBF kernel),0.323697,0.377445,0.75298
3,Decision Tree,0.495235,0.454679,0.622076
4,Linear Regression,0.555892,0.5332,0.575788


In [14]:
# Results_DF is sorted by R2 in descending order, so the first row is the
# best model and the last row is the worst.
Best_Row, Worst_Row = Results_DF.iloc[0], Results_DF.iloc[-1]

# Print each heading followed by its row rendered as a one-row table.
for heading, row in (
    ('Best Model by R2 :', Best_Row),
    ('Worst Model by R2:', Worst_Row),
):
    print(heading)
    display(row.to_frame().T)

Best Model by R2 :


Unnamed: 0,Model,MSE,MAE,R2
0,Random Forest,0.253434,0.326607,0.8066


Worst Model by R2:


Unnamed: 0,Model,MSE,MAE,R2
4,Linear Regression,0.555892,0.5332,0.575788


In [17]:
# Summarize the best and worst rows of the results table as Markdown-style bullets.
# Note: on a shared test set the highest R2 always coincides with the lowest MSE
# (R2 = 1 - MSE / variance of the target). The MAE ordering agrees for the
# results shown in this notebook, but that is not guaranteed in general.
print(
    f"- **Best:** {Best_Row['Model']} achieved the highest R2 Value ({Best_Row['R2']:.3f}) "
    f"and the lowest error values (MSE={Best_Row['MSE']:.3f}, MAE={Best_Row['MAE']:.3f})"
)

print(
    f"- **Worst:** {Worst_Row['Model']} had the lowest R2 ({Worst_Row['R2']:.3f}), "
    f"indicating it explains less variance; its error values "
    f"(MSE={Worst_Row['MSE']:.3f}, MAE={Worst_Row['MAE']:.3f}) are comparatively higher"
)

- **Best:** Random Forest achieved the highest R2 Value (0.807) and the lowest value (MSE=0.253, MAE=0.327)
- **Worst:** Linear Regression had the lowset R2 (0.576), indicating it explains less variance, its error values (MSE=0.556, MAE=0.533) are comparatively higher 


**Explanations of Each Algorithm**

- **1. Linear Regression** - Fits a hyperplane that minimizes the sum of squared errors. The coefficients show the strength of each feature's effect.
     *Why* - If the relationship between the features and the price is approximately linear, it can perform well while remaining very interpretable.
     
     
- **2. Decision Tree Regressor** - Splits the feature space into regions using threshold rules and predicts the average target value in each leaf.
     *Why* - It captures non-linear relationships and interactions without needing feature scaling. It can overfit unless constrained.
      
      
- **3. Gradient Boosting Regressor** - Builds trees sequentially, where each new tree tries to correct the errors of the combined previous trees.
     *Why* - It often delivers state-of-the-art results on structured data and handles non-linearities and interactions well.
      
      
- **4. Random Forest Regressor** - An ensemble of many decision trees trained on bootstrapped samples and random feature subsets; their predictions are averaged.
     *Why* - It reduces the variance of a single tree, improving generalization. A great default for tabular data with complex patterns.
     
     
- **5. Support Vector Regressor** - Uses a kernel to implicitly map features to a higher-dimensional space and fits a function within an ε-insensitive tube, controlled by C (regularization) and γ (kernel width).
     *Why* - It can model smooth non-linear trends; it benefits strongly from feature scaling.
     