In [23]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
# Fetch the California housing data as pandas objects. as_frame=True is what
# makes the combined features + target table (`housing.frame`) shown at the
# end of this cell available.
housing = fetch_california_housing(as_frame=True)

# Orientation: dimensions of the feature table and of the target on one line,
# then the feature names followed by the dataset's bundled description.
print(*(part.shape for part in (housing.data, housing.target)))
print(housing.feature_names, housing.DESCR, sep="\n")

# Trailing bare expression: rendered as this cell's rich output.
housing.frame

(20640, 8) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [24]:

from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.2,
    random_state=42,
)

# Baseline model: plain linear regression on the unscaled features.
# fit() returns the fitted estimator, so construction and training chain.
linear_model = LinearRegression().fit(X_train, y_train)

# R² on the held-out rows.
score = linear_model.score(X_test, y_test)
print(f"Linear Regression R² score: {score:.4f}")

Linear Regression R² score: 0.5758


In [29]:
from sklearn.ensemble import RandomForestRegressor

# Train on the split created in the linear-regression cell (test_size=0.2,
# random_state=42) rather than fetching and splitting the dataset again here.
# Re-fetching without as_frame=True would also rebind `housing` to a different
# return type and invalidate `housing.frame` for the cells above.

# Create and train the model; the fixed seed keeps the forest reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# One (feature name, importance) pair per line, in the dataset's column order.
for feature_importance in zip(housing.feature_names, rf_model.feature_importances_):
    print(feature_importance)

# Evaluate: R² on the held-out rows.
score = rf_model.score(X_test, y_test)
print(f"Random Forest R² score: {score:.4f}")

('MedInc', 0.5250374507714012)
('HouseAge', 0.05455772334006866)
('AveRooms', 0.044194922413444394)
('AveBedrms', 0.02961956224930236)
('Population', 0.030503186662991705)
('AveOccup', 0.13856367279657208)
('Latitude', 0.08885463157770296)
('Longitude', 0.08866885018851674)
Random Forest R² score: 0.8049


In [26]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Train on the split created in the linear-regression cell (test_size=0.2,
# random_state=42) rather than fetching and splitting the dataset again here.
# Re-fetching without as_frame=True would also rebind `housing` to a different
# return type and invalidate `housing.frame` for the cells above.

# Create pipeline with preprocessing and model. Keeping the scaler inside the
# pipeline means it is fitted on the training rows only and then re-applied,
# unchanged, when scoring the test rows.
# NOTE(review): tree-based boosting is generally insensitive to feature
# scaling, so the scaler is presumably here to demonstrate the Pipeline
# pattern — confirm before removing it.
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=42)),
])

# Train the pipeline (fits the scaler, then the regressor on the scaled data).
model_pipeline.fit(X_train, y_train)

# Evaluate: R² on the held-out rows.
score = model_pipeline.score(X_test, y_test)
print(f"Gradient Boosting Pipeline R² score: {score:.4f}")

Gradient Boosting Pipeline R² score: 0.7756
