In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

In [3]:
# --- 1. Load Dataset ---
# Fetch the California housing data as pandas objects; `.frame` is the single
# DataFrame holding the feature columns together with the target column.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Quick sanity check of what was loaded: dimensions, column names, first rows.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
display(df.head(5))

Dataset shape: (20640, 9)
Columns: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
# --- 2. Features & Target ---
X = df.drop("MedHouseVal", axis=1)  # all features
y = df["MedHouseVal"]               # target: median house value (in $100,000s)

# --- 3. Train/Test Split ---
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split (and therefore every number printed below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 4. Train Model ---
# Plain ordinary-least-squares regression on the raw, unscaled features.
model = LinearRegression()
model.fit(X_train, y_train)

# --- 5. Evaluate ---
# RMSE is expressed in the same units as the target ($100,000s).
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)
print()

# --- 6. Save Model ---
# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
MODEL_PATH = "house_price_prediction_model.joblib"
joblib.dump(model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

Root Mean Squared Error (RMSE): 0.7455813830127764

Model saved as house_price_model.joblib


In [5]:
# --- 7. Demo Predictions ---
print("\n--- Sample Predictions ---")

# Use the first 5 rows of the test split as demo houses. train_test_split
# shuffles by default, so these rows are an arbitrary but reproducible sample.
sample_data = X_test.iloc[:5]
sample_true = y_test.iloc[:5]
sample_pred = model.predict(sample_data)

# Walk features, true target and prediction in lockstep; targets are stored in
# units of $100,000, hence the scaling when formatting as dollars.
for house_no, ((_, features), actual, predicted) in enumerate(
    zip(sample_data.iterrows(), sample_true, sample_pred), start=1
):
    print(f"\nHouse {house_no}:")
    print(features.to_dict())
    print(f"True Value: ${actual*100000:,.0f}")
    print(f"Predicted: ${predicted*100000:,.0f}")


--- Sample Predictions ---

House 1:
{'MedInc': 1.6812, 'HouseAge': 25.0, 'AveRooms': 4.192200557103064, 'AveBedrms': 1.0222841225626742, 'Population': 1392.0, 'AveOccup': 3.8774373259052926, 'Latitude': 36.06, 'Longitude': -119.01}
True Value: $47,700
Predicted: $71,912

House 2:
{'MedInc': 2.5313, 'HouseAge': 30.0, 'AveRooms': 5.039383561643835, 'AveBedrms': 1.1934931506849316, 'Population': 1565.0, 'AveOccup': 2.6797945205479454, 'Latitude': 35.14, 'Longitude': -119.46}
True Value: $45,800
Predicted: $176,402

House 3:
{'MedInc': 3.4801, 'HouseAge': 52.0, 'AveRooms': 3.977154724818276, 'AveBedrms': 1.185877466251298, 'Population': 1310.0, 'AveOccup': 1.3603322949117342, 'Latitude': 37.8, 'Longitude': -122.44}
True Value: $500,001
Predicted: $270,966

House 4:
{'MedInc': 5.7376, 'HouseAge': 17.0, 'AveRooms': 6.163636363636364, 'AveBedrms': 1.02020202020202, 'Population': 1705.0, 'AveOccup': 3.4444444444444446, 'Latitude': 34.28, 'Longitude': -118.72}
True Value: $218,600
Predicted: 