In [1]:
!pip install -q xgboost


In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


In [3]:
# Load the California housing data as a DataFrame and give the label column a
# neutral name.
data = fetch_california_housing(as_frame=True)
# `rename` without `inplace` returns a new frame, so `data.frame` keeps its
# original column names and `df` is derived from it rather than mutated.
df = data.frame.rename(columns={"MedHouseVal": "target"})

df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
# Schema check: info() prints column dtypes and non-null counts to stdout.
df.info()
# Last expression of the cell, so the summary-statistics frame is what the
# notebook renders as the cell's output.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [5]:
# Every column except the label is treated as a numeric feature and standardized.
num_cols = df.columns.drop('target').tolist()

scaling_step = ('scale', StandardScaler(), num_cols)
preprocessor = ColumnTransformer(transformers=[scaling_step])


In [6]:
# Separate the label from the feature matrix, then hold out 20% of the rows
# for testing with a fixed seed.
y = df['target']
X = df.drop(columns='target')

split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

X_train.shape, X_test.shape


((16512, 8), (4128, 8))

In [8]:
# Baseline model: standardized features fed into a linear regression.
lr_model = Pipeline([
    # Clone so this pipeline fits its own preprocessor; passing the shared
    # `preprocessor` object would let every pipeline refit the same instance
    # in place.
    ('pre', clone(preprocessor)),
    ('lr', LinearRegression())
])

lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# RMSE is taken as the square root of the MSE; R² is computed on the same
# held-out rows.
lr_rmse = mean_squared_error(y_test, lr_pred) ** 0.5
lr_r2 = r2_score(y_test, lr_pred)

print("Linear Regression → RMSE:", lr_rmse, " R²:", lr_r2)


Linear Regression → RMSE: 0.7455813830127763  R²: 0.575787706032451


In [11]:
# Random forest on the same standardized features; seed fixed for repeatable
# results, all cores used for fitting.
rf_model = Pipeline([
    # Clone so this pipeline fits its own preprocessor; passing the shared
    # `preprocessor` object would let every pipeline refit the same instance
    # in place.
    ('pre', clone(preprocessor)),
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# RMSE is taken as the square root of the MSE; R² is computed on the same
# held-out rows.
rf_rmse = mean_squared_error(y_test, rf_pred) ** 0.5
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest → RMSE:", rf_rmse, " R²:", rf_r2)


Random Forest → RMSE: 0.5038019900730704  R²: 0.8063074586513359


In [12]:
# Gradient-boosted trees on the same standardized features. The step name
# 'xgb' is what the grid-search parameter keys ('xgb__...') refer to.
xgb_model = Pipeline([
    # Clone so this pipeline fits its own preprocessor; passing the shared
    # `preprocessor` object would let every pipeline refit the same instance
    # in place.
    ('pre', clone(preprocessor)),
    ('xgb', XGBRegressor(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        n_jobs=-1
    ))
])

xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

# RMSE is taken as the square root of the MSE; R² is computed on the same
# held-out rows.
xgb_rmse = mean_squared_error(y_test, xgb_pred) ** 0.5
xgb_r2 = r2_score(y_test, xgb_pred)

print("XGBoost → RMSE:", xgb_rmse, " R²:", xgb_r2)


XGBoost → RMSE: 0.45664349131075066  R²: 0.8408716044998452


In [13]:
# Side-by-side printout of test RMSE and R² for the three fitted pipelines.
print("MODEL COMPARISON")
print("-----------------")
for label, model_rmse, model_r2 in (
    ("Linear Regression:", lr_rmse, lr_r2),
    ("Random Forest:", rf_rmse, rf_r2),
    ("XGBoost:", xgb_rmse, xgb_r2),
):
    print(label, model_rmse, model_r2)


MODEL COMPARISON
-----------------
Linear Regression: 0.7455813830127763 0.575787706032451
Random Forest: 0.5038019900730704 0.8063074586513359
XGBoost: 0.45664349131075066 0.8408716044998452


In [14]:
# 2 x 2 x 2 grid over the XGBoost step of `xgb_model`; the `xgb__` prefix
# routes each parameter to the pipeline step named 'xgb'.
param_grid = dict(
    xgb__n_estimators=[200, 300],
    xgb__learning_rate=[0.05, 0.1],
    xgb__max_depth=[4, 6],
)

# 3-fold cross-validation on the training split only, scored by negated RMSE.
grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best Params:", grid.best_params_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Params: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 300}


In [15]:
# Score the tuned pipeline on the held-out rows.
best_pred = best_model.predict(X_test)

r2 = r2_score(y_test, best_pred)
# RMSE is derived by hand as sqrt(MSE) rather than via a `squared=False`
# argument -- presumably because that argument is not available in the
# installed scikit-learn; confirm before switching back.
rmse = mean_squared_error(y_test, best_pred) ** 0.5

print("Best Tuned Model → RMSE:", rmse, " R²:", r2)


Best Tuned Model → RMSE: 0.45664349131075066  R²: 0.8408716044998452


In [16]:
# Test-set RMSE per candidate. The tuned pipeline (`best_model`, scored as
# `rmse`) is included because it is the object this notebook saves to disk;
# leaving it out would make the comparison silent about the saved model.
results = {
    "Linear Regression": lr_rmse,
    "Random Forest": rf_rmse,
    "XGBoost": xgb_rmse,
    "XGBoost (tuned)": rmse,
}

# Lowest RMSE wins; on an exact tie `min` keeps the earliest entry.
best_model_name = min(results, key=results.get)
print("Best model based on RMSE is:", best_model_name)


Best model based on RMSE is: XGBoost


In [17]:
# Persist the tuned pipeline (preprocessing step plus regressor) with joblib.
MODEL_PATH = "house_price_best_model.pkl"
joblib.dump(best_model, MODEL_PATH)
print(f"Saved as {MODEL_PATH}")


Saved as house_price_best_model.pkl


In [27]:
# Spot-check one held-out row: position 5 of the test split.
ROW_POSITION = 5
idx = X_test.index[ROW_POSITION]   # index label the row carries from the original dataset

# The list selector keeps a one-row DataFrame, which is what predict() is given.
sample = X_test.iloc[[ROW_POSITION]]
actual = y_test.loc[idx]
pred = best_model.predict(sample)

print(sample)
print("Predicted:", pred[0])
print("Actual:", actual)


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
13311  4.7147      12.0  5.251483   0.975089      2400.0  2.846975     34.08   

       Longitude  
13311    -117.61  
Predicted: 1.534694
Actual: 1.587
