## Model Validation and Hyperparameter Tuning for House Price Prediction 
#### Objective
To validate and improve the house price prediction model by controlling overfitting and applying hyperparameter tuning.

### Libraries Import

In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn utilities
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

### Dataset Load

In [2]:
# Load the California housing dataset as pandas objects, then place the target
# next to the feature columns under the name "HousePrice".
data = fetch_california_housing(as_frame=True)
df = data.data.join(data.target.rename("HousePrice"))

In [3]:
# Preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,HousePrice
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
# Separate the prediction target (y) from the predictor columns (X)
y = df["HousePrice"]
X = df.drop(columns=["HousePrice"])

In [5]:
# Standardize every feature column so all predictors share a common scale.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split below, so test-set statistics feed into the scaling.
# Consider fitting on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


### Train-Test Split


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

### Baseline Decision Tree (Overfitting Check)

In [7]:
# Baseline: train a Decision Tree without any depth/size constraints so the
# gap between training and testing error shows how strongly it overfits.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

# Predictions on the data the tree was trained on and on the held-out data
train_pred = tree.predict(X_train)
test_pred = tree.predict(X_test)

# RMSE comparison. RMSE is computed as sqrt(MSE) rather than via
# mean_squared_error(..., squared=False): that flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

print("Training RMSE:", train_rmse)
print("Testing RMSE :", test_rmse)


Training RMSE: 3.218325866275131e-16
Testing RMSE : 0.7030445773467542


##### Cross-Validation

In [8]:
# 5-fold cross-validation of the unconstrained tree on the full scaled dataset
cv_scores = cross_val_score(
    estimator=tree,
    X=X_scaled,
    y=y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# The scorer reports negated RMSE, so flip the sign of the fold average
cv_rmse = -np.mean(cv_scores)
cv_rmse

0.8957031908951016

##### Hyperparameter Grid

In [9]:
# Candidate values for the two tree-complexity settings being tuned
param_grid = {
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 5, 10],
}

# Exhaustive 5-fold search over the grid, fitted on the training split only
grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)

# Parameter combination with the best cross-validated score
grid.best_params_

{'max_depth': 10, 'min_samples_split': 10}

In [10]:
# Best tuned model: the estimator selected by the grid search
best_tree = grid.best_estimator_

# Predictions on test data
y_pred = best_tree.predict(X_test)

# Evaluation metrics. RMSE is taken as sqrt(MSE), matching the other model
# cells; mean_squared_error(..., squared=False) is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

rmse, r2

(0.6454300828015771, 0.6820992539714815)

#### Implementation of Regression Algorithms

In [11]:
# Linear Regression: fit on the training split and predict the held-out rows
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Test-set metrics
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))

print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression R2  :", lr_r2)

Linear Regression RMSE: 0.7455813830127763
Linear Regression R2  : 0.575787706032451


In [12]:
# Ridge Regression: linear model with L2 penalty strength alpha=1.0
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# Test-set metrics
ridge_r2 = r2_score(y_test, ridge_pred)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))

print("Ridge Regression RMSE:", ridge_rmse)
print("Ridge Regression R2  :", ridge_r2)

Ridge Regression RMSE: 0.7455542909384607
Ridge Regression R2  : 0.5758185345441325


#### Comparative Performance Analysis

In [13]:
# Model comparison table: one row per model, test-set RMSE and R2 side by side
results = {
    "Model": ["Linear Regression", "Ridge Regression", "Tuned Decision Tree"],
    "RMSE": [lr_rmse, ridge_rmse, rmse],
    "R2 Score": [lr_r2, ridge_r2, r2],
}

results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,Model,RMSE,R2 Score
0,Linear Regression,0.745581,0.575788
1,Ridge Regression,0.745554,0.575819
2,Tuned Decision Tree,0.64543,0.682099


### Conclusion 
The tuned Decision Tree model was chosen as the final model because it showed better performance on the test data. It produced a higher R² score and a lower RMSE value than both the Linear Regression and Ridge Regression models. The unconstrained Decision Tree was overfitting the data (near-zero training RMSE versus a much larger test RMSE), but after applying hyperparameter tuning, the model became more balanced and stable. Cross-validation was also performed to check the consistency of the model across different data splits instead of relying only on a single train-test split.
This approach balances model performance and generalization, which is critical for real-world machine learning applications.