In [3]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load dataset
df = pd.read_csv("/kaggle/input/housing-prices-dataset/Housing.csv")
df.dropna(inplace=True)

# Remove outliers using IQR
Q1, Q3 = df["price"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
df_cleaned = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)].copy()

# Feature engineering
df_cleaned["log_area"] = np.log(df_cleaned["area"])
features = ["log_area", "bedrooms", "bathrooms", "stories", "parking",
            "mainroad", "guestroom", "basement", "hotwaterheating",
            "airconditioning", "prefarea", "furnishingstatus"]
target = df_cleaned["price"]

X = df_cleaned[features]
y = target

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["log_area", "bathrooms", "bedrooms", "stories", "parking"]),
        ("cat", OneHotEncoder(), ["mainroad", "guestroom", "basement", "hotwaterheating",
                                  "airconditioning", "prefarea", "furnishingstatus"]),
    ]
)

# Pipeline
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(objective="reg:squarederror", random_state=42))
    ]
)

# Hyperparameter tuning
param_grid = {
    "regressor__n_estimators": [100, 200, 500],
    "regressor__learning_rate": [0.01, 0.05, 0.1],
    "regressor__max_depth": [3, 5, 7],
    "regressor__subsample": [0.8, 1.0],
    "regressor__colsample_bytree": [0.8, 1.0]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring="neg_mean_squared_error", verbose=1)
grid_search.fit(X, y)

# Best model
best_model = grid_search.best_estimator_
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best score (neg MSE): {grid_search.best_score_}")

# Cross-validation evaluation
mse_list, r2_list, mae_list = [], [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = best_model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse_list.append(mean_squared_error(y_test, y_pred))
    r2_list.append(r2_score(y_test, y_pred))
    mae_list.append(mean_absolute_error(y_test, y_pred))

    print(f"Fold {fold + 1}: MSE={mse_list[-1]}, R²={r2_list[-1]}, MAE={mae_list[-1]}")

# Cross-validation results
print("\nCross-Validation Results:")
print(f"Average MSE: {np.mean(mse_list):.2f}")
print(f"Average R²: {np.mean(r2_list):.4f}")
print(f"Average MAE: {np.mean(mae_list):.2f}")

# Get Predictions
# Example new data (replace with your actual data)
new_data = pd.DataFrame({
    "log_area": [8.5, 9.0, 7.8],
    "bedrooms": [3, 4, 2],
    "bathrooms": [2, 3, 1],
    "stories": [2, 3, 1],
    "parking": [1, 2, 0],
    "mainroad": ["yes", "yes", "no"],
    "guestroom": ["no", "yes", "no"],
    "basement": ["no", "yes", "no"],
    "hotwaterheating": ["no", "no", "no"],
    "airconditioning": ["yes", "yes", "no"],
    "prefarea": ["yes", "no", "no"],
    "furnishingstatus": ["furnished", "semi-furnished", "unfurnished"]
})

# Get predictions
predictions = best_model.predict(new_data)

# Print predictions
print("\nPredicted House Prices:")
print(predictions)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__subsample': 0.8}
Best score (neg MSE): -845086862452.5889
Fold 1: MSE=1149840969015.694, R²=0.6679274665556774, MAE=803013.2759433963
Fold 2: MSE=1164599392120.7954, R²=0.5917222778453479, MAE=844065.1344339623
Fold 3: MSE=515956421858.6102, R²=0.6361916696987691, MAE=579082.766509434
Fold 4: MSE=750675662815.648, R²=0.6915832353459215, MAE=657300.9599056604
Fold 5: MSE=644361866452.1969, R²=0.7300459914435349, MAE=636941.7075471698

Cross-Validation Results:
Average MSE: 845086862452.59
Average R²: 0.6635
Average MAE: 704080.77

Predicted House Prices:
[6913160.5 7696461.  2568069.2]
