In [10]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from scipy.stats import skew

# -----------------------------
# 1. Data Loading
# -----------------------------
# Read the training and test tables from the local data directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
# Keep the test identifiers aside; they are reused when the submission is written.
test_ID = test['Id']

# -----------------------------
# 2. Preprocessing & Feature Engineering
# -----------------------------
# Log-transform the target variable for a better distribution
train["SalePrice"] = np.log1p(train["SalePrice"])

# Combine train and test data for consistent preprocessing
# (same imputation, same skew correction, same dummy columns).
ntrain = train.shape[0]
all_data = pd.concat((train.drop('SalePrice', axis=1), test)).reset_index(drop=True)
# NOTE(review): 'Id' is never dropped, so if it is numeric it is carried
# through as a model feature -- confirm that is intended.

# Fill missing values:
# - For numeric features, fill with the median
# - For categorical features, fill with "None"
# The result is assigned back instead of calling
# `all_data[col].fillna(..., inplace=True)`: that form mutates a temporary
# column selection, is deprecated by pandas, and leaves `all_data` unchanged
# under copy-on-write.
# NOTE(review): medians are computed over train and test rows together.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
all_data = all_data.fillna(all_data[numeric_feats].median())

categorical_feats = all_data.select_dtypes(include=[object]).columns
all_data = all_data.fillna({col: "None" for col in categorical_feats})

# Transform skewed numeric features (threshold: absolute skew > 0.75)
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_features = skewed_feats[abs(skewed_feats) > 0.75].index
print("Skewed features:", list(skewed_features))
for feat in skewed_features:
    all_data[feat] = np.log1p(all_data[feat])

# One-hot encode categorical features
all_data = pd.get_dummies(all_data)

# Split the data back into training and test sets; the first `ntrain` rows
# are the training rows because train was concatenated first.
train_data = all_data[:ntrain]
test_data = all_data[ntrain:]
y = train["SalePrice"]

# -----------------------------
# 3. Train / Holdout Split for Grid Search
# -----------------------------
# 80% of the training rows feed the grid searches; the other 20% are kept
# back to score the tuned models. The seed is fixed for repeatability.
holdout_split = train_test_split(train_data, y, test_size=0.2, random_state=42)
X_train, X_holdout, y_train, y_holdout = holdout_split

# -----------------------------
# 4. Grid Search with XGBoost
# -----------------------------
# A deliberately small grid (tree shape, shrinkage, ensemble size and L1/L2
# regularization) so the exhaustive search stays cheap.
param_grid = dict(
    max_depth=[3, 4],
    learning_rate=[0.01, 0.05],
    n_estimators=[100, 200],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 1.5],
    subsample=[0.8],
    colsample_bytree=[0.8],
)

# Base regressor; the grid search overrides the hyperparameters listed above.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)

# 3-fold cross-validated search. Scores are negated MSE, so the largest
# (closest to zero) score wins.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best XGBoost parameters found:", grid_search.best_params_)
print("Best CV score (negative MSE):", grid_search.best_score_)

# -----------------------------
# 5. Evaluation on the Holdout Set with XGBoost
# -----------------------------
# Score the tuned model on the rows kept out of the grid search. Targets and
# predictions are mapped back from log1p space first, so the RMSE is reported
# on the original SalePrice scale.
best_xgb = grid_search.best_estimator_
holdout_preds_xgb = best_xgb.predict(X_holdout)
holdout_mse_xgb = mean_squared_error(
    np.expm1(y_holdout),
    np.expm1(holdout_preds_xgb),
)
holdout_rmse_xgb = np.sqrt(holdout_mse_xgb)
print("Holdout RMSE (XGBoost):", holdout_rmse_xgb)

# -----------------------------
# 6. Refit Best XGBoost with Early Stopping
# -----------------------------
# 10% of the training rows are held out purely to monitor validation RMSE;
# the final booster is fit on the remaining 90%.
X_full_train, X_val, y_full_train, y_val = train_test_split(
    train_data, y, test_size=0.1, random_state=42
)

# Passing `eval_set` alone only logs the validation metric. Early stopping
# has to be switched on explicitly: training halts once validation RMSE has
# not improved for this many consecutive rounds, with the tuned
# `n_estimators` acting as the upper bound on boosting rounds.
best_xgb.set_params(early_stopping_rounds=20)

best_xgb.fit(
    X_full_train, y_full_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# -----------------------------
# 7. Train a Ridge Regression Model
# -----------------------------
# Linear baseline; only the regularization strength is tuned.
ridge_param_grid = {"alpha": [0.1, 1.0, 10.0]}
ridge = Ridge()

# Same search protocol as for XGBoost: 3-fold CV scored by negated MSE.
ridge_grid = GridSearchCV(
    ridge,
    ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
ridge_grid.fit(X_train, y_train)

print("Best Ridge parameters found:", ridge_grid.best_params_)
best_ridge = ridge_grid.best_estimator_

# Evaluate Ridge on the holdout set, on the original SalePrice scale
holdout_preds_ridge = best_ridge.predict(X_holdout)
holdout_mse_ridge = mean_squared_error(
    np.expm1(y_holdout),
    np.expm1(holdout_preds_ridge),
)
holdout_rmse_ridge = np.sqrt(holdout_mse_ridge)
print("Holdout RMSE (Ridge):", holdout_rmse_ridge)

# -----------------------------
# 8. Ensemble Predictions on Test Set
# -----------------------------
# Each model predicts log1p(SalePrice) for the test rows.
test_preds_xgb = best_xgb.predict(test_data)
test_preds_ridge = best_ridge.predict(test_data)

# Blend in log space with a fixed weighting that favours XGBoost (the
# weights are hand-picked and can be tuned further), then undo the log1p
# transform to get prices.
xgb_weight, ridge_weight = 0.7, 0.3
ensemble_preds = np.expm1(xgb_weight * test_preds_xgb + ridge_weight * test_preds_ridge)

# -----------------------------
# 9. Create Submission File
# -----------------------------
# Two-column table: the saved test identifiers next to the blended price
# predictions, written without the DataFrame index.
submission = pd.DataFrame({"Id": test_ID}).assign(SalePrice=ensemble_preds)
submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")


Skewed features: ['MSSubClass', 'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtHalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best XGBoost parameters found: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}
Best CV score (negative MSE): -0.01578985541269833
Holdout RMSE (XGBoost): 25268.083955857885
[0]	validation_0-rmse:0.42818
[1]	validation_0-rmse:0.41310
[2]	validation_0-rmse:0.39931
[3]	validation_0-rmse:0.38535
[4]	validation_0-rmse:0.37365
[5]	validation_0-rmse:0.35993
[6]	validation_0-rmse:0.34791
[7]	validation_0-rmse:0.33701
[8]	validation_0-rmse:0.32623
[9]	validation_0-rmse:0.31522
[10]	validation_0-rmse:0.30600
[11]	validation_0-r