In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [None]:
# Load data
# Read the competition CSVs and keep a copy of the test Ids, which the
# submission cell needs after "Id" has been dropped from the feature frames.

TRAIN_CSV = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
TEST_CSV = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))
test_ids = test_df["Id"].copy()

In [None]:
# Feature engineering

def add_engineered_features(df):
    """Add area, bathroom, age and quality-interaction columns to ``df`` in place.

    New columns (in insertion order): TotalSF, TotalBath, HouseAge, RemodAge,
    GrLivArea_Qual, TotalSF_Qual, LotArea_Qual, GarageCars_SF.

    Missing values in the square-footage and bathroom inputs are treated as 0.
    The remaining inputs are used as-is, so NaNs there propagate into the
    derived column.

    Returns the same DataFrame object to allow chaining.
    """
    # Total floor area across basement, first and second floor.
    df["TotalSF"] = (
        df["TotalBsmtSF"].fillna(0) + df["1stFlrSF"].fillna(0) + df["2ndFlrSF"].fillna(0)
    )
    # Half baths count as 0.5 of a full bath.
    df["TotalBath"] = (
        df["FullBath"].fillna(0) + 0.5 * df["HalfBath"].fillna(0) +
        df["BsmtFullBath"].fillna(0) + 0.5 * df["BsmtHalfBath"].fillna(0)
    )
    # Ages are measured at the time of sale.
    df["HouseAge"] = df["YrSold"] - df["YearBuilt"]
    df["RemodAge"] = df["YrSold"] - df["YearRemodAdd"]

    # Size x quality interactions.
    df["GrLivArea_Qual"] = df["GrLivArea"] * df["OverallQual"]
    df["TotalSF_Qual"]   = df["TotalSF"]   * df["OverallQual"]
    df["LotArea_Qual"]   = df["LotArea"]   * df["OverallQual"]
    df["GarageCars_SF"]  = df["GarageCars"] * df["TotalSF"]
    return df


for frame in (train_df, test_df):
    add_engineered_features(frame)

# Drop Id. errors="ignore" keeps this cell re-runnable: on a second execution
# "Id" is already gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns=["Id"], errors="ignore")
test_df  = test_df.drop(columns=["Id"], errors="ignore")

In [None]:
# Missing values
# Numeric gaps are filled with the *training* median of the column (also for
# the test frame); object-typed gaps get the literal category "Missing".

train_medians = train_df.select_dtypes(include=["float64", "int64"]).median()


def _fill_missing(frame, medians):
    """Impute ``frame`` in place: numeric -> medians[col], object -> "Missing"."""
    for name in frame.select_dtypes(include=["float64", "int64"]).columns:
        frame[name] = frame[name].fillna(medians[name])
    for name in frame.select_dtypes(include=["object"]).columns:
        frame[name] = frame[name].fillna("Missing")
    return frame


train_df = _fill_missing(train_df, train_medians)
test_df = _fill_missing(test_df, train_medians)

In [None]:
# Features and target

X_df = train_df.drop(columns=["SalePrice"])
y    = np.log1p(train_df["SalePrice"])  # target is modelled in log1p space

num_cols = X_df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_df.select_dtypes(include=["object"]).columns

num_pipe = Pipeline(steps=[('scaler', StandardScaler())])
cat_pipe = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

# sparse_threshold=0.0 makes the ColumnTransformer always return a dense
# ndarray. With the default threshold the result is sparse only when the
# stacked matrix is sparse enough, so an unconditional `.toarray()` afterwards
# fails with AttributeError whenever the output happens to come back dense.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ],
    sparse_threshold=0.0,
)

# Fit on train only, then apply the fitted transform to test.
X_all   = preprocessor.fit_transform(X_df)
X_test  = preprocessor.transform(test_df)

# Get feature names for pruning
feat_names = preprocessor.get_feature_names_out()

# Sanity check: train/test matrices and the name vector must line up column-wise.
assert X_all.shape[1] == X_test.shape[1] == len(feat_names), "feature matrices are misaligned"


# Feature importance pruning (LightGBM on full train)
#
# NOTE(review): the probe is fitted on *all* training rows, i.e. including the
# rows that later serve as validation folds, so the CV scores computed on the
# pruned matrix are mildly optimistic.

MIN_FEATURES_KEPT = 300        # never keep fewer columns than this (if available)
CUM_IMPORTANCE_TARGET = 0.95   # share of total importance the kept set should reach

lgb_probe = lgb.LGBMRegressor(
    n_estimators=2000, learning_rate=0.05,
    num_leaves=31, subsample=0.7, colsample_bytree=0.7,
    random_state=42
)
lgb_probe.fit(X_all, y)

# importance_type is not set on the probe, so the library default applies
# (presumably split counts rather than gain -- confirm against the LightGBM docs).
importances = lgb_probe.feature_importances_.astype(float)
importances = importances / (importances.sum() + 1e-12)  # epsilon guards an all-zero vector

# Rank features from most to least important
idx_sorted = np.argsort(importances)[::-1]
cum_imp = np.cumsum(importances[idx_sorted])

# Choose K: the larger of the fixed floor and the count needed to reach the
# cumulative-importance target, capped at the number of available features.
n_features = len(idx_sorted)
K_floor = min(MIN_FEATURES_KEPT, n_features)
K_cum = min(int(np.searchsorted(cum_imp, CUM_IMPORTANCE_TARGET) + 1), n_features)
K = min(max(K_floor, K_cum), n_features)

keep_idx = idx_sorted[:K]
keep_idx_sorted = np.sort(keep_idx)  # restore original column order

X_all_pruned  = X_all[:, keep_idx_sorted]
X_test_pruned = X_test[:, keep_idx_sorted]
feat_names_pruned = feat_names[keep_idx_sorted]

print(
    f"Kept top {K} of {len(feat_names)} features by LightGBM importance "
    f"(floor={K_floor}; {K_cum} features reach {CUM_IMPORTANCE_TARGET:.0%} cumulative importance)."
)

In [None]:
# Model

def build_nn(input_dim, seed=None):
    """Build and compile the dense regression network used as one base model.

    Architecture: Input(input_dim) -> Dense(512) -> BatchNorm -> Dropout(0.3)
    -> Dense(256) -> BatchNorm -> Dropout(0.25) -> Dense(128) -> Dropout(0.2)
    -> Dense(1). Hidden layers use ReLU and an L2 kernel penalty of 5e-4.

    Args:
        input_dim: number of input features (columns of the design matrix).
        seed: if not None, passed to ``tf.keras.utils.set_random_seed`` before
            any layer is created.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam, lr=1e-3, Huber loss
        with delta=1.0) that outputs one value per row (the log-price).
    """
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)
    l2 = tf.keras.regularizers.l2(5e-4)
    model = tf.keras.Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to the
        # first Dense layer, which Keras warns against for Sequential models.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=l2),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=l2),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.25),

        tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Dense(1)  # regression (log-price)
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.Huber(delta=1.0)
    )
    return model

# Hyper-parameters for the three gradient-boosting base models. All of them use
# a large iteration budget; the stopping rules are either part of the dict
# (XGBoost, CatBoost) or supplied where the model is fitted.

# XGBoost: early stopping is configured on the estimator itself.
xgb_params = {
    "n_estimators": 5000,
    "learning_rate": 0.03,
    "max_depth": 4,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "random_state": 42,
    "early_stopping_rounds": 200,
}

# LightGBM: no stopping rule in the dict.
lgb_params = {
    "n_estimators": 5000,
    "learning_rate": 0.03,
    "num_leaves": 32,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "random_state": 42,
}

# CatBoost: iteration-based overfitting detector with a wait of 200 rounds.
cat_params = {
    "iterations": 5000,
    "learning_rate": 0.03,
    "depth": 6,
    "loss_function": "RMSE",
    "random_seed": 42,
    "verbose": False,
    "od_type": "Iter",
    "od_wait": 200,
}

In [None]:
# 10-fold cross-validation of all four base models.
#
# The whole loop has to live in ONE cell: the XGBoost / LightGBM / CatBoost
# sections are part of the `for fold ...` body. Split into separate cells they
# either fail with IndentationError or (if dedented) only run for the last
# fold, leaving most of oof_xgb / oof_lgb / oof_cat at zero.

N_SPLITS = 10

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
n = X_all_pruned.shape[0]
m = X_test_pruned.shape[0]

# Out-of-fold predictions: one slot per training row, filled fold by fold.
oof_nn  = np.zeros(n)
oof_xgb = np.zeros(n)
oof_lgb = np.zeros(n)
oof_cat = np.zeros(n)

# Test predictions: one column per fold, averaged across columns later.
test_pred_nn  = np.zeros((m, N_SPLITS))
test_pred_xgb = np.zeros((m, N_SPLITS))
test_pred_lgb = np.zeros((m, N_SPLITS))
test_pred_cat = np.zeros((m, N_SPLITS))

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_all_pruned), start=1):
    print(f"\n===== FOLD {fold}/{N_SPLITS} =====")
    X_tr, X_va = X_all_pruned[tr_idx], X_all_pruned[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # --- Neural net: average of two differently seeded runs ---
    # Both callbacks monitor val_loss on (X_va, y_va), i.e. the same rows that
    # are then stored as this fold's out-of-fold predictions.
    preds_va_bag = []
    preds_te_bag = []
    for seed in [fold*13, fold*29]:
        nn = build_nn(X_tr.shape[1], seed=seed)
        cb_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
        cb_rlr   = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=8, min_lr=1e-6)
        nn.fit(X_tr, y_tr, validation_data=(X_va, y_va),
               epochs=500, batch_size=32, verbose=0, callbacks=[cb_early, cb_rlr])
        preds_va_bag.append(nn.predict(X_va, verbose=0).flatten())
        preds_te_bag.append(nn.predict(X_test_pruned, verbose=0).flatten())

    va_nn  = np.mean(preds_va_bag, axis=0)
    te_nn  = np.mean(preds_te_bag, axis=0)
    oof_nn[va_idx] = va_nn
    test_pred_nn[:, fold-1] = te_nn

    # --- XGBoost (early stopping configured in xgb_params) ---
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    oof_xgb[va_idx] = xgb_model.predict(X_va)
    test_pred_xgb[:, fold-1] = xgb_model.predict(X_test_pruned)

    # --- LightGBM (early stopping supplied as a callback) ---
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)],
                  callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
    oof_lgb[va_idx] = lgb_model.predict(X_va)
    test_pred_lgb[:, fold-1] = lgb_model.predict(X_test_pruned)

    # --- CatBoost (overfitting detector configured in cat_params) ---
    cat = CatBoostRegressor(**cat_params)
    cat.fit(X_tr, y_tr, eval_set=(X_va, y_va), verbose=False)
    oof_cat[va_idx] = cat.predict(X_va)
    test_pred_cat[:, fold-1] = cat.predict(X_test_pruned)

In [None]:
# OOF RMSE diagnostics
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally sized 1-D inputs.

    Computed directly with NumPy so the result does not depend on the installed
    scikit-learn version (the ``squared=False`` flag of ``mean_squared_error``
    is deprecated and absent from newer releases).

    Args:
        y_true: array-like of reference values (list, ndarray, Series, ...).
        y_pred: array-like of predictions with the same number of elements.

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: if the inputs are empty, differ in size, or contain
            NaN/inf values.
    """
    true_arr = np.asarray(y_true, dtype=float).ravel()
    pred_arr = np.asarray(y_pred, dtype=float).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred differ in size: {true_arr.size} vs {pred_arr.size}"
        )
    if true_arr.size == 0:
        raise ValueError("rmse() needs at least one sample")
    if not (np.isfinite(true_arr).all() and np.isfinite(pred_arr).all()):
        raise ValueError("rmse() inputs contain NaN or infinite values")
    return float(np.sqrt(np.mean((true_arr - pred_arr) ** 2)))

# Score every base model's out-of-fold predictions against the log1p target.
rmse_nn, rmse_xgb, rmse_lgb, rmse_cat = (
    rmse(y, oof_preds) for oof_preds in (oof_nn, oof_xgb, oof_lgb, oof_cat)
)

print("\nOOF RMSE (log-space):")
for label, score in (
    ("NN:   ", rmse_nn),
    ("XGB:  ", rmse_xgb),
    ("LGBM: ", rmse_lgb),
    ("CAT:  ", rmse_cat),
):
    print(f"{label}{score:.6f}")

In [None]:
# Meta-model stacking (Ridge)
# Level-2 features are the four base models' predictions: out-of-fold values
# for training rows, fold-averaged values for test rows.

base_oof_preds = [oof_nn, oof_xgb, oof_lgb, oof_cat]
base_test_preds = [
    fold_preds.mean(axis=1)
    for fold_preds in (test_pred_nn, test_pred_xgb, test_pred_lgb, test_pred_cat)
]

stack_X = np.vstack(base_oof_preds).T    # (n_train, 4)
stack_T = np.vstack(base_test_preds).T   # (n_test, 4)

meta = Ridge(alpha=1.0, random_state=42)
meta.fit(stack_X, y)

# NOTE: the meta-model is scored on the same rows it was fitted on, so this
# number is in-sample with respect to the Ridge coefficients.
oof_stack = meta.predict(stack_X)
stack_rmse = rmse(y, oof_stack)
test_stack_log = meta.predict(stack_T)

print(f"\nOOF RMSE (Stacked Ridge): {stack_rmse:.6f}")

In [None]:
# Optimized convex blending (non-negative weights sum to 1)

models_oof = np.vstack([oof_nn, oof_xgb, oof_lgb, oof_cat])  # shape (4, n)
models_test = np.vstack([
    test_pred_nn.mean(axis=1),
    test_pred_xgb.mean(axis=1),
    test_pred_lgb.mean(axis=1),
    test_pred_cat.mean(axis=1)
])  # shape (4, m)


def search_blend_weights(preds, target, step=0.05):
    """Grid-search convex blend weights that minimise RMSE against ``target``.

    Every weight vector whose entries are non-negative multiples of ``step``
    and sum to exactly 1 is evaluated in one vectorised pass. Candidates are
    enumerated on an integer grid, which avoids the floating-point drift of
    summing ``step`` repeatedly and makes clamping/renormalising unnecessary.

    Args:
        preds: array of shape (n_models, n_samples), one row per model.
        target: array-like of shape (n_samples,).
        step: grid resolution; must divide 1 evenly (e.g. 0.05, 0.1, 0.25).

    Returns:
        (weights, score): the best weight vector (ndarray of length n_models)
        and its RMSE as a float. Ties go to the candidate that comes first in
        lexicographic order of the leading weights.

    Raises:
        ValueError: if ``step`` does not divide 1, shapes disagree, or no
            candidate yields a finite score (e.g. NaNs in the predictions).
    """
    from itertools import product  # local import: only needed by this helper

    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float).ravel()
    if preds.ndim != 2 or preds.shape[1] != target.size:
        raise ValueError("preds must have shape (n_models, n_samples) matching target")

    n_steps = int(round(1.0 / step))
    if n_steps < 1 or abs(n_steps * step - 1.0) > 1e-9:
        raise ValueError(f"step={step} does not divide 1 evenly")

    n_models = preds.shape[0]
    # Free choice for the first n_models-1 weights; the last one takes the rest.
    counts = [
        head + (n_steps - sum(head),)
        for head in product(range(n_steps + 1), repeat=n_models - 1)
        if sum(head) <= n_steps
    ]
    grid = np.array(counts, dtype=float) / n_steps       # (n_candidates, n_models)

    blended = grid @ preds                               # (n_candidates, n_samples)
    scores = np.sqrt(np.mean((blended - target) ** 2, axis=1))
    if not np.isfinite(scores).any():
        raise ValueError("no blend produced a finite RMSE; check predictions for NaN/inf")

    best = int(np.nanargmin(scores))
    return grid[best], float(scores[best])


step = 0.05
best_w, best_rmse = search_blend_weights(models_oof, y, step=step)

print(f"\nBest OOF RMSE (Optimized Blend): {best_rmse:.6f} with weights [NN, XGB, LGB, CAT] = {best_w}")

blend_test_log = (best_w[:, None] * models_test).sum(axis=0)

In [None]:
# Choose between Stacking vs Optimized Blend (lower OOF RMSE wins)
# A tie is resolved in favour of the stacked model.

use_stack = stack_rmse <= best_rmse
if use_stack:
    final_log, choice = test_stack_log, "STACKED (Ridge meta-model)"
else:
    final_log, choice = blend_test_log, "OPTIMIZED BLEND (grid search)"
print(f"\nChosen finalizer: {choice}")

In [None]:
# Back-transform & save submission
# Predictions live in log1p space, so expm1 maps them back to prices.

final_pred = np.expm1(final_log)

submission = pd.DataFrame({"Id": test_ids}).assign(SalePrice=final_pred)
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



--- Fold 1 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4587
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 171
[LightGBM] [Info] Start training from score 12.023838
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[268]	valid_0's l2: 0.00931374

--- Fold 2 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4591
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 171
[LightGBM] [Info] Start training from score 12.025222
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[431]	valid_0's l2: 0.015207

--- Fold 3 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4582
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 171
[LightGBM] [Info] Start training from score 12.024459
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[215]	valid_0's l2: 0.0234593

--- Fold 4 ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 