# Final Evaluation/Submission - House Pricing

This notebook:
 - trains the best models on the full training data
 - makes predictions on the test set
 - reverses the log transformation of the predictions (`expm1`, the inverse of `log1p`)
 - saves the predictions as Kaggle submission files

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
def reverse_log(preds_log):
    """Undo a ``log1p`` transform so predictions are back on the original scale.

    Parameters
    ----------
    preds_log : float or array-like
        Values on the ``log1p`` scale (e.g. the output of ``model.predict``).

    Returns
    -------
    float or numpy.ndarray
        ``exp(preds_log) - 1``, computed with ``np.expm1``.
    """
    # Single definition: the earlier duplicate lambda silently shadowed this
    # function and dropped its name and documentation.
    return np.expm1(preds_log)


## Load Training Data and Preprocess Test Data

In [2]:
# Processed training data: its feature columns define what every model expects.
train_df = pd.read_csv('../data/processed_train.csv')
X_train = train_df.drop('SalePrice', axis=1)
y_train = train_df['SalePrice']

# Load the original test set once and keep the Ids for the submission files.
test_raw = pd.read_csv('../data/test.csv')
test_ids = test_raw['Id'].copy()

# Drop same columns removed in cleaning
cols_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
test_raw = test_raw.drop(columns=cols_to_drop)

# Fill missing values: categoricals get the literal 'None', numerics the median.
cat_cols = test_raw.select_dtypes(include='object').columns
test_raw[cat_cols] = test_raw[cat_cols].fillna('None')

# NOTE(review): the medians are computed from the test set itself; confirm this
# matches how the training data was imputed.
num_cols = test_raw.select_dtypes(include=np.number).columns
test_raw[num_cols] = test_raw[num_cols].fillna(test_raw[num_cols].median())

# Log-transform skewed features (only those actually present in the frame)
skewed_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'MiscVal'
]
present_skewed = [feature for feature in skewed_features if feature in test_raw.columns]
test_raw[present_skewed] = np.log1p(test_raw[present_skewed])

# One-hot encode and align.
# Encode *every* level here and let reindex() pick the training columns.
# With drop_first=True the dropped level is the first one found in the test
# data, which may be a level the training columns kept; that column would then
# be filled with zeros instead of the real indicator.
test_encoded = pd.get_dummies(test_raw)
X_test = test_encoded.reindex(columns=X_train.columns, fill_value=0)


## Ridge Model (trained on full training data)

In [3]:
# Fit the ridge regressor on the full training set (fit() returns the estimator).
ridge = Ridge(alpha=10).fit(X_train, y_train)

# Predict on the aligned test matrix, then undo the log transform.
ridge_log_preds = ridge.predict(X_test)
ridge_preds = reverse_log(ridge_log_preds)

# Build the Id / SalePrice table and write it out as the submission file.
submission_ridge = pd.DataFrame({'Id': test_ids}).assign(SalePrice=ridge_preds)
submission_ridge.to_csv('../data/submission_ridge.csv', index=False)
print("✅ Ridge submission saved.")


✅ Ridge submission saved.


## Gradient Boosting Model

In [4]:
# Hyperparameters for the gradient boosting model, kept together for readability.
gbr_params = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train, y_train)

# Predict on the test matrix and map the predictions back through reverse_log.
gbr_log_preds = gbr.predict(X_test)
gbr_preds = reverse_log(gbr_log_preds)

# Two-column submission table: Id first, then the predicted SalePrice.
submission_gbr = pd.DataFrame({'Id': test_ids}).assign(SalePrice=gbr_preds)
submission_gbr.to_csv('../data/submission_gbr.csv', index=False)
print("✅ Gradient Boosting submission saved.")


✅ Gradient Boosting submission saved.


## Decision Tree Model

In [5]:
# Depth- and leaf-size-limited decision tree, fitted on the full training set.
tree = DecisionTreeRegressor(
    max_depth=5,
    min_samples_leaf=3,
    random_state=42,
).fit(X_train, y_train)

# Predict, then undo the log transform via reverse_log.
tree_log_preds = tree.predict(X_test)
tree_preds = reverse_log(tree_log_preds)

# Write the Id / SalePrice pairs to the submission file.
submission_tree = pd.DataFrame({'Id': test_ids}).assign(SalePrice=tree_preds)
submission_tree.to_csv('../data/submission_tree.csv', index=False)
print("✅ Decision Tree submission saved.")


✅ Decision Tree submission saved.


In [6]:
## XGBoost

In [7]:
# XGBoost hyperparameters, collected in one place.
xgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 3,
    'random_state': 42,
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predict on the test matrix and map the predictions back through reverse_log.
xgb_log_preds = xgb.predict(X_test)
xgb_preds = reverse_log(xgb_log_preds)

# Assemble and save the submission table (Id, SalePrice).
submission_xgb = pd.DataFrame({'Id': test_ids}).assign(SalePrice=xgb_preds)
submission_xgb.to_csv('../data/submission_xgb.csv', index=False)

## LightGBM

In [8]:
# LightGBM hyperparameters, collected in one place.
lgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_gain_to_split': 0.0,
    'random_state': 42,
}
lgb = LGBMRegressor(**lgb_params)
lgb.fit(X_train, y_train)

# Predict on the test matrix and map the predictions back through reverse_log.
lgb_log_preds = lgb.predict(X_test)
lgb_preds = reverse_log(lgb_log_preds)

# Assemble and save the submission table (Id, SalePrice).
submission_lgb = pd.DataFrame({'Id': test_ids}).assign(SalePrice=lgb_preds)
submission_lgb.to_csv('../data/submission_lgb.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3632
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 152
[LightGBM] [Info] Start training from score 12.024015


## Final Summary