# House Prices - Advanced Regression Techniques
## SCORE: .12150

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, RobustScaler, PowerTransformer
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.feature_selection import mutual_info_regression
from scipy.optimize import minimize
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

n_jobs = max(1, os.cpu_count() - 1)
print(f"Using {n_jobs} CPU cores (leaving 1 free)")

data_dir = 'house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')

print(f"Train: {train.shape}, Test: {test.shape}")

Using 11 CPU cores (leaving 1 free)
Train: (1460, 81), Test: (1459, 80)


In [2]:
train_target = train['SalePrice'].copy()
test_ids = test['Id'].copy()

train_idx = len(train)
all_data = pd.concat([train.drop('SalePrice', axis=1), test], ignore_index=True)

y_train_log = np.log1p(train_target)

In [3]:
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)

none_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
             'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
             'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

for col in none_cols:
    if col in all_data.columns:
        all_data[col].fillna('None', inplace=True)

if 'LotFrontage' in all_data.columns:
    all_data['LotFrontage'].fillna(all_data['LotFrontage'].median(), inplace=True)
if 'MasVnrType' in all_data.columns:
    all_data['MasVnrType'].fillna('None', inplace=True)
if 'MasVnrArea' in all_data.columns:
    all_data['MasVnrArea'].fillna(0, inplace=True)
if 'Electrical' in all_data.columns:
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0], inplace=True)
if 'GarageYrBlt' in all_data.columns:
    all_data['GarageYrBlt'].fillna(all_data['YearBuilt'], inplace=True)

numerical_cols = all_data.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if all_data[col].isnull().sum() > 0:
        all_data[col].fillna(0, inplace=True)

categorical_cols = all_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if all_data[col].isnull().sum() > 0:
        all_data[col].fillna(all_data[col].mode()[0], inplace=True)

skewed_features = ['MiscVal', 'PoolArea', 'LotArea', '3SsnPorch', 'LowQualFinSF', 
                   'BsmtFinSF2', 'ScreenPorch', 'EnclosedPorch', 'MasVnrArea', 
                   'OpenPorchSF', 'LotFrontage', 'BsmtFinSF1', 'WoodDeckSF']
for col in skewed_features:
    if col in all_data.columns:
        all_data[f'{col}_log'] = np.log1p(all_data[col])

In [4]:
if all(col in all_data.columns for col in ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']):
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    all_data['TotalSF_log'] = np.log1p(all_data['TotalSF'])

if all(col in all_data.columns for col in ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']):
    all_data['TotalBathrooms'] = (all_data['FullBath'] + 
                                  all_data['HalfBath'] * 0.5 + 
                                  all_data['BsmtFullBath'] + 
                                  all_data['BsmtHalfBath'] * 0.5)

if 'YrSold' in all_data.columns and 'YearBuilt' in all_data.columns:
    all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['YearsSinceRemodel'] = all_data['YrSold'] - all_data['YearRemodAdd']
    all_data['Remodeled'] = (all_data['YearBuilt'] != all_data['YearRemodAdd']).astype(int)
    if 'GarageYrBlt' in all_data.columns:
        all_data['GarageAge'] = all_data['YrSold'] - all_data['GarageYrBlt']
        all_data['GarageAge'] = all_data['GarageAge'].fillna(0)

if 'TotalBsmtSF' in all_data.columns:
    all_data['HasBasement'] = (all_data['TotalBsmtSF'] > 0).astype(int)
if 'GarageArea' in all_data.columns:
    all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype(int)
if '2ndFlrSF' in all_data.columns:
    all_data['Has2ndFloor'] = (all_data['2ndFlrSF'] > 0).astype(int)

if 'OverallQual' in all_data.columns:
    all_data['OverallQual2'] = all_data['OverallQual'] ** 2
    if 'GrLivArea' in all_data.columns:
        all_data['OverallQual_GrLivArea'] = all_data['OverallQual'] * all_data['GrLivArea']
    if 'TotalBsmtSF' in all_data.columns:
        all_data['OverallQual_TotalBsmtSF'] = all_data['OverallQual'] * all_data['TotalBsmtSF']
    if 'GarageCars' in all_data.columns:
        all_data['OverallQual_GarageCars'] = all_data['OverallQual'] * all_data['GarageCars']
    if 'OverallCond' in all_data.columns:
        all_data['OverallQual_OverallCond'] = all_data['OverallQual'] * all_data['OverallCond']

if 'GrLivArea' in all_data.columns:
    all_data['GrLivArea_log'] = np.log1p(all_data['GrLivArea'])
    if 'TotalBathrooms' in all_data.columns:
        all_data['AreaPerBath'] = all_data['GrLivArea'] / (all_data['TotalBathrooms'] + 0.1)
    if 'TotRmsAbvGrd' in all_data.columns:
        all_data['AreaPerRoom'] = all_data['GrLivArea'] / (all_data['TotRmsAbvGrd'] + 0.1)

if 'GarageCars' in all_data.columns and 'GarageArea' in all_data.columns:
    all_data['GarageAreaPerCar'] = all_data['GarageArea'] / (all_data['GarageCars'] + 0.1)

if all(col in all_data.columns for col in ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'WoodDeckSF']):
    all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + 
                                all_data['3SsnPorch'] + all_data['ScreenPorch'] + all_data['WoodDeckSF'])

if 'OverallQual' in all_data.columns and 'OverallCond' in all_data.columns:
    all_data['QualityScore'] = all_data['OverallQual'] * all_data['OverallCond']
    all_data['QualityScore2'] = all_data['QualityScore'] ** 2

if 'YearBuilt' in all_data.columns and 'YearRemodAdd' in all_data.columns:
    all_data['Remodeled'] = (all_data['YearBuilt'] != all_data['YearRemodAdd']).astype(int)
    all_data['RemodelAge'] = all_data['YrSold'] - all_data['YearRemodAdd']

if 'GrLivArea' in all_data.columns and 'TotalBsmtSF' in all_data.columns:
    all_data['GrLivArea_TotalBsmtSF'] = all_data['GrLivArea'] * all_data['TotalBsmtSF']
    all_data['GrLivArea_TotalBsmtSF_log'] = np.log1p(all_data['GrLivArea_TotalBsmtSF'])

if 'OverallQual' in all_data.columns and 'GrLivArea' in all_data.columns:
    all_data['OverallQual_GrLivArea_log'] = all_data['OverallQual'] * np.log1p(all_data['GrLivArea'])

if 'TotalSF' in all_data.columns:
    all_data['TotalSF2'] = all_data['TotalSF'] ** 2
    all_data['TotalSF_sqrt'] = np.sqrt(all_data['TotalSF'])

if 'GrLivArea' in all_data.columns:
    all_data['GrLivArea_sqrt'] = np.sqrt(all_data['GrLivArea'])
    all_data['GrLivArea_cbrt'] = np.power(all_data['GrLivArea'], 1/3)

In [5]:
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

for col in quality_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(quality_map).fillna(0).astype(int)

exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}
if 'BsmtExposure' in all_data.columns:
    all_data['BsmtExposure'] = all_data['BsmtExposure'].map(exposure_map).fillna(0).astype(int)

finish_map = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}
for col in ['BsmtFinType1', 'BsmtFinType2']:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(finish_map).fillna(0).astype(int)

functional_map = {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0}
if 'Functional' in all_data.columns:
    all_data['Functional'] = all_data['Functional'].map(functional_map).fillna(7).astype(int)

if 'Neighborhood' in all_data.columns:
    train_temp = all_data[:train_idx].copy()
    train_temp['SalePrice'] = train_target
    train_temp['SalePrice_log'] = np.log1p(train_target)
    
    neighborhood_stats = train_temp.groupby('Neighborhood')['SalePrice_log'].agg(['mean', 'std', 'count'])
    global_mean = train_target.mean()
    global_mean_log = np.log1p(global_mean)
    
    alpha = 5
    neighborhood_encoded = (neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean_log * alpha) / (neighborhood_stats['count'] + alpha)
    all_data['NeighborhoodEncoded'] = all_data['Neighborhood'].map(neighborhood_encoded.to_dict()).fillna(global_mean_log)
    all_data['NeighborhoodEncoded_log'] = all_data['NeighborhoodEncoded']
    
    all_data['NeighborhoodStd'] = all_data['Neighborhood'].map(neighborhood_stats['std'].to_dict()).fillna(train_target.std())
    all_data['NeighborhoodCount'] = all_data['Neighborhood'].map(neighborhood_stats['count'].to_dict()).fillna(0)

if 'SaleType' in all_data.columns:
    sale_type_map = {'New': 1, 'Con': 1, 'CWD': 0.8, 'ConLI': 0.7, 'WD': 0.5, 
                     'COD': 0.3, 'ConLw': 0.3, 'ConLD': 0.2, 'Oth': 0.1}
    all_data['SaleTypeValue'] = all_data['SaleType'].map(sale_type_map).fillna(0.5)

if 'SaleCondition' in all_data.columns:
    sale_cond_map = {'Partial': 1.0, 'Normal': 0.8, 'Alloca': 0.7, 'Family': 0.6, 
                     'Abnorml': 0.4, 'AdjLand': 0.2}
    all_data['SaleConditionValue'] = all_data['SaleCondition'].map(sale_cond_map).fillna(0.8)

low_importance_features = ['Utilities', 'Street']
for col in low_importance_features:
    if col in all_data.columns:
        all_data = all_data.drop(col, axis=1)

label_encoders = {}
categorical_cols = all_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    label_encoders[col] = le

In [6]:
train_processed = all_data[:train_idx].copy()
test_processed = all_data[train_idx:].copy()

train_processed = train_processed.drop('Id', axis=1)
test_processed = test_processed.drop('Id', axis=1)

outliers_grliv = train_processed[(train_processed['GrLivArea'] > 4000) & (y_train_log < 12.5)].index

Q1 = train_processed['GrLivArea'].quantile(0.25)
Q3 = train_processed['GrLivArea'].quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = train_processed[(train_processed['GrLivArea'] < (Q1 - 3 * IQR)) | 
                                (train_processed['GrLivArea'] > (Q3 + 3 * IQR))].index

z_scores = np.abs(stats.zscore(train_processed[['GrLivArea', 'TotalBsmtSF']].fillna(0)))
outliers_z = train_processed[(z_scores > 4).any(axis=1)].index

outliers = list(set(list(outliers_grliv) + list(outliers_iqr) + list(outliers_z)))
train_processed = train_processed.drop(outliers)
y_train_log = y_train_log.drop(outliers)
print(f"Removed {len(outliers)} outliers")

mi_scores = mutual_info_regression(train_processed.fillna(0), y_train_log, random_state=42)
mi_df = pd.DataFrame({
    'feature': train_processed.columns,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

rf_selector = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42, n_jobs=n_jobs)
rf_selector.fit(train_processed.fillna(0), y_train_log)
feature_importance = pd.DataFrame({
    'feature': train_processed.columns,
    'rf_importance': rf_selector.feature_importances_
})

combined_importance = pd.merge(mi_df, feature_importance, on='feature')
combined_importance['combined_score'] = (combined_importance['mi_score'] * 0.5 + 
                                      combined_importance['rf_importance'] * 0.5)
combined_importance = combined_importance.sort_values('combined_score', ascending=False)

important_features = combined_importance[combined_importance['combined_score'] > 0.0003]['feature'].tolist()
train_processed = train_processed[important_features]
test_processed = test_processed[important_features]
print(f"Selected {len(important_features)} features (from {len(combined_importance)})")


Removed 8 outliers
Selected 123 features (from 126)


In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

seeds = [42, 123, 456, 789, 2024, 999, 1337, 2023, 3141, 2718]
all_rf_predictions = []

for seed in seeds:
    rf_model = RandomForestRegressor(
        n_estimators=1500,
        max_depth=28,
        min_samples_split=3,
        min_samples_leaf=1,
        max_features='sqrt',
        random_state=seed,
        n_jobs=n_jobs
    )
    rf_model.fit(train_processed, y_train_log)
    all_rf_predictions.append(np.expm1(rf_model.predict(test_processed)))

rf_predictions = np.mean(all_rf_predictions, axis=0)
print(f"RF: {len(seeds)} seeds averaged")

RF: 10 seeds averaged


In [8]:
try:
    import xgboost as xgb
    
    all_xgb_predictions = []
    for seed in seeds:
        xgb_model = xgb.XGBRegressor(
            n_estimators=15000,
            learning_rate=0.0025,
            max_depth=7,
            min_child_weight=2,
            subsample=0.85,
            colsample_bytree=0.85,
            gamma=0.05,
            reg_alpha=0.05,
            reg_lambda=0.8,
            random_state=seed,
            n_jobs=n_jobs
        )
        xgb_model.fit(train_processed, y_train_log, verbose=False)
        all_xgb_predictions.append(np.expm1(xgb_model.predict(test_processed)))
    
    xgb_predictions = np.mean(all_xgb_predictions, axis=0)
    print(f"XGB: {len(seeds)} seeds averaged")
except ImportError:
    xgb_predictions = None

XGB: 10 seeds averaged


In [9]:
try:
    import lightgbm as lgb
    
    all_lgb_predictions = []
    for seed in seeds:
        lgb_model = lgb.LGBMRegressor(
            n_estimators=15000,
            learning_rate=0.0025,
            max_depth=7,
            num_leaves=127,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.05,
            reg_lambda=0.8,
            random_state=seed,
            n_jobs=n_jobs,
            verbose=-1
        )
        lgb_model.fit(train_processed, y_train_log)
        all_lgb_predictions.append(np.expm1(lgb_model.predict(test_processed)))
    
    lgb_predictions = np.mean(all_lgb_predictions, axis=0)
    print(f"LGB: {len(seeds)} seeds averaged")
except ImportError:
    lgb_predictions = None

cat_predictions = None
try:
    import catboost as cb
    all_cat_predictions = []
    for seed in seeds:
        cat_model = cb.CatBoostRegressor(
            iterations=15000,
            learning_rate=0.0025,
            depth=7,
            l2_leaf_reg=3,
            loss_function='RMSE',
            eval_metric='RMSE',
            random_seed=seed,
            verbose=False,
            thread_count=n_jobs
        )
        cat_model.fit(train_processed, y_train_log, verbose=False)
        all_cat_predictions.append(np.expm1(cat_model.predict(test_processed)))
    
    cat_predictions = np.mean(all_cat_predictions, axis=0)
    print(f"CAT: {len(seeds)} seeds averaged")
except Exception as e:
    print(f"CatBoost error: {type(e).__name__}: {str(e)}")
    print("Skipping CatBoost")



LGB: 10 seeds averaged
CAT: 10 seeds averaged


In [10]:
initial_predictions = (rf_predictions * 0.1 + 
                      (xgb_predictions * 0.3 if xgb_predictions is not None else rf_predictions * 0.3) +
                      (lgb_predictions * 0.3 if lgb_predictions is not None else rf_predictions * 0.3) +
                      (cat_predictions * 0.3 if cat_predictions is not None else rf_predictions * 0.3))

test_confident = np.abs(initial_predictions - np.median(initial_predictions)) < (np.std(initial_predictions) * 2.5)
confident_indices = np.where(test_confident)[0]

if len(confident_indices) > 150:
    for iteration in range(2):
        pseudo_train = test_processed.iloc[confident_indices].copy()
        pseudo_target = initial_predictions[confident_indices]
        pseudo_target_log = np.log1p(pseudo_target)
        
        train_enhanced = pd.concat([train_processed, pseudo_train], ignore_index=True)
        y_enhanced = pd.concat([pd.Series(y_train_log), pd.Series(pseudo_target_log)], ignore_index=True)
        
        print(f"Iteration {iteration+1}: Added {len(confident_indices)} pseudo-labeled samples")
        
        rf_enhanced = RandomForestRegressor(n_estimators=1500, max_depth=28, min_samples_split=3,
                                            min_samples_leaf=1, max_features='sqrt', random_state=42, n_jobs=n_jobs)
        rf_enhanced.fit(train_enhanced, y_enhanced)
        rf_predictions = np.expm1(rf_enhanced.predict(test_processed))
        
        if xgb_predictions is not None:
            try:
                import xgboost as xgb
                xgb_enhanced = xgb.XGBRegressor(n_estimators=15000, learning_rate=0.0025, max_depth=7,
                                               min_child_weight=2, subsample=0.85, colsample_bytree=0.85,
                                               gamma=0.05, reg_alpha=0.05, reg_lambda=0.8, random_state=42, n_jobs=n_jobs)
                xgb_enhanced.fit(train_enhanced, y_enhanced, verbose=False)
                xgb_predictions = np.expm1(xgb_enhanced.predict(test_processed))
            except:
                pass
        
        if lgb_predictions is not None:
            try:
                import lightgbm as lgb
                lgb_enhanced = lgb.LGBMRegressor(n_estimators=15000, learning_rate=0.0025, max_depth=7,
                                                num_leaves=127, subsample=0.85, colsample_bytree=0.85,
                                                reg_alpha=0.05, reg_lambda=0.8, random_state=42, n_jobs=n_jobs, verbose=-1)
                lgb_enhanced.fit(train_enhanced, y_enhanced)
                lgb_predictions = np.expm1(lgb_enhanced.predict(test_processed))
            except:
                pass
        
        if cat_predictions is not None:
            try:
                import catboost as cb
                cat_enhanced = cb.CatBoostRegressor(iterations=15000, learning_rate=0.0025, depth=7,
                                                   l2_leaf_reg=3, loss_function='RMSE', eval_metric='RMSE',
                                                   random_seed=42, verbose=False, thread_count=n_jobs)
                cat_enhanced.fit(train_enhanced, y_enhanced, verbose=False)
                cat_predictions = np.expm1(cat_enhanced.predict(test_processed))
            except Exception as e:
                print(f"CatBoost pseudo-labeling error: {type(e).__name__}: {str(e)}")
        
        if iteration < 1:
            updated_predictions = (rf_predictions * 0.1 + 
                                  (xgb_predictions * 0.3 if xgb_predictions is not None else rf_predictions * 0.3) +
                                  (lgb_predictions * 0.3 if lgb_predictions is not None else rf_predictions * 0.3) +
                                  (cat_predictions * 0.3 if cat_predictions is not None else rf_predictions * 0.3))
            test_confident = np.abs(updated_predictions - np.median(updated_predictions)) < (np.std(updated_predictions) * 2.3)
            confident_indices = np.where(test_confident)[0]
            initial_predictions = updated_predictions
else:
    print("Not enough confident predictions for pseudo-labeling")


Iteration 1: Added 1393 pseudo-labeled samples
Iteration 2: Added 1377 pseudo-labeled samples


In [11]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(train_processed)
X_test_scaled = scaler.transform(test_processed)

all_ridge_predictions = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge_model = Ridge(alpha=alpha, random_state=42)
    ridge_model.fit(X_train_scaled, y_train_log)
    all_ridge_predictions.append(np.expm1(ridge_model.predict(X_test_scaled)))

all_elastic_predictions = []
for alpha in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for l1_ratio in [0.3, 0.5, 0.7]:
        elastic_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42, max_iter=2000)
        elastic_model.fit(X_train_scaled, y_train_log)
        all_elastic_predictions.append(np.expm1(elastic_model.predict(X_test_scaled)))

all_gbr_predictions = []
for seed in seeds[:5]:
    gbr_model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=5,
                                          random_state=seed, subsample=0.8)
    gbr_model.fit(X_train_scaled, y_train_log)
    all_gbr_predictions.append(np.expm1(gbr_model.predict(X_test_scaled)))

ridge_predictions = np.mean(all_ridge_predictions, axis=0)
elastic_predictions = np.mean(all_elastic_predictions, axis=0)
gbr_predictions = np.mean(all_gbr_predictions, axis=0)
print(f"Ridge: {len(all_ridge_predictions)} alphas averaged")
print(f"ElasticNet: {len(all_elastic_predictions)} configs averaged")
print(f"GBR: {len(all_gbr_predictions)} seeds averaged")

oof_predictions = np.zeros((len(train_processed), 7))
test_predictions = np.zeros((len(test_processed), 7))

for fold, (train_idx_fold, val_idx_fold) in enumerate(kf.split(train_processed)):
    X_train_fold = train_processed.iloc[train_idx_fold]
    X_val_fold = train_processed.iloc[val_idx_fold]
    y_train_fold = y_train_log.iloc[train_idx_fold]
    y_val_fold = y_train_log.iloc[val_idx_fold]
    
    scaler_fold = RobustScaler()
    X_train_scaled_fold = scaler_fold.fit_transform(X_train_fold)
    X_val_scaled_fold = scaler_fold.transform(X_val_fold)
    X_test_scaled_fold = scaler_fold.transform(test_processed)
    
    rf_fold = RandomForestRegressor(n_estimators=1500, max_depth=28, min_samples_split=3,
                                    min_samples_leaf=1, max_features='sqrt', random_state=42, n_jobs=n_jobs)
    rf_fold.fit(X_train_fold, y_train_fold)
    oof_predictions[val_idx_fold, 0] = rf_fold.predict(X_val_fold)
    test_predictions[:, 0] += np.expm1(rf_fold.predict(test_processed)) / kf.n_splits
    
    if xgb_predictions is not None:
        try:
            import xgboost as xgb
            xgb_fold = xgb.XGBRegressor(n_estimators=15000, learning_rate=0.0025, max_depth=7,
                                        min_child_weight=2, subsample=0.85, colsample_bytree=0.85,
                                        gamma=0.05, reg_alpha=0.05, reg_lambda=0.8, random_state=42, n_jobs=n_jobs)
            xgb_fold.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)
            oof_predictions[val_idx_fold, 1] = xgb_fold.predict(X_val_fold)
            test_predictions[:, 1] += np.expm1(xgb_fold.predict(test_processed)) / kf.n_splits
        except:
            pass
    
    if lgb_predictions is not None:
        try:
            import lightgbm as lgb
            lgb_fold = lgb.LGBMRegressor(n_estimators=15000, learning_rate=0.0025, max_depth=7,
                                        num_leaves=127, subsample=0.85, colsample_bytree=0.85,
                                        reg_alpha=0.05, reg_lambda=0.8, random_state=42, n_jobs=n_jobs, verbose=-1)
            lgb_fold.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], 
                        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
            oof_predictions[val_idx_fold, 2] = lgb_fold.predict(X_val_fold)
            test_predictions[:, 2] += np.expm1(lgb_fold.predict(test_processed)) / kf.n_splits
        except:
            pass
    
    ridge_fold = Ridge(alpha=10.0, random_state=42)
    ridge_fold.fit(X_train_scaled_fold, y_train_fold)
    oof_predictions[val_idx_fold, 3] = ridge_fold.predict(X_val_scaled_fold)
    test_predictions[:, 3] += np.expm1(ridge_fold.predict(X_test_scaled_fold)) / kf.n_splits
    
    elastic_fold = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42, max_iter=2000)
    elastic_fold.fit(X_train_scaled_fold, y_train_fold)
    oof_predictions[val_idx_fold, 4] = elastic_fold.predict(X_val_scaled_fold)
    test_predictions[:, 4] += np.expm1(elastic_fold.predict(X_test_scaled_fold)) / kf.n_splits
    
    if cat_predictions is not None:
        try:
            import catboost as cb
            cat_fold = cb.CatBoostRegressor(iterations=15000, learning_rate=0.0025, depth=7,
                                           l2_leaf_reg=3, loss_function='RMSE', eval_metric='RMSE',
                                           random_seed=42, verbose=False, thread_count=n_jobs)
            cat_fold.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
            oof_predictions[val_idx_fold, 5] = cat_fold.predict(X_val_fold)
            test_predictions[:, 5] += np.expm1(cat_fold.predict(test_processed)) / kf.n_splits
        except Exception as e:
            if fold == 0:
                print(f"CatBoost stacking error: {type(e).__name__}: {str(e)}")
    
    gbr_fold = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=5,
                                        random_state=42, subsample=0.8)
    gbr_fold.fit(X_train_scaled_fold, y_train_fold)
    oof_predictions[val_idx_fold, 6] = gbr_fold.predict(X_val_scaled_fold)
    test_predictions[:, 6] += np.expm1(gbr_fold.predict(X_test_scaled_fold)) / kf.n_splits

valid_cols = [i for i in range(7) if oof_predictions[:, i].sum() != 0]
oof_stack = oof_predictions[:, valid_cols]
test_stack = test_predictions[:, valid_cols]

test_stack_log = np.log1p(test_stack)

try:
    import xgboost as xgb
    meta_model = xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=3,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=n_jobs
    )
    meta_model.fit(oof_stack, y_train_log)
    meta_rmse = np.sqrt(np.mean((meta_model.predict(oof_stack) - y_train_log) ** 2))
    print(f"Non-linear stacking (XGB) OOF RMSE: {meta_rmse:.4f}")
    final_predictions = np.expm1(meta_model.predict(test_stack_log))
except:
    def objective(weights):
        weights = np.array(weights)
        weights = weights / weights.sum()
        blend = np.dot(oof_stack, weights)
        return np.sqrt(np.mean((blend - y_train_log) ** 2))
    
    initial_weights = np.ones(len(valid_cols)) / len(valid_cols)
    bounds = [(0, 1) for _ in range(len(valid_cols))]
    result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, 
                      constraints={'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    optimal_weights = result.x / result.x.sum()
    print(f"Optimal weights: {optimal_weights}")
    print(f"OOF RMSE with optimal weights: {result.fun:.4f}")
    final_predictions = np.expm1(np.dot(test_stack_log, optimal_weights))

Ridge: 5 alphas averaged
ElasticNet: 15 configs averaged
GBR: 5 seeds averaged
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[5682]	valid_0's l2: 0.0158028
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3864]	valid_0's l2: 0.0154766
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[5750]	valid_0's l2: 0.0194279
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[4217]	valid_0's l2: 0.0149986
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3387]	valid_0's l2: 0.0114392
Non-linear stacking (XGB) OOF RMSE: 0.0996


In [12]:
predictions_for_geom = [rf_predictions]
if xgb_predictions is not None:
    predictions_for_geom.append(xgb_predictions)
if lgb_predictions is not None:
    predictions_for_geom.append(lgb_predictions)
if cat_predictions is not None:
    predictions_for_geom.append(cat_predictions)
if 'gbr_predictions' in locals():
    predictions_for_geom.append(gbr_predictions)

geometric_mean = np.exp(np.mean([np.log(pred + 1) for pred in predictions_for_geom], axis=0)) - 1

simple_weighted = (rf_predictions * 0.08 + 
                  (xgb_predictions * 0.25 if xgb_predictions is not None else rf_predictions * 0.25) +
                  (lgb_predictions * 0.25 if lgb_predictions is not None else rf_predictions * 0.25) +
                  (cat_predictions * 0.25 if cat_predictions is not None else rf_predictions * 0.25) +
                  (gbr_predictions * 0.17 if 'gbr_predictions' in locals() else rf_predictions * 0.17))

median_pred = np.median([rf_predictions, 
                         xgb_predictions if xgb_predictions is not None else rf_predictions,
                         lgb_predictions if lgb_predictions is not None else rf_predictions,
                         cat_predictions if cat_predictions is not None else rf_predictions,
                         gbr_predictions if 'gbr_predictions' in locals() else rf_predictions], axis=0)

final_blend = 0.50 * final_predictions + 0.25 * geometric_mean + 0.15 * simple_weighted + 0.10 * median_pred

submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': final_blend.clip(min=0)
})
submission.to_csv('submission.csv', index=False)
print("Submission saved: submission.csv")
print(f"Final predictions range: {final_blend.min():.2f} - {final_blend.max():.2f}")

Submission saved: submission.csv
Final predictions range: 49816.53 - 474660.44
