In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

from preprocessing_transformers import *

pd.set_option('display.max_columns', 100)

# 1. Import the DFs and Preprocess

I imported the DataFrames created during `1_preprocessing`.

In [None]:
# Load the two splits produced by the preprocessing notebook in one pass.
train_train_df, train_test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_train.csv', 'train_test.csv')
)

In [None]:
# Features: every column except the target, copied into independent frames.
X_train = train_train_df.drop('SalePrice', axis=1).copy()
X_test = train_test_df.drop('SalePrice', axis=1).copy()

# Targets: the models are trained and scored on the natural log of SalePrice.
y_train = np.log(train_train_df['SalePrice'])
y_test = np.log(train_test_df['SalePrice'])

In [None]:
# Load the preprocessing pipeline that was persisted with joblib.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load artifacts you trust.
# Presumably the classes from `preprocessing_transformers` must be importable for this to succeed - TODO confirm.
preprocessing_pipeline = joblib.load('preprocessing_pipeline.joblib')

In [None]:
# Fit the preprocessing steps on the training split only, then apply the fitted
# transformation to the hold-out split so nothing is learned from hold-out rows.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)



This warning occurred because the test set contained some values in column [11] that were not present in the training set. The split was performed in `1_preprocessing.ipynb`.

# 2. Feature selection

## Check initial performance

I used `Linear Regression` and `Decision Tree` to see how the *RMSE* performed before and after **Feature Selection**.

In [None]:
# Baseline hold-out RMSE of two simple regressors on the full preprocessed feature set.
for model in (LinearRegression(), DecisionTreeRegressor(random_state=42)):
    model.fit(X_train_preprocessed, y_train)
    y_pred = model.predict(X_test_preprocessed)
    print(root_mean_squared_error(y_test, y_pred))

0.1668506975392772
0.19826424727886635


## RFECV

In [None]:
# Shuffled 10-fold splitter with a fixed seed; later cross-validation cells reuse it.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Recursive feature elimination driven by a random forest, dropping one feature
# per step and scoring each subset with cross-validated (negated) RMSE.
model = RandomForestRegressor(n_jobs=-1, random_state=42)
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Chain preprocessing and feature selection so both are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing_pipeline),
        ('feature_selection', rfecv),
    ]
)

In [None]:
# Fit preprocessing followed by RFECV on the raw training features.
# RFECV was built with verbose=1, so it logs every elimination step below.
pipeline.fit(X_train, y_train)

Fitting estimator with 243 features.
Fitting estimator with 242 features.
Fitting estimator with 241 features.
Fitting estimator with 240 features.
Fitting estimator with 239 features.
Fitting estimator with 238 features.
Fitting estimator with 237 features.
Fitting estimator with 236 features.
Fitting estimator with 235 features.
Fitting estimator with 234 features.
Fitting estimator with 233 features.
Fitting estimator with 232 features.
Fitting estimator with 231 features.
Fitting estimator with 230 features.
Fitting estimator with 229 features.
Fitting estimator with 228 features.
Fitting estimator with 227 features.
Fitting estimator with 226 features.
Fitting estimator with 225 features.
Fitting estimator with 224 features.
Fitting estimator with 223 features.
Fitting estimator with 222 features.
Fitting estimator with 221 features.
Fitting estimator with 220 features.
Fitting estimator with 219 features.
Fitting estimator with 218 features.
Fitting estimator with 217 features.
F

In [None]:
# Best mean cross-validated score over all evaluated feature counts.
# The scorer is the negated RMSE, so values closer to zero are better.
pipeline['feature_selection'].cv_results_['mean_test_score'].max()

np.float64(-0.13698067766333638)

In [None]:
# Number of features RFECV kept after cross-validated elimination.
pipeline['feature_selection'].n_features_

np.int64(153)

In [None]:
# Run both raw splits through the fitted preprocessing + feature-selection pipeline.
X_train_selected, X_test_selected = map(pipeline.transform, (X_train, X_test))



In [None]:
# Same two baseline regressors as before, now on the RFECV-selected features.
for model in (LinearRegression(), DecisionTreeRegressor(random_state=42)):
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    print(root_mean_squared_error(y_test, y_pred))

0.1578940521169381
0.1906192792810787


After **Feature Selection** the *RMSE* improved by almost 0.01, which is a good result.

In [None]:
# Persist the fitted preprocessing + feature-selection pipeline for reuse.
joblib.dump(pipeline, 'preprocessing_feature_pipeline.joblib')

['preprocessing_feature_pipeline.joblib']

# 3. Model Selection with Grid Search

In [None]:
# Registry of untuned reference estimators, keyed by the same names used in the
# grid-search configuration so each tuned model can be compared with its default.
models = dict(
    # Linear models
    Ridge=Ridge(random_state=42),
    Lasso=Lasso(max_iter=10000, random_state=42),
    ElasticNet=ElasticNet(max_iter=10000, random_state=42),

    # Tree-based models
    RandomForestRegressor=RandomForestRegressor(n_jobs=-1, random_state=42),
    GradientBoostingRegressor=GradientBoostingRegressor(random_state=42),
    XGBoostRegressor=XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        n_jobs=-1,
        random_state=42,
    ),

    # Other models
    SVR=SVR(),
    CatBoostRegressor=CatBoostRegressor(verbose=0, random_state=42),
    MLPRegressor=MLPRegressor(max_iter=10000, random_state=42),
)

In [None]:
# Grid-search configuration: name -> (base estimator, parameter grid).
# A grid is either a single dict or a list of dicts; GridSearchCV accepts both
# and explores each dict of a list as an independent sub-grid.
models_and_parameters = {
    # Linear models
    'Ridge': (Ridge(random_state=42), {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']
    }),
    'Lasso': (Lasso(max_iter=10000, random_state=42), {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'fit_intercept': [True, False],
        'selection': ['cyclic', 'random']
    }),
    'ElasticNet': (ElasticNet(max_iter=10000, random_state=42), {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
    }),

    # Tree-based models
    'RandomForestRegressor': (RandomForestRegressor(n_jobs=-1, random_state=42), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10, 15],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [2, 4, 6, 8],
        'subsample': [0.8, 1.0],
        'min_samples_leaf': [1, 2]
    }),
    'XGBoostRegressor': (XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_jobs=-1, random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [2, 4, 6, 8],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }),

    # Other models
    # Linear models were bad in new data, so didn't put linear kernel.
    # `degree` only affects the polynomial kernel, so it is searched in the
    # 'poly' sub-grid alone; crossing it with 'rbf' would just repeat identical fits.
    'SVR': (SVR(), [
        {
            'kernel': ['rbf'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'epsilon': [0.1, 0.01, 0.001]
        },
        {
            'kernel': ['poly'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'epsilon': [0.1, 0.01, 0.001],
            'degree': [2, 3]
        }
    ]),
    'CatBoostRegressor': (CatBoostRegressor(verbose=0, random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1],
        'depth': [4, 6, 8, 10], # "In most cases, the optimal depth ranges from 4 to 10. Values in the range from 6 to 10 are recommended." - Documentation
        'l2_leaf_reg': [1, 3, 5],
    }),
    # Unlike the untuned MLP in `models`, this base estimator enables early stopping.
    'MLPRegressor': (MLPRegressor(max_iter=10000, random_state=42, early_stopping=True), {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.01]
    })
}

In [None]:
def grid_search(name):
    """Tune one registered model with GridSearchCV and compare it with its default.

    The search runs on ``X_train_selected`` / ``y_train`` using the shared
    ``kfold`` splitter and the negated-RMSE scorer. Afterwards both the untuned
    model from ``models`` and the tuned model are scored on the hold-out split
    (``X_test_selected`` / ``y_test``) and the results are printed.

    Parameters
    ----------
    name : str
        Key present in both ``models`` and ``models_and_parameters``.

    Returns
    -------
    estimator
        ``grid.best_estimator_``: the best parameter combination, already
        refitted by GridSearchCV on the full training data.

    Raises
    ------
    KeyError
        If ``name`` is missing from ``models_and_parameters`` or ``models``.
    """
    # Imported locally because `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    print(f'--- {name} ---\n')

    # Grid Search
    print('- Grid Search -')
    model, param_grid = models_and_parameters[name]
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=kfold,
        n_jobs=-1,
        verbose=3
    )
    grid.fit(X_train_selected, y_train)
    print(f'Best parameters: {grid.best_params_}')
    # best_score_ is a negated RMSE; flip the sign for display.
    print(f'Best RMSE: {-grid.best_score_:.5f}')

    print('\n- Evaluating on Test Set -')
    # Default Model on Test
    # Fit a clone so the shared instance stored in `models` stays unfitted.
    default_model = clone(models[name])
    default_model.fit(X_train_selected, y_train)
    y_pred = default_model.predict(X_test_selected)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f'Default Model - RMSE: {rmse:.5f}')

    # Grid Model on Test Set
    # With refit=True (the GridSearchCV default) best_estimator_ has already been
    # refitted on the whole training set, so it is used as-is instead of being
    # fitted a second time on the same data.
    grid_model = grid.best_estimator_
    y_pred = grid_model.predict(X_test_selected)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f'Grid Model - RMSE: {rmse:.5f}')

    return grid_model

In [None]:
# Tune Ridge; the returned best estimator is kept for the stacking experiments.
ridge_model = grid_search('Ridge')

--- Ridge ---

- Grid Search -
Fitting 10 folds for each of 25 candidates, totalling 250 fits
Best parameters: {'alpha': 1, 'solver': 'auto'}
Best RMSE: 0.11698

- Evaluating on Test Set -
Default Model - RMSE: 0.16034
Grid Model - RMSE: 0.16034


`Ridge` performed very well in cross-validation on the training set, but poorly on the test set (new/unseen data), indicating overfitting.

In [None]:
# Tune Lasso; the returned best estimator is kept for the stacking experiments.
lasso_model = grid_search('Lasso')

--- Lasso ---

- Grid Search -
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters: {'alpha': 0.0001, 'fit_intercept': True, 'selection': 'cyclic'}
Best RMSE: 0.11645

- Evaluating on Test Set -
Default Model - RMSE: 0.38625
Grid Model - RMSE: 0.15911


After hyperparameter tuning, *RMSE* improved significantly. However, the model is overfitting too.

In [None]:
# Tune ElasticNet; the returned best estimator is kept for the stacking experiments.
elastic_model = grid_search('ElasticNet')

--- ElasticNet ---

- Grid Search -
Fitting 10 folds for each of 25 candidates, totalling 250 fits
Best parameters: {'alpha': 0.001, 'l1_ratio': 0.25}
Best RMSE: 0.11510

- Evaluating on Test Set -
Default Model - RMSE: 0.38625
Grid Model - RMSE: 0.16040


In [None]:
# Tune the random forest; the returned best estimator is kept for the stacking experiments.
rfr_model = grid_search('RandomForestRegressor')

--- RandomForestRegressor ---

- Grid Search -
Fitting 10 folds for each of 96 candidates, totalling 960 fits
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best RMSE: 0.13050

- Evaluating on Test Set -
Default Model - RMSE: 0.15418
Grid Model - RMSE: 0.15470


In [None]:
# Tune gradient boosting; the returned best estimator is kept for the stacking experiments.
gbr_model = grid_search('GradientBoostingRegressor')

--- GradientBoostingRegressor ---

- Grid Search -
Fitting 10 folds for each of 96 candidates, totalling 960 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 1, 'n_estimators': 300, 'subsample': 0.8}
Best RMSE: 0.12487

- Evaluating on Test Set -
Default Model - RMSE: 0.13997
Grid Model - RMSE: 0.13790


In [None]:
# Tune XGBoost; the returned best estimator is used as a base learner in the ensembles.
xgb_model = grid_search('XGBoostRegressor')

--- XGBoostRegressor ---

- Grid Search -
Fitting 10 folds for each of 96 candidates, totalling 960 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}
Best RMSE: 0.12300

- Evaluating on Test Set -
Default Model - RMSE: 0.14853
Grid Model - RMSE: 0.13473


In [None]:
# Tune SVR; the returned best estimator is used as a base learner in the ensembles.
svr_model = grid_search('SVR')

--- SVR ---

- Grid Search -
Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best parameters: {'C': 1, 'degree': 2, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
Best RMSE: 0.12058

- Evaluating on Test Set -
Default Model - RMSE: 0.14327
Grid Model - RMSE: 0.12816


In [None]:
# Tune CatBoost; the returned best estimator is kept for the stacking experiments.
cat_model = grid_search('CatBoostRegressor')

--- CatBoostRegressor ---

- Grid Search -
Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best parameters: {'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'n_estimators': 300}
Best RMSE: 0.11990

- Evaluating on Test Set -
Default Model - RMSE: 0.13741
Grid Model - RMSE: 0.14003


In [None]:
# Tune the MLP; the returned best estimator is kept for the stacking experiments.
mlp_model = grid_search('MLPRegressor')

--- MLPRegressor ---

- Grid Search -
Fitting 10 folds for each of 48 candidates, totalling 480 fits
Best parameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50), 'solver': 'lbfgs'}
Best RMSE: 0.14430

- Evaluating on Test Set -
Default Model - RMSE: 0.39830
Grid Model - RMSE: 0.20241


Table with the results:

| Model                   | Best RMSE (Grid Search) | Grid Model RMSE (Test Set) |
| :---------------------- | :---------------------- | :------------------------- |
| SVR                     | 0.12058                 | **0.12816** |
| XGBoostRegressor        | 0.12300                 | **0.13473** |
| GradientBoostingRegressor | 0.12487                 | **0.13790** |
| CatBoostRegressor       | 0.11990                 | **0.14003** |
| RandomForestRegressor   | 0.13050                 | **0.15470** |
| Lasso                   | 0.11645                 | **0.15911** |
| Ridge                   | 0.11698                 | **0.16034** |
| ElasticNet              | 0.11510                 | **0.16040** |
| MLPRegressor            | 0.14430                 | **0.20241** |

`SVR` and `XGBoost` are the best models. Let's try **ensemble techniques** with them.

# 4. Ensemble

## Voting Regressor

In [None]:
# Unweighted average of the tuned SVR and XGBoost predictions, scored on the hold-out split.
simple_voting_regressor = VotingRegressor(
    estimators=[('svr', svr_model), ('xgb', xgb_model)]
)
y_pred = simple_voting_regressor.fit(X_train_selected, y_train).predict(X_test_selected)
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Simple Voting Regressor - RMSE = {rmse:.5f}')

Simple Voting Regressor - RMSE = 0.12650


In [None]:
# Same two base models, but SVR's prediction counts twice as much as XGBoost's.
weighted_voting_regressor = VotingRegressor(
    estimators=[('svr', svr_model), ('xgb', xgb_model)],
    weights=[2, 1],  # I tried other values
)
y_pred = weighted_voting_regressor.fit(X_train_selected, y_train).predict(X_test_selected)
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Weighted Voting Regressor - RMSE = {rmse:.5f}')

Weighted Voting Regressor - RMSE = 0.12592


Weighted model is a little better. Let's try stacking them with the others.

## Stacking regressor

In [None]:
# I created this function to get the best model to stack with SVR and XGBoost
def stacking_regressor(model, name):
    """Stack the tuned SVR and XGBoost models under ``model`` and score the stack.

    The stack is fitted on ``X_train_selected`` / ``y_train`` and its RMSE on the
    hold-out split (``X_test_selected`` / ``y_test``) is printed.

    Parameters
    ----------
    model : estimator
        Regressor used as the stack's ``final_estimator``.
    name : str
        Label used in the printed RMSE line.

    Returns
    -------
    StackingRegressor
        The fitted stack that produced the printed score. Returning the fitted
        object (rather than a fresh, unfitted copy) means it can predict or be
        persisted directly; ``cross_val_score`` clones it before refitting, so
        it remains usable for cross-validation as well.
    """
    stack_models = [
        ('svr', svr_model),
        ('xgb', xgb_model),
    ]

    stack = StackingRegressor(estimators=stack_models, final_estimator=model)
    stack.fit(X_train_selected, y_train)
    y_pred = stack.predict(X_test_selected)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f'{name} - Stacking Regressor - RMSE = {rmse:.5f}')

    return stack

In [None]:
# Try every remaining tuned model (and both voting ensembles) as the final
# estimator on top of SVR + XGBoost; only the printed RMSE is of interest here.
for meta_learner, label in (
    (ridge_model, 'Ridge'),
    (lasso_model, 'Lasso'),
    (elastic_model, 'Elastic'),
    (rfr_model, 'Random'),
    (gbr_model, 'Gradient'),
    (cat_model, 'Cat'),
    (mlp_model, 'MLP'),
    (simple_voting_regressor, 'SimpleVR'),
    (weighted_voting_regressor, 'WeightedVR'),
):
    _ = stacking_regressor(meta_learner, label)

Ridge - Stacking Regressor - RMSE = 0.12516
Lasso - Stacking Regressor - RMSE = 0.12496
Elastic - Stacking Regressor - RMSE = 0.12520
Random - Stacking Regressor - RMSE = 0.13069
Gradient - Stacking Regressor - RMSE = 0.12547
Cat - Stacking Regressor - RMSE = 0.12168
MLP - Stacking Regressor - RMSE = 0.12294
SimpleVR - Stacking Regressor - RMSE = 0.12560
WeightedVR - Stacking Regressor - RMSE = 0.12525


The best stacking is with `CatBoost`.

In [None]:
# Build the stack with CatBoost as final estimator (lowest hold-out RMSE in the
# comparison above) and display the resulting estimator.
stack_model = stacking_regressor(cat_model, 'Cat')
stack_model

Cat - Stacking Regressor - RMSE = 0.12168


This is the final model.

# 5. Final Model Validation with Cross-Validation

In [None]:
# 10-fold cross-validation of the CatBoost stack on the training split.
# The scorer returns negated RMSE, hence the sign flip when reporting the mean.
scores = cross_val_score(
    stack_model,
    X_train_selected,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
)
print(f'CV RMSE: {-scores.mean():.5f} ± {scores.std():.5f}')

CV RMSE: 0.12738 ± 0.02307


This cross-validated score (0.12738) is worse than the **0.12168** obtained on the Test set, so the Test-set result looks optimistic. Let's try stacking with `MLP`.

In [None]:
# Build the stack with the tuned MLP as final estimator (second-lowest hold-out
# RMSE in the comparison above) and display the resulting estimator.
mlp_stack_model = stacking_regressor(mlp_model, 'MLP')
mlp_stack_model

MLP - Stacking Regressor - RMSE = 0.12294


In [None]:
# 10-fold cross-validation of the MLP stack on the training split.
# The scorer returns negated RMSE, hence the sign flip when reporting the mean.
scores = cross_val_score(
    mlp_stack_model,
    X_train_selected,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
)
print(f'CV RMSE: {-scores.mean():.5f} ± {scores.std():.5f}')

CV RMSE: 0.11678 ± 0.02441


The score is better than with `CatBoost`, so stacking with `MLP` is the final model.

In [None]:
# Persist the chosen stacking model.
# NOTE(review): verify that the object being dumped is fitted before relying on
# this artifact for prediction.
joblib.dump(mlp_stack_model, 'final_model.joblib')

['final_model.joblib']