# House Pricing: Automatic Data preprocessing & Modeling Techniques Selection using Pipelines

Notebook written by Pedro de Matos Gonçalves

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import collections
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 100) # Setting pandas to display a N number of columns
pd.set_option('display.max_rows', 10) # Setting pandas to display a N number rows
pd.set_option('display.width', 1000) # Setting pandas dataframe display width to N
import matplotlib.pyplot as plt # data visualization library
import plotly.graph_objs as go # interactive plotting library
from IPython.display import display # display from IPython.display
from itertools import cycle # function used for cycling over values


# Libraries used for Modeling
from scipy import stats
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from category_encoders import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from xgboost import XGBRegressor, plot_importance
from lightgbm import LGBMRegressor, plot_importance, plot_tree, create_tree_digraph


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found in it
for dirname, _, filenames in os.walk(top='/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for filename in full_paths:
        print(filename)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading the competition's training split and previewing its first ten rows
df = pd.read_csv(
    filepath_or_buffer="../input/house-prices-advanced-regression-techniques/train.csv"
)

display(df.head(n=10))

In [None]:
# Measuring the percentage of missing values in every column and keeping
# only the columns where more than 50% of the values are null
df_aux_nulls = (
    (df.isna().mean() * 100)
    .rename_axis("column_name")
    .reset_index(name="null_percentage")
)
df_aux_nulls = df_aux_nulls.loc[df_aux_nulls["null_percentage"] > 50]

print("Columns with more than 50% null percentage:")
df_aux_nulls.sort_values(by="null_percentage", ascending=False) # Columns with the highest null share come first

In [None]:
# The target is modelled on a natural-log scale (predictions are mapped back with np.exp later on),
# so we take the log of SalePrice while separating it from the features.
target = np.log(df['SalePrice'])

# Removing the identifier, the highly null columns and the target itself from the feature dataframe.
df.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice'], inplace=True)

In [None]:
# Taking sell month and year as categorical columns.
# `astype` returns a new Series instead of changing the dataframe, so the result has to be
# assigned back; otherwise both columns silently stay numeric.
df['YrSold'] = df['YrSold'].astype('object')
df['MoSold'] = df['MoSold'].astype('object')

# Now, we separate categorical and numerical column dataframes.
# Columns with `object` dtype are treated as categorical; every other dtype as numeric.
categorical_df = df.select_dtypes(include=['object'])
numeric_df = df.select_dtypes(exclude=['object'])

# And then, we store the names of the categorical and numerical columns.
categorical_features = list(categorical_df.columns)
numeric_features = list(numeric_df.columns)

print("Categorical features:\n", categorical_features)
print("\nNumeric features:\n", numeric_features)

In [None]:
# Note: this cell only simulates what the pipeline does to our features (mainly the categorical ones),
# so we can inspect the column layout produced by one-hot encoding.
# NOTE(review): the encoder inside the pipeline is fitted on the training split only and after imputation,
# so the dummy columns it produces may differ from the ones simulated here -- confirm before relying
# on `feature_names` to label feature importances.
ohe = OneHotEncoder(use_cat_names=True) # One-hot encoder that names dummies after the category values
df_onehotencoded = ohe.fit_transform(df) # Encoding the original dataframe

# The pipeline handles the columns in this order:
# -> 1st: numeric columns.
# -> 2nd: categorical (one-hot encoded) columns.
# So we keep only the dummies here and put the untouched numeric columns in front of them.
df_onehotencoded = df_onehotencoded.drop(columns=numeric_features)
df_all_features = pd.concat([df[numeric_features], df_onehotencoded], axis=1)

display(df_all_features.head(10)) # Preview of the simulated pipeline output


# Feature names in the simulated order, kept for plotting Feature Importances later.
# Nulls are imputed inside the pipeline, so the "_nan" dummies created here
# would not exist there and are left out.
feature_names = [column for column in df_all_features.columns if '_nan' not in column]

print("Feature names after Pipeline transformation (numeric ones first, then categorical ones):\n\n", feature_names)


## Model training & Evaluation functions

After all the preprocessing, we are now ready for building and evaluating different Machine Learning models.

First, let's create a function responsible for evaluating our regressor on the test set we have created.


In [None]:
def testSetResultsRegressor(regressor, x_test, y_test):
    predictions = regressor.predict(x_test)
    
    results = []
    
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    r2 = r2_score(y_test, predictions)
    
    results.append(mae)
    results.append(mse)
    results.append(rmse)
    results.append(r2)
    
    print("\n\n#---------------- Test set results (Best Regressor) ----------------#\n")
    print("Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R² Score:")
    print(results)
    
    return results

Now, we fit several different data preprocessing, feature selection and modeling techniques inside a Pipeline, to check which group of techniques has better performance.

In [None]:
# Building a Pipeline responsible for finding best model and it's parameters
def defineBestModelPipeline(df, target, categorical_features, numeric_features):
    """Search over preprocessing, dimensionality reduction and regressors with RandomizedSearchCV.

    The data is split 80/20 into train and test sets. A three-step Pipeline
    (column transformations -> optional PCA -> regressor) is then tuned by sampling
    50 candidates with 5-fold cross-validation, and the candidate with the best
    negated RMSE is refitted on the whole training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature dataframe containing the columns listed in both feature lists.
    target : pandas.Series
        Regression target aligned with ``df``.
    categorical_features : list of str
        Columns imputed with the most frequent value and one-hot encoded.
    numeric_features : list of str
        Columns imputed and scaled by one of the four numeric transformers.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, best_model_pipeline)``; the targets are numpy
        arrays and ``best_model_pipeline`` is the fitted RandomizedSearchCV object.

    Notes
    -----
    The PCA sizes are derived from the notebook-level ``df_all_features`` dataframe, so the
    cell that creates it must run before this function is called.
    """

    # Splitting data into Train and Test
    x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=1)
    y_train = y_train.to_numpy() # Transforming training targets into numpy arrays
    y_test = y_test.to_numpy() # Transforming test targets into numpy arrays


    # Pipeline's data transformations
    # 1st -> Numeric Transformers (we'll try several different imputer/scaler combinations)
    numeric_transformers = [
        Pipeline(steps=[('imp', IterativeImputer(max_iter=10, random_state=1)),
                        ('scaler', MinMaxScaler())]),
        Pipeline(steps=[('imp', IterativeImputer(max_iter=30, random_state=7)),
                        ('scaler', StandardScaler())]),
        Pipeline(steps=[('imp', SimpleImputer(strategy='mean')),
                        ('scaler', MinMaxScaler())]),
        Pipeline(steps=[('imp', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())]),
    ]

    # 2nd -> Categorical Transformer
    categorical_transformer = Pipeline(steps=[('frequent', SimpleImputer(strategy='most_frequent')),
                                              ('onehot', OneHotEncoder(use_cat_names=True))])

    # 3rd -> Different Data Transformation Steps, each one with a different numerical transformation
    data_transformations = [
        ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                        ('cat', categorical_transformer, categorical_features)])
        for numeric_transformer in numeric_transformers
    ]


    # Placeholder pipeline: every step below is replaced by the values sampled during the search
    pipe = Pipeline(steps=[('data_transformations', data_transformations[0]), # Initializing data transformation step by choosing any of the above
                           ('feature_eng', PCA()), # Initializing feature engineering step by choosing any desired method
                           ('reg', SVR())]) # Initializing modeling step of the pipeline with any model object
                           #memory='cache_folder') -> Used to optimize memory when needed


    # Feature engineering options shared by every regressor: no reduction at all (None),
    # or PCA keeping 90%, 70% or 50% of the simulated one-hot encoded column count.
    # TSNE is intentionally not offered: it has no `transform` method, so it cannot be used as an
    # intermediate Pipeline step, and its default solver rejects n_components >= 4. Every candidate
    # that sampled it failed to fit and only consumed search iterations.
    # NOTE(review): the column count comes from the full-data simulation (including "_nan" dummies),
    # so it may be larger than what the pipeline actually outputs -- confirm that the 90% option
    # never exceeds the real number of transformed features.
    n_simulated_features = df_all_features.shape[1]
    feature_eng_options = [None] + [PCA(n_components=round(n_simulated_features*ratio))
                                    for ratio in (0.9, 0.7, 0.5)]


    # Regressor-specific part of the search space: one dictionary per model family
    regressor_spaces = [
        {'reg': [KNeighborsRegressor()],
         'reg__n_neighbors': stats.randint(1, 30),
         'reg__metric': ['minkowski', 'euclidean']},

        {'reg': [LinearRegression()]},

        {'reg': [SVR()],
         'reg__C': stats.uniform(0.01, 10),
         'reg__gamma': stats.uniform(0.01, 10)},

        # 'gini' and 'entropy' are classification criteria and make a regressor fail to fit,
        # so the regression criteria are searched instead.
        # "auto" was removed from max_features and was equivalent to None for regressors;
        # "sqrt" is offered in its place in the three tree-based spaces below.
        {'reg': [DecisionTreeRegressor()],
         'reg__criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
         'reg__max_features': [None, "sqrt", "log2"],
         'reg__max_depth': stats.randint(1, 7)},

        {'reg': [RandomForestRegressor()],
         'reg__n_estimators': stats.randint(10, 300),
         'reg__max_features': [None, "sqrt", "log2"],
         'reg__max_depth': stats.randint(1, 7)},

        {'reg': [ExtraTreesRegressor()],
         'reg__n_estimators': stats.randint(10, 300),
         'reg__max_features': [None, "sqrt", "log2"],
         'reg__max_depth': stats.randint(1, 7)},

        {'reg': [GradientBoostingRegressor()],
         'reg__n_estimators': stats.randint(10, 200),
         'reg__learning_rate': stats.uniform(0.01, 1.2),
         'reg__max_depth': stats.randint(1, 9)},

        {'reg': [LGBMRegressor()],
         'reg__n_estimators': stats.randint(1, 150),
         'reg__learning_rate': stats.uniform(0.01, 1),
         'reg__max_depth': stats.randint(1, 5)},

        {'reg': [XGBRegressor()],
         'reg__n_estimators': stats.randint(1, 125),
         'reg__eta': stats.uniform(0.01, 1),
         'reg__max_depth': stats.randint(1, 8),
         'reg__gamma': stats.uniform(0.01, 1)},

        {'reg': [StackingRegressor(estimators=[('svr', SVR(C=10, gamma=10)),
                                               ('rf', RandomForestRegressor(max_depth=5, max_features=None, n_estimators=50, n_jobs=-1)),
                                               ('xgb', XGBRegressor(eta=0.5, gamma=0.5, max_depth=4, n_estimators=25))],
                                   final_estimator=LinearRegression())]},

        {'reg': [StackingRegressor(estimators=[('lgbm', LGBMRegressor(n_estimators=30, learning_rate=0.4, max_depth=6)),
                                               ('etc', ExtraTreesRegressor(max_depth=6, max_features=None, n_estimators=30)),
                                               ('gbt', GradientBoostingRegressor(learning_rate=0.6, max_depth=5, n_estimators=15))],
                                   final_estimator=LinearRegression())]},
    ]

    # Now, defining the grid of parameters to search for. RandomizedSearchCV randomly picks
    # options for each step inside the dictionaries, and returns the best combination as our final pipeline.
    # Every regressor is combined with the same preprocessing and feature engineering options.
    params_grid = [{'data_transformations': data_transformations,
                    'feature_eng': feature_eng_options,
                    **regressor_space}
                   for regressor_space in regressor_spaces]


    # Now, we fit a RandomizedSearchCV to search over the grid of parameters defined above
    metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2']

    best_model_pipeline = RandomizedSearchCV(pipe, params_grid, n_iter=50, scoring=metrics,
                                             refit='neg_root_mean_squared_error',
                                             n_jobs=-1, cv=5, random_state=1)

    best_model_pipeline.fit(x_train, y_train)


    # At last, we check the final results
    print("\n\n#---------------- Best Data Pipeline found in RandomSearchCV  ----------------#\n\n", best_model_pipeline.best_estimator_[0])
    print("\n\n#---------------- Best Feature Engineering technique found in RandomSearchCV  ----------------#\n\n", best_model_pipeline.best_estimator_[1])
    print("\n\n#---------------- Best Regressor found in RandomSearchCV  ----------------#\n\n", best_model_pipeline.best_estimator_[2])
    # best_score_ holds the refit metric, i.e. the *negated* RMSE; the sign is flipped so the
    # printed value matches its "RMSE" label.
    print("\n\n#---------------- Best Estimator's average RMSE Score on CV (validation set) ----------------#\n\n", -best_model_pipeline.best_score_)

    return x_train, x_test, y_train, y_test, best_model_pipeline

In [None]:
# Running the search defined above: it returns the train/test split together with the fitted search object
x_train, x_test, y_train, y_test, best_model_pipeline = defineBestModelPipeline(
    df=df,
    target=target,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
)


# Measuring how the best pipeline performs on the held-out test split
test_set_results = testSetResultsRegressor(regressor=best_model_pipeline, x_test=x_test, y_test=y_test)

After going through all the steps in RandomizedSearchCV, we can check the results of each sampled candidate using its `cv_results_` attribute.

In [None]:
# Gathering every metric of every candidate sampled by the search into a single table
df_results = pd.DataFrame(data=best_model_pipeline.cv_results_)

display(df_results)

In [None]:
# Now showing only the row(s) of the best regressor, i.e. the candidate ranked first on negated RMSE
display(df_results.loc[df_results['rank_test_neg_root_mean_squared_error'].eq(1)])

If we want to, it's also possible to check the feature importances of the best model, in case they're easy to understand and explain.

Just remember that, if the best pipeline found by RandomizedSearchCV applies dimensionality reduction or creates new features using PolynomialFeatures, it will be much harder to explain the importances.

In a scenario where no transformations are applied to the features inside the pipeline, if the model is tree-based (RandomForestRegressor, for example) or linear (LinearRegression, for example), then explaining the most important features becomes much easier.


In [None]:
# # Plotting feature importances of the best model, if tree-based (top 5 features)
# print("\n#---------------- Bar plot with feature importances ----------------#")
# feat_importances = pd.Series(best_model_pipeline.best_estimator_.named_steps['reg'].feature_importances_, index=feature_names)
# feat_importances.nlargest(5).plot(kind='barh')


# # Plotting feature importances of the best model, if linear regression-based (top 5 features)
# print("\n#---------------- Bar plot with feature importances ----------------#")
# feat_importances = pd.Series(best_model_pipeline.best_estimator_.named_steps['reg'].coef_[0], index=feature_names)
# feat_importances.nlargest(5).plot(kind='barh')


# Plotting feature importances from LGBM Regressor
# `plot_importance` is LightGBM's plotting helper here (its import comes after XGBoost's and
# replaces it), so it is only called when the search actually selected an LGBMRegressor.
# For any other winner we report it instead of letting the cell raise.
best_regressor = best_model_pipeline.best_estimator_[2]

if isinstance(best_regressor, LGBMRegressor):
    plot_importance(best_regressor, figsize=(10, 14))
else:
    print("Best regressor is", type(best_regressor).__name__,
          "- skipping the LightGBM feature importance plot.")

---

# Predictions

Now that we have tried different preprocessing and modeling techniques, resulting in a final best pipeline, let's use it to predict the test data provided by Kaggle.

In [None]:
# Loading the test data provided by the competition
df_test = pd.read_csv(filepath_or_buffer="../input/house-prices-advanced-regression-techniques/test.csv")

# Removing the same columns that were removed from the training features
df_test.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], inplace=True)

# best_model_pipeline does two things when predicting:
# Step 1 -> transforms the data the same way the training set was transformed;
# Step 2 -> predicts with the best model obtained by the search.
# The model was trained on the log of the target, so its predictions are also in log scale
# and np.exp() brings them back to the linear scale.
test_predictions = np.exp(best_model_pipeline.predict(df_test))
print(test_predictions)

In [None]:
# Building the predictions table that is going to be submitted to the competition
df_submission = pd.read_csv(filepath_or_buffer="../input/house-prices-advanced-regression-techniques/test.csv")

df_submission['SalePrice'] = test_predictions # Predicted values, in the same row order as the test file

df_submission = df_submission[['Id', 'SalePrice']] # Keeping only the two columns needed for the submission

df_submission.head(10)

In [None]:
# Checking if the number of rows is OK (the file is expected to have 1459 rows).
# count() reports the number of non-null entries per column, so every column of
# df_submission should show the expected row count.
df_submission.count()

In [None]:
# Writing the submission to a CSV file in the current working directory,
# without the dataframe index as an extra column
df_submission.to_csv('submission.csv', index=False)