In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'house-prices-advanced-regression-techniques:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F5407%2F868283%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240821%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240821T112103Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9ef080e2973d73840f3b7744419b5d1f174c6443bd0e81488c73820c1d0f5223363d78ad612efba68912c37e16b15d68de5545ca596711b75ac7b3979af96ee36dc6e55ba88cea0ddea572fe8eecfb1b8cea4076f792693275cdcf447c845089a1ecb6881ddba454d71cca2a984a09520c7c3e4c84ac3b38cd588d34a65256778369867bf8ae1f65a322571560c19cf9d9a2e65b2556fdcebce1b00494483a822505089e2ddd123d765d2d78c68da6a665206cc45cccef16b2e2a7f7ab8c99131a024c220bafc6961c03171326bd53d6436e507170a8530ec45882e11ac579518e587dd0a54d1cbe03addfc48076063b4c492392d378f32c3d82c4180763f712'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading house-prices-advanced-regression-techniques, 203809 bytes compressed
Downloaded and uncompressed: house-prices-advanced-regression-techniques
Data source import complete.


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt


# Notebook description
**Goal** : use data preprocessing and data engineering to improve results.

Used only 1 regressor (XGBoost), just iterate on the data preprocessing to improve the result.

**Future plan:** After improving data preprocessing and feature engineering, use it on different regressors.

MSE changes:

1. 0.00018350863778397952 (no feature engineering)
2. 0.0001883511693566672 (no feature engineering + pca)
3. 0.021290293756569583 (feature engineering + pca)



# Importing the dataset

In [3]:
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [4]:
# splitting train dataset into training and test dataset
from sklearn.model_selection import train_test_split

X_train = df_train.drop("SalePrice", axis = 1)
y_train = np.log(df_train["SalePrice"])

# Data exploration

In [None]:
df_train.shape

In [None]:
df_train.describe().T

In [None]:
import matplotlib.pyplot as plt

sale_price = df_train["SalePrice"]
plt.hist(sale_price, bins=50, edgecolor='black')
plt.title('Sale price histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

In [None]:
sale_price.describe()

In [None]:
log_sale_price = np.log(sale_price)
plt.hist(log_sale_price, bins=50, edgecolor='black')
plt.title('Log sale price Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

In [None]:
log_sale_price.describe()

# Data preprocessing

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:

# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

In [7]:
# Update categorical and numerical columns
categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')


## Simplifying model for testing

In [8]:
categorical_columns_test = pd.Index(['MSZoning', 'HouseStyle'])

In [10]:
numerical_columns_test = pd.Index(['LotArea', 'YearBuilt', 'TotRmsAbvGrd'])

In [11]:
numerical_columns = numerical_columns_test
categorical_columns = categorical_columns_test

In [12]:
df_cats = X_train[categorical_columns_test]
df_cats

Unnamed: 0,MSZoning,HouseStyle
0,RL,2Story
1,RL,1Story
2,RL,2Story
3,RL,2Story
4,RL,2Story
...,...,...
1455,RL,2Story
1456,RL,1Story
1457,RL,2Story
1458,RL,1Story


In [13]:
df_nums = X_train[numerical_columns_test]
df_nums

Unnamed: 0,LotArea,YearBuilt,TotRmsAbvGrd
0,8450,2003,8
1,9600,1976,6
2,11250,2001,6
3,9550,1915,7
4,14260,2000,9
...,...,...,...
1455,7917,1999,7
1456,13175,1978,7
1457,9042,1941,9
1458,9717,1950,5


In [14]:
X_simple = pd.concat([df_nums, df_cats], axis=1)

In [15]:
X_train = X_simple

## Normal flow

In [16]:
# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

In [17]:
# Apply the pipeline to your dataset
X = X_train
y = np.log(y_train) #normalize dependent variable
X_preprocessed = pipeline.fit_transform(X)



In [None]:
X_preprocessed.shape

(1460, 16)

# PCA

In [None]:
from sklearn.decomposition import PCA

pca = PCA()
X_pca_pre = pca.fit_transform(X_preprocessed)

# Calculate the cumulative explained variance
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Choose the number of components based on the explained variance threshold
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

pca = PCA(n_components=n_components)
pipeline_pca = Pipeline(steps=
                        [('preprocessor', preprocessor),
                        ('pca', pca)])

X_pca = pipeline_pca.fit_transform(X)



# Feature engineering

In [None]:
from sklearn.preprocessing import FunctionTransformer

def custom_features(df):
    df_out = df.copy()
    df_out['PropertyAge'] = df_out['YrSold'] - df_out['YearBuilt']
    df_out['TotalSF'] = df_out['TotalBsmtSF'] + df_out['1stFlrSF'] + df_out['2ndFlrSF']
    df_out['TotalBath'] = df_out['FullBath'] + 0.5 * df_out['HalfBath'] + df_out['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    df_out['HasRemodeled'] = (df_out['YearRemodAdd'] != df_out['YearBuilt']).astype(object)
    df_out['Has2ndFloor'] = (df_out['2ndFlrSF'] > 0).astype(object)
    df_out['HasGarage'] = (df_out['GarageArea'] > 0).astype(object)
    df_out['YrSold_cat'] = df_out['YrSold'].astype(object)
    df_out['MoSold_cat'] = df_out['MoSold'].astype(object)
    df_out['YearBuilt_cat'] = df_out['YearBuilt'].astype(object)
    df_out['MSSubClass_cat'] = df_out['MSSubClass'].astype(object)

    return df_out

feature_engineering_transformer = FunctionTransformer(custom_features)

In [None]:
# Identify categorical and numerical columns
new_cols_categorical = pd.Index(['HasRemodeled', 'Has2ndFloor', 'HasGarage'])
new_cols_numeric = pd.Index(['PropertyAge', 'TotalSF', 'TotalBath', 'YrSold_cat', 'MoSold_cat', 'YearBuilt_cat', 'MSSubClass_cat'])

# Update categorical and numerical columns
categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns.append(new_cols_categorical)
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns.append(new_cols_numeric)

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline_fe = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor', preprocessor),
    ('pca', pca)])

# Apply the pipeline to your dataset
X = df_train.drop('SalePrice', axis=1)
y = np.log(df_train['SalePrice'])
X_preprocessed_fe = pipeline_fe.fit_transform(X)



# Model training, prediction and evaluation

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed,
                            y, test_size=0.2, random_state=42)

from sklearn.metrics import mean_squared_error

## Linear regression

In [19]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

pred = regressor.predict(X_test)

mse = mean_squared_error(y_test,pred) # baseline

print("Simple linear regression MSE: " + str(mse))

Simple linear regression MSE: 0.00041855650464361796


## Exporting the model

In [20]:
import joblib

joblib.dump(regressor, 'linear_reg_model.joblib')

['linear_reg_model.joblib']

## Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree = 3)
X_poly = poly_reg.fit_transform(X_train)
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

X_poly_test = poly_reg.fit_transform(X_test)
pred = lin_reg.predict(X_poly_test)

mse = mean_squared_error(y_test,pred) # baseline

print("Polynomial regression MSE: " + str(mse))

## XGBoost

In [None]:
from xgboost import XGBRegressor

# Instantiation
xgb_r = XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 10, seed = 123)

# Fitting the model
xgb_r.fit(X_train, y_train)

pred = xgb_r.predict(X_test)

mse = mean_squared_error(y_test,pred) # baseline

print("XGBoost MSE: " + str(mse))

## Decision Tree Regression model

In [None]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

pred = regressor.predict(X_test)

mse = mean_squared_error(y_test,pred)

print("Decision Tree Regression MSE: " + str(mse))

## Random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor.fit(X_train, y_train)

pred = regressor.predict(X_test)

mse = mean_squared_error(y_test,pred)

print("Decision Tree Regression MSE: " + str(mse))

## SVM regressor

In [None]:
from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

pred = regressor.predict(X_test)

mse = mean_squared_error(y_test,pred)

print("Decision Tree Regression MSE: " + str(mse))

## ANN

In [None]:
import tensorflow as tf

ann = tf.keras.models.Sequential()

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid')) # softmax if output is non-categorical

In [None]:
ann.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

In [None]:
ann.fit(X_train, y_train, batch_size = 128, epochs = 20)

In [None]:
pred = ann.predict(X_test)

mse = mean_squared_error(y_test,pred)

print("ANN MSE: " + str(mse))

## MLP regressor

In [None]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=42, max_iter=10000, n_iter_no_change=3)

mlp.fit(X_train, y_train)

pred = mlp.predict(X_test)

mse = mean_squared_error(y_test,pred)

print("MLP regressor MSE: " + str(mse))

# Submission

## Polynomial regression

In [None]:
X_preprocessed_test = pipeline_fe.fit_transform(df_test)

In [None]:
X_poly_test = poly_reg.fit_transform(X_preprocessed_test)

In [None]:
pred = lin_reg.predict(X_poly_test)

df_stack_out = df_test[['Id']].copy()
df_stack_out['SalePrice'] = pred

df_stack_out.to_csv('submission_poly_reg.csv', index=False)