In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.kernel_ridge
import sklearn.linear_model
import sklearn.model_selection
import sklearn.svm
import tensorflow as tf
pd.set_option('display.max_columns', 100)
sns.set_style("darkgrid")

2021-10-02 07:48:02.705260: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-10-02 07:48:02.705386: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Read CSVs
dirname = '/kaggle/input/house-prices-advanced-regression-techniques/'
benchmark = pd.read_csv(dirname + 'sample_submission.csv')
train = pd.read_csv(dirname + 'train.csv')
test = pd.read_csv(dirname + 'test.csv')
# train.columns.values

In [3]:
# Benchmark: random integer mapping, regression on features with |corr| > 0.3
cols = []
benchmark_train = train.copy().drop(["Id", "SalePrice"], axis=1)

for col in benchmark_train.columns:
    
    na_cnt = benchmark_train[col].isna().sum()
    
    if benchmark_train[col].dtype != int and benchmark_train[col].dtype != float:
        mappings = zip(benchmark_train[col].unique(), range(len(benchmark_train[col].unique())))
        benchmark_train[col] = benchmark_train[col].map(dict(mappings))
    
#     print('{}: {} null & corr {}'.format(
#         col, 
#         na_cnt, 
#         benchmark_train[col].corr(train['SalePrice'])
#     ))  
    
    if abs(benchmark_train[col].corr(train['SalePrice'])) > 0.3 and benchmark_train[col].isna().sum() == 0:
        cols.append(col)
    
lr = sklearn.linear_model.LinearRegression()
lr.fit(benchmark_train[cols], train['SalePrice'])
# benchmark_prediction = lr.predict(test[cols])
print('Benchmark score: ', lr.score(benchmark_train[cols], train['SalePrice']))

Benchmark score:  0.7915365120379597


In [4]:
# Functions for data preprocessing
stats_format_categorical = '''
~ Summary of {} ~

# of null values : {}
# of unique values : {}
# of occurrences of most frequent value: {}
Correlation ~ price : {}
__________________________________
Value Counts
{}
__________________________________
Mean Price
{}

'''

stats_format_continuous = '''
~ Summary of {} ~

# of 0s : {}
# of null values : {}
Correlation ~ price : {}
__________________________________

'''

def check_categorical(train, colname, map_numerical=False):
    if map_numerical:
        mapping = dict(zip(train[colname].unique(), range(train[colname].unique().size)))
    print(stats_format_categorical.format(
        colname, 
        train[colname].isna().sum(), 
        train[colname].unique().size,
        (train[colname] == train[colname].mode()[0]).sum(),
        train[colname].map(mapping).corr(train['SalePrice']) if map_numerical else train[colname].corr(train['SalePrice']),
        train[colname].value_counts(dropna=False),
        train[[colname, 'SalePrice']].groupby(colname, as_index=False, dropna=False).mean().sort_values(by='SalePrice')
    ))

def generate_mapping(train, colname):
    return dict(zip(train[[colname, 'SalePrice']].groupby(
        colname, as_index=False, dropna=False
        ).mean().sort_values(by='SalePrice')[colname], range(
        train[colname].unique().size)))

def process_categorical(train, colname, test=None, show_stats=True):
    if show_stats:
        print('Before:')
        check_categorical(train, colname, map_numerical=True)
    
    mapping = generate_mapping(train, colname)
    train[colname] = train[colname].map(mapping).astype(int)
    
    if test is not None:
        test[colname] = test[colname].map(mapping)
        test[colname].fillna(test[colname].mode().values[0], inplace=True)
        test[colname] = test[colname].astype(int)
    
    if show_stats:
        print('Mapping: {}\n'.format(mapping))
        print('After:')
        check_categorical(train, colname)
        sns.regplot(x=train[colname], y=train['SalePrice'])

def band_categorical(train, colname, test=None, buckets=8):
    bandname = colname + "Band"

    _, bins = pd.qcut(train[colname], buckets, retbins=True)
    train.insert(loc=len(train.columns), column=bandname, value=-1)
    if test is not None:
        test.insert(loc=len(test.columns), column=bandname, value=-1)
    bins[0], bins[-1] = float("-inf"), float("inf")

    for i in range(buckets):
        train.loc[(train[colname] >= bins[i]) 
                & (train[colname] < bins[i + 1]), bandname] = i
        if test is not None:
            test.loc[(test[colname] >= bins[i]) 
                   & (test[colname] < bins[i + 1]), bandname] = i
    
def check_continuous(train, colname):
    print(stats_format_continuous.format(
        colname, 
        (train[colname] == 0).sum(),
        train[colname].isna().sum(), 
        train[colname].corr(train['SalePrice']),
    ))
    sns.regplot(x=train[colname], y=train['SalePrice'])

def process_continuous(train, colname, test=None, show_stats=True):
    minval = train[colname].min()
    maxval = train[colname].max()
    train[colname] = (train[colname] - minval) / (maxval - minval)
    if test is not None:
        test[colname] = (test[colname] - minval) / (maxval - minval)
    if show_stats:
        check_continuous(train, colname)

In [5]:
# Define and group feature columns
cols_to_drop = [# Data extremely skewed
                'Street', 'Alley', 'Utilities', 'Condition2', 'LandSlope',
                'BsmtFinSF2', 'RoofMatl', 'BsmtFullBath', 'GarageCond',
                'Heating', 'Electrical', 'LowQualFinSF', 'EnclosedPorch', 
                'BsmtHalfBath', 'KitchenAbvGr', 'Functional', 'PavedDrive',
                '3SsnPorch', 'ScreenPorch',  'PoolArea', 'GarageQual', 
                'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold',
                'LandContour', 'Condition1', 'BldgType', 'LotConfig', 'RoofStyle',
                'ExterCond', 'BsmtCond', 'BsmtFinType2', 'SaleCondition',
                'BedroomAbvGr', 'CentralAir', 'SaleType',
                'Id',          # Unrelated to price
                'LotFrontage', # Too many null values
                'GarageYrBlt', # YearBuilt has better correlation
                'Exterior2nd', # High correlation with Exterior1st
                'TotalBsmtSF', # High correlation with 1stFlrSF
                '2ndFlrSF',    # Some correlation with 1stFlrSF and too many 0s
               ]
cols_categorical = ['MSSubClass', 'MSZoning', 'LotShape', 'HouseStyle', 
                    'OverallQual', 'OverallCond', 'Neighborhood', 
                    'ExterQual', 'Foundation', 'Exterior1st',
                    'BsmtQual', 'BsmtExposure', 'MasVnrType', 
                    'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'TotRmsAbvGrd', 
                    'Fireplaces', 'FireplaceQu', 'GarageType', 'FullBath', 
                    'GarageFinish', 'GarageCars', 'HalfBath']
cols_continuous = ['LotArea', '1stFlrSF', 'GrLivArea','GarageArea', 'OpenPorchSF',
                   'BsmtFinSF1', 'BsmtUnfSF', 'MasVnrArea', 'WoodDeckSF']
cols_to_band = ['YearBuilt', 'YearRemodAdd']

cols = [*cols_categorical, *cols_continuous]
assert len(cols) == len(cols_categorical) + len(cols_continuous)
assert len(train.columns) - 1 == len(cols_to_drop) + len(cols) + len(cols_to_band)

In [6]:
# Preprocessing

# Some categorical features have null value as a category
# This will cause errors during preprocessing: replace it with a string
for data in [train, test]:
    data["Alley"].fillna('NA', inplace=True)
    data["MasVnrType"].fillna('NA', inplace=True)
    data["BsmtQual"].fillna('NA', inplace=True)
    data["BsmtCond"].fillna('NA', inplace=True)
    data["BsmtExposure"].fillna('NA', inplace=True)
    data["BsmtFinType1"].fillna('NA', inplace=True)
    data["BsmtFinType2"].fillna('NA', inplace=True)
    data["FireplaceQu"].fillna('NA', inplace=True)
    data["GarageType"].fillna('NA', inplace=True)
    data["GarageFinish"].fillna('NA', inplace=True)
    data["GarageQual"].fillna('NA', inplace=True)
    data["GarageCond"].fillna('NA', inplace=True)
    data["PoolQC"].fillna('NA', inplace=True)
    data["Fence"].fillna('NA', inplace=True)
    data["MiscFeature"].fillna('NA', inplace=True)

# Some features in train/test dataset are missing
# For categorical, fill with the mode; for continuous, fill with the mean
for data in [train, test]:
    for col in cols_categorical:
        data[col].fillna(data[col].mode().values[0], inplace=True)
    for col in cols_continuous:
        data[col].fillna(data[col].mean(), inplace=True)
    
# Make categorical features numerical and ordinal by mean price
# Normalize continuous features at the range of [0, 1]
# Band features with sparse categories
for col in cols_categorical:
    process_categorical(train, col, test=test, show_stats=False)
for col in cols_continuous:
    process_continuous(train, col, test=test, show_stats=False)
for col in cols_to_band:
    band_categorical(train, col, test=test)

# Some features can be combined for better inference
for data in [train, test]:
    data["Bath"] = data["FullBath"] + data["HalfBath"]
    data['YearBand'] = data['YearBuiltBand'] + data['YearRemodAddBand']
for colname in ['FullBath', 'HalfBath']:
    cols_categorical.remove(colname)
    cols.remove(colname)
for colname in ['Bath', 'YearBand']:
    cols_categorical.insert(0, colname)
    cols.insert(0, colname)
    
# The label feature is too large and too focused around the mean
# Let's normalize it to make training more smooth
# Since we need to de-normalize later, save the parameters
train["SalePrice"] = np.log(train["SalePrice"])
price_min = train["SalePrice"].min()
price_max = train["SalePrice"].max()
train["SalePrice"] = (train["SalePrice"] - price_min) / (price_max - price_min)

In [7]:
# Train various sklearn models
sk_models = {
    "Linear Regression": {"model": sklearn.linear_model.LinearRegression(), "accuracy": None},
    "SGD Regressor": {"model": sklearn.linear_model.SGDRegressor(), "accuracy": None},
    "Support Vector Machine": {"model": sklearn.svm.SVR(), "accuracy": None},
    "Kernel Ridge": {"model": sklearn.kernel_ridge.KernelRidge(), "accuracy": None},
    "Elastic Net": {"model": sklearn.linear_model.ElasticNet(), "accuracy": None},
    "Bayesian Ridge": {"model": sklearn.linear_model.BayesianRidge(), "accuracy": None},
    "Gradient Boosting Regressor": {"model": sklearn.ensemble.GradientBoostingRegressor(), "accuracy": None},
}

for sk_model_name in sk_models:
    sk_model = sk_models[sk_model_name]["model"]
    sk_model.fit(train[cols], train['SalePrice'])
    sk_model_acc = round(sk_model.score(train[cols], train['SalePrice']), 4)
    sk_models[sk_model_name]["accuracy"] = sk_model_acc
    print("{} Accuracy: {}".format(sk_model_name, sk_model_acc))

prediction_sk = sk_models['Gradient Boosting Regressor']["model"].predict(test[cols])

Linear Regression Accuracy: 0.8723
SGD Regressor Accuracy: -2.7431339436310936e+16
Support Vector Machine Accuracy: 0.8503
Kernel Ridge Accuracy: 0.8673
Elastic Net Accuracy: 0.1882
Bayesian Ridge Accuracy: 0.8715
Gradient Boosting Regressor Accuracy: 0.9507


In [8]:
# Build deep learning model
embeddings_dim = 32
hidden_dim = 256

def build_model():
    # 1. Continuous feature inputs
    continuous_inputs = tf.keras.Input(shape=(len(cols_continuous),))
    _ = tf.keras.layers.Dense(units=hidden_dim, 
                              activation='relu')(continuous_inputs)
    _ = tf.keras.layers.Dense(units=embeddings_dim, 
                              activation='relu')(_)
    _ = tf.keras.layers.Reshape(target_shape=(1, embeddings_dim))(_)
    
    # 2. Discrete feature inputs
    discrete_inputs = []
    embeddings = []
    for idx, col in enumerate(cols_categorical):
        discrete_inputs.append(tf.keras.Input(shape=(1,)))
        embeddings.append(
            tf.keras.layers.Embedding(input_dim=train[col].max() + 1, 
                                      output_dim=embeddings_dim, 
                                      input_length=1,
                                      embeddings_regularizer='l2')(discrete_inputs[idx]))
        
    # 3. Concatenation & Final MLP
    _ = tf.keras.layers.Concatenate()([_, *embeddings])
    _ = tf.keras.layers.Flatten()(_)
    _ = tf.keras.layers.Dense(units=hidden_dim, 
                              activation='relu', 
                              kernel_regularizer='l2')(_)
    _ = tf.keras.layers.Dense(units=1, 
                              activation='sigmoid')(_)
    
    return tf.keras.Model(inputs=[continuous_inputs, *discrete_inputs], 
                       outputs=_, name="house_price_model")

model = build_model()
# model.summary()
# tf.keras.utils.plot_model(
#     model,
#     show_shapes=True,
#     show_dtype=True,
#     show_layer_names=True,
# )

2021-10-02 07:48:08.911921: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-02 07:48:08.914899: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-10-02 07:48:08.914941: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-02 07:48:08.914967: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (35bdf45af6aa): /proc/driver/nvidia/version does not exist
2021-10-02 07:48:08.915286: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

In [9]:
# Train deep learning model
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(
    train[cols].values, train["SalePrice"].values, test_size=0.2, random_state=0)
X_train = [X_train[:, -len(cols_continuous):], *[X_train[:, i] for i in range(len(cols_categorical))]]
X_val = [X_val[:, -len(cols_continuous):], *[X_val[:, i] for i in range(len(cols_categorical))]]
X_test = [test[cols].values[:, -len(cols_continuous):], *[test[cols].values[:, i] for i in range(len(cols_categorical))]]

model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mean_squared_error'],
)

history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=30,
    verbose=1,
    validation_data=(X_val, Y_val),
)

val_pred = model(X_val).numpy().reshape((-1,))
score = np.corrcoef(val_pred, Y_val)[0, 1] ** 2
print('R^2:', score)
prediction_tf = model(X_test).numpy().reshape((-1,))

2021-10-02 07:48:09.348903: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-02 07:48:09.361455: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2200150000 Hz


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
R^2: 0.7100372944030391


In [10]:
# De-normalize predictions and generate submission
prediction = prediction_sk
prediction = np.exp(prediction * (price_max - price_min) + price_min)
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': prediction})
submission.to_csv('submission.csv', index=False)