In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Draw every piece of figure text (labels, titles, tick values) in white.
plt.rcParams.update({
    'text.color': 'white',
    'axes.labelcolor': 'white',
    'axes.titlecolor': 'white',
    'xtick.color': 'white',  # x-axis tick values
    'ytick.color': 'white',  # y-axis tick values
})

# tensorflow
import tensorflow as tf
import tensorflow_decision_forests as tfdf 

# feature engineering
from sklearn.feature_selection import mutual_info_regression

import datetime

# Local run: read the competition files from the relative ./data folder.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('data/train.csv', 'data/test.csv'))

# Kaggle run: use these two lines instead of the local paths above.
# train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
# test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

2023-09-05 22:38:20.277579: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the sample submission file; it is filled with this notebook's
# predictions and written back out at the end.
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [3]:
# code for subplots
# data_num = data.select_dtypes(include=['int64', 'float64'])
# data_num.hist(figsize=(16,20), bins=50, xlabelsize=8, ylabelsize=8);

In [4]:
# The label is whatever column comes last in the training frame.
target = train.columns[-1]
train_target = train[target]
# log1p-transformed version of the label.
train_target_log = np.log1p(train_target)

In [None]:
# The raw sale price is somewhat skewed, so the log-transformed target is
# plotted next to it for comparison.
for values, plot_title in ((train_target, 'Original'), (train_target_log, 'Log')):
    plt.figure(figsize=(8, 4))
    sns.histplot(values, color='g', bins=100, kde=True)
    plt.title(plot_title)
    plt.show()

In [None]:
# From here on the model is trained on the log1p-transformed target.
train_target = train_target_log

# Keep the Ids for building the submission; they are identifiers, not
# features, so they are removed from the modelling frames together with
# the label.
train_id = train['Id']
test_id = test['Id']
train.drop(columns=[target, 'Id'], inplace=True)
test.drop(columns='Id', inplace=True)
random_seed = 1

# Feature engineering is applied to train and test together.
# ignore_index=True gives the stacked frame unique row labels; without it
# both halves keep their own 0..n labels and `combined` ends up with
# duplicated index values, which makes label-based alignment ambiguous.
combined = pd.concat([train, test], axis=0, ignore_index=True)

### Feature engineering

In [None]:
# Strategy switches: 1 picks the first option, any other value the second.
cat_handle = 1
num_handle = 1

# Replace each numeric switch with the name of the strategy it selects.
cat_handle = 'FillWithMedian' if cat_handle == 1 else 'GetDummies'
num_handle = 'log1p' if num_handle == 1 else 'BoxCox'

In [None]:
# Column dtypes and non-null counts of the stacked train+test frame.
combined.info()

In [None]:
# Histogram of every int64/float64 column of the stacked frame.
numeric_dtypes = ['int64', 'float64']
data_num = combined.select_dtypes(include=numeric_dtypes)
data_num.hist(
    figsize=(16, 20),
    bins=50,
    xlabelsize=8,
    ylabelsize=8,
);

In [None]:
# MSSubClass is categorical, not numerical; MoSold is cast the same way.
for code_col in ('MSSubClass', 'MoSold'):
    combined[code_col] = combined[code_col].astype('object')

In [None]:
# Ordinal features (features with a ranking) are converted to numbers so
# the model can compare their levels instead of only grouping them.
# A missing value is encoded as 0 in every mapping.

# Shared Po < Fa < TA < Gd < Ex quality scale.
quality_scale = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
col_ord = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
           'GarageQual', 'GarageCond', 'PoolQC', 'FireplaceQu']
# print('Unique values', combined[col_ord].stack().unique())

for col in col_ord:
    combined[col] = combined[col].map(quality_scale)

# Open question from the author: should these codes follow the nominal
# ordinal ranking or the ranking of the mean sale price per level?
bsmt_finish_scale = {np.nan: 0, 'Unf': 5, 'LwQ': 1, 'BLQ': 2,
                     'Rec': 3, 'ALQ': 4, 'GLQ': 6}

# Feature-specific scales, one entry per remaining ordinal column.
ordinal_scales = {
    'BsmtFinType1': bsmt_finish_scale,
    'BsmtFinType2': bsmt_finish_scale,
    'Functional': {np.nan: 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4,
                   'Min2': 5, 'Min1': 6, 'Typ': 7},
    'Fence': {np.nan: 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4},
    'BsmtExposure': {np.nan: 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'GarageFinish': {np.nan: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
}
for ordinal_col, scale in ordinal_scales.items():
    combined[ordinal_col] = combined[ordinal_col].map(scale)

In [None]:
# Visualise how the (log) target is distributed across the levels of one
# encoded feature, to help judge the ranking used for it.
col = 'GarageFinish'
n_train_rows = train_id.shape[0]
X = combined.iloc[:n_train_rows][[col]]  # training rows only
X['SalePrice'] = train_target

plt.figure(figsize=(12, 8))
sns.violinplot(data=X, x=col, y="SalePrice", palette="Set1")

plt.figure(figsize=(12, 8))
sns.boxplot(data=X, x=col, y="SalePrice", palette="Set1")

In [None]:
# Missing-value imputation.
#
# Every fill is written back with `combined[col] = ...`. Calling
# `combined[col].fillna(..., inplace=True)` is chained assignment on a
# selected column: recent pandas versions warn about it, and with
# Copy-on-Write enabled it does not update `combined` at all.


def _fill_with_group_mode(values):
    """Fill the NaNs of one group with that group's most frequent value.

    A group without any non-null value has no mode; it is returned
    unchanged instead of raising on ``mode()[0]``.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# fill with None/0
# NOTE(review): 'GarageFinish' is also numerically encoded by the ordinal
# mapping cell (NaN -> 0); if that cell ran first this fill is a no-op for it.
col_cat_NA = ['Alley', 'GarageType', 'GarageFinish', 'MiscFeature']
for col in col_cat_NA:
    combined[col] = combined[col].fillna('None')

col_num_NA = ['BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']
for col in col_num_NA:
    combined[col] = combined[col].fillna(0)

# based on neighborhood: median of the same neighborhood
col_nul_nbr = ['LotFrontage', 'GarageCars', 'GarageArea']
for col in col_nul_nbr:
    combined[col] = combined.groupby(
        'Neighborhood')[col].transform(lambda x: x.fillna(x.median()))

# based on neighborhood and overall quality: most frequent value in the group
col_cat_nbr_ovrlqual = ['Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
                        'Electrical', 'SaleType']
for col in col_cat_nbr_ovrlqual:
    combined[col] = combined.groupby(
        ['Neighborhood', 'OverallQual'])[col].transform(_fill_with_group_mode)

# based on neighborhood and overall quality: mean of the group
col_num_nbr_ovrlqual = ['MasVnrArea']
for col in col_num_nbr_ovrlqual:
    combined[col] = combined.groupby(
        ['Neighborhood', 'OverallQual'])[col].transform(lambda x: x.fillna(x.mean()))

# based on MSSubClass: most frequent zoning of the same sub-class
combined['MSZoning'] = combined.groupby(
    'MSSubClass')['MSZoning'].transform(_fill_with_group_mode)

# basement surface area from other features: finished (type 1 and 2) plus
# unfinished area
bsmt_area_from_parts = (combined['BsmtFinSF1'] +
                        combined['BsmtFinSF2'] +
                        combined['BsmtUnfSF'])
combined['TotalBsmtSF'] = combined['TotalBsmtSF'].fillna(bsmt_area_from_parts)

# TODO (original author): come back to this later. A missing garage year
# is currently replaced by the year the house was built.
combined['GarageYrBlt'] = combined['GarageYrBlt'].fillna(combined['YearBuilt'])

In [None]:
# Re-inspect dtypes and non-null counts after the imputation step.
combined.info()

In [None]:
# Support cell: list the columns that still contain missing values, with
# their null counts, largest first.
null_counts = combined.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)
# np.unique(combined['Fence'], return_counts=True)
# for col in col_year:
#     display(combined[col].sort_values())

In [None]:
# Derived features. Several of the source columns are dominated by a
# single value (the original author notes about 2882 zeros for 3SsnPorch),
# so they are also summarised as 0/1 presence flags.

# Total living surface: basement + first floor + second floor.
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']

# Has* flag -> area column it is derived from. A flag is 1 when the area
# is greater than zero. The previous `== 0` test was inverted: it set the
# flag to 1 for houses WITHOUT the feature.
presence_flags = {
    'HasWoodDeck': 'WoodDeckSF',
    'HasOpenPorch': 'OpenPorchSF',
    'HasEnclosedPorch': 'EnclosedPorch',
    'HasPool': 'PoolArea',
    'Has3SsnPorch': '3SsnPorch',
}
for flag_col, area_col in presence_flags.items():
    combined[flag_col] = (combined[area_col] > 0) * 1

# Half baths count as 0.5 of a bathroom.
combined['TotalBath'] = (combined['BsmtFullBath'] + combined['BsmtHalfBath'] * 0.5 +
                         combined['FullBath'] + combined['HalfBath'] * 0.5)

# Total porch/deck area. The column name keeps its original spelling
# ('TotalProch') so anything that refers to it by name keeps working.
combined['TotalProch'] = (combined['WoodDeckSF'] + combined['OpenPorchSF'] +
                          combined['EnclosedPorch'] + combined['3SsnPorch'] +
                          combined['ScreenPorch'])

In [None]:
# Correct an inaccurate value: a garage year of 2207 is replaced by 2007.
garage_year_is_2207 = combined['GarageYrBlt'] == 2207
combined.loc[garage_year_is_2207, 'GarageYrBlt'] = 2007

In [None]:
# Replace each calendar-year column by "years before the current year".
# NOTE(review): the reference year comes from the system clock, so the
# resulting numbers depend on when the notebook is executed.
col_year = ['GarageYrBlt', 'YearBuilt', 'YearRemodAdd', 'YrSold']
current_year = datetime.datetime.now().year
for year_col in col_year:
    combined[year_col] = combined[year_col].rsub(current_year)  # current_year - value

### Model design

In [None]:
# Split the engineered frame back into its training and test parts.
# Train rows were stacked first and test rows last, so the split is positional.
n_train_rows = train_id.shape[0]
n_test_rows = test_id.shape[0]

# .copy() makes both parts independent frames. Adding the label column to
# a bare iloc slice of `combined` triggers SettingWithCopyWarning.
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[-n_test_rows:].copy()

# Attach the (log-transformed) label by position, matching the positional
# slice above.
train[target] = train_target.to_numpy()

In [None]:
def train_test_split(data, test_ratio = 0.3, seed = None):
    """Randomly split ``data`` row-wise into a training and a testing part.

    Each row is sent to the testing part independently with probability
    ``test_ratio``, so the resulting sizes are only approximately
    proportional to the ratio.

    Args:
        data: DataFrame (or array) that supports boolean-mask row indexing.
        test_ratio: Probability for a row to land in the testing part.
        seed: Optional integer seed. When given, the split is drawn from a
            dedicated generator and is reproducible. When None, NumPy's
            global generator is used, as before.

    Returns:
        Tuple ``(train_part, test_part)``.
    """
    n_rows = data.shape[0]
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        # Private generator: reproducible and leaves the global state untouched.
        draws = np.random.RandomState(seed).rand(n_rows)
    test_indices = draws < test_ratio
    return data[~test_indices], data[test_indices]

# Seed NumPy's global generator with the notebook-wide seed so the random
# hold-out split is identical on every run (`random_seed` was otherwise unused).
np.random.seed(random_seed)

train_data, test_data = train_test_split(train)
print('{} samples in training data, {} in testing'.format(
    train_data.shape[0],test_data.shape[0]))

In [None]:
# Turn both pandas splits into TensorFlow datasets for a regression task,
# with `target` as the label column.
regression_task = tfdf.keras.Task.REGRESSION
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_data, label=target, task=regression_task)
test_data = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data, label=target, task=regression_task)

Model: TFDF

In [None]:
# Random forest regressor from TensorFlow Decision Forests.
rf = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
# Report mean squared error when the model is evaluated.
rf.compile(metrics='mse')

In [None]:
# Train on the training split (the dataset already carries the label).
rf.fit(x=train_data)

In [None]:
# tfdf.model_plotter.plot_model_in_colab(rf, max_depth=5, tree_idx=0)

In [None]:
# The inspector exposes the trained model's training logs, evaluation and
# variable importances; it is reused by the cells that follow.
inspector = rf.make_inspector()
logs = inspector.training_logs()
display(logs)

In [None]:
# RMSE recorded in the training logs as a function of the number of trees.
tree_counts = [entry.num_trees for entry in logs]
rmse_values = [entry.evaluation.rmse for entry in logs]

plt.plot(tree_counts, rmse_values)
plt.xlabel('# of trees')
plt.ylabel('rmse')
plt.show()

In [None]:
# Evaluation stored with the trained model
# (presumably the out-of-bag evaluation; confirm against the TF-DF docs).
inspector.evaluation()

In [None]:
# Evaluate on the held-out split; return_dict=True returns the metrics as
# a dict, which is then displayed.
evaluation = rf.evaluate(x=test_data, return_dict=True)
display(evaluation)

Random Forest Variable Importance

In [None]:
# Names of the variable-importance metrics available for this model.
inspector.variable_importances().keys()

In [None]:
# First five entries of the SUM_SCORE importance list.
inspector.variable_importances()['SUM_SCORE'][0:5]

In [None]:
# Horizontal bar chart of the ten highest-ranked features for one
# variable-importance metric.
plt.figure(figsize=(12,8))

var_importance_metric = 'SUM_SCORE'
var_importances = inspector.variable_importances()[var_importance_metric]

# Each entry is indexed as (feature, importance); keep the first ten.
top_n = 10
feature_names = [v[0].name for v in var_importances][:top_n]
feature_importance = [v[1] for v in var_importances][:top_n]

# The list is already in descending importance (original author's note),
# so the position in the list is the rank.
feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importance)
plt.yticks(feature_ranks, feature_names)
plt.gca().invert_yaxis()  # most important feature at the top

# Label each bar with its importance at its natural scale. The previous
# "importance/1e12 ... T" format assumed scores in the trillions and
# printed "0T" on every bar for a log-transformed target.
for importance, patch in zip(feature_importance, bar.patches):
    plt.text(patch.get_x() + patch.get_width(), patch.get_y(),
             f"{importance:.4g}", va="top")

plt.xlabel('Importance')
plt.title(var_importance_metric + ' feature importance')
plt.tight_layout()
plt.show()

Predictions

In [None]:
# Preview the engineered test features before predicting.
test.head()

In [None]:
# (rows, columns) of the engineered test frame.
test.shape

In [None]:
# Convert the test features under a NEW name. Rebinding `test` to the
# TensorFlow dataset discarded the DataFrame and made this cell fail when
# run a second time.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test, task=tfdf.keras.Task.REGRESSION)
predictions = rf.predict(test_ds)

# Flatten the predictions to one value per test row.
predicted_log_price = np.ravel(predictions)
assert len(predicted_log_price) == len(test_id), "one prediction per test Id expected"

output = pd.DataFrame({'Id': test_id, 'SalePrice': predicted_log_price})

# The model was trained on log1p(SalePrice); expm1 undoes that transform.
output['SalePrice'] = np.expm1(output['SalePrice'])
output

In [None]:
# Copy the predicted prices into the submission frame, matched on Id.
# Assigning `output['SalePrice']` directly aligns on row labels, which is
# only correct while both frames happen to share the same index and order.
price_by_id = output.set_index('Id')['SalePrice']
has_prediction = submission['Id'].isin(price_by_id.index)
submission.loc[has_prediction, 'SalePrice'] = (
    submission.loc[has_prediction, 'Id'].map(price_by_id))
submission

In [None]:
# Write the filled submission to disk without the row index.
submission.to_csv('data/submission.csv', index=False)