In [1]:
import pandas as pd

# Kaggle location of the competition's training file.
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

# Read the training data and discard the Id column straight away.
df = pd.read_csv(TRAIN).drop(columns='Id')
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# Kaggle location of the competition's test file; Id is kept here because
# the submission written later needs it.
TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test_df = pd.read_csv(TEST)

For this exercise we will use scikit-learn regression models, restricting ourselves to the numerical columns that have no missing values.

In [3]:
TARGET = 'SalePrice'

# Candidate features: every int64/float64 column of the training frame except the target.
keys = [key for key, value in df.drop(columns=[TARGET]).dtypes.to_dict().items() if str(value) in {'int64', 'float64'}]

# Keep a column only when it is complete in BOTH frames: the models are fitted on
# the training frame and used to predict on the test frame, and neither step is
# given any imputation here. (Previously only the test frame was checked.)
# 'LowQualFinSF' is left out explicitly -- NOTE(review): the reason is not
# recorded in this notebook; confirm before re-adding it.
numerical_columns = [
    key for key in keys
    if key != 'LowQualFinSF'
    and df[key].notna().all()
    and test_df[key].notna().all()
]

In [4]:
import warnings
from plotly import express

# Silence FutureWarning noise before plotting.
warnings.filterwarnings('ignore', category=FutureWarning)

# Distribution of the target, one facet per OverallCond value; sorting first
# puts the facets in ascending OverallCond order.
target_hist = express.histogram(df.sort_values('OverallCond'), x=TARGET, facet_col='OverallCond')
target_hist.show()

Fortunately we have no zero or negative values in our training data, but we do have some outliers above about $450k that may cause us some problems.

In [5]:
# One histogram per retained numerical feature, drawn from the training frame.
for feature in numerical_columns:
    feature_hist = express.histogram(df, x=feature)
    feature_hist.show()

Let's look at the correlations between the numerical columns and the target variable.

In [6]:
# Correlate every selected feature (plus the target itself) with the target,
# then transpose so the result is a single-row heatmap.
target_corr = df[[*numerical_columns, TARGET]].corr().loc[:, [TARGET]].T
express.imshow(target_corr)

Before we build any models let's do some dimension reduction and see if we see anything interesting.

In [7]:
import arrow
from umap import UMAP

# Time the whole embedding step, including drawing the figure.
time_start = arrow.now()

# Seeded 2-D UMAP reducer; it is kept in `reducer` after fitting.
reducer = UMAP(
    n_components=2,
    n_epochs=1000,
    n_jobs=1,
    random_state=2024,
    transform_seed=2024,
    verbose=True,
)
embedding = reducer.fit_transform(df[numerical_columns])

# Attach the target by position (as a plain list) so it can colour the scatter.
umap_df = pd.DataFrame(embedding, columns=['x', 'y']).assign(**{TARGET: df[TARGET].tolist()})
express.scatter(umap_df, x='x', y='y', color=TARGET).show()

elapsed = arrow.now() - time_start
print('UMAP done in {}'.format(elapsed))

2024-05-09 13:51:16.939905: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 13:51:16.940028: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 13:51:17.066521: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=1000, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Thu May  9 13:51:26 2024 Construct fuzzy simplicial set
Thu May  9 13:51:29 2024 Finding Nearest Neighbors
Thu May  9 13:51:33 2024 Finished Nearest Neighbor Search
Thu May  9 13:51:36 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Thu May  9 13:51:42 2024 Finished embedding


UMAP done in 0:00:16.134828


It is somewhat but not overwhelmingly encouraging that our higher prices tend to cluster together in our UMAP visualization. Clearly this is a complicated space even after dimension reduction.

Let's add a plot of the test data using the dimension reduction model we built above.

In [8]:
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# Training points: reuse the embedding computed above, minus the target, tagged 'train'.
train_plot_df = umap_df.drop(columns=[TARGET]).assign(data='train')

# Test points: project through the already-fitted reducer and tag them 'test'.
test_embedding = reducer.transform(test_df[numerical_columns])
test_plot_df = pd.DataFrame(test_embedding, columns=['x', 'y']).assign(data='test')

# Stack both sets and plot a seeded sample of 1000 points coloured by origin.
plot_df = pd.concat([train_plot_df, test_plot_df])
overlay_sample = plot_df.sample(n=1000, random_state=2024)
express.scatter(overlay_sample, x='x', y='y', color='data').show()

Epochs completed:   0%|            0/333 [00:00]

	completed  0  /  333 epochs
	completed  33  /  333 epochs
	completed  66  /  333 epochs
	completed  99  /  333 epochs
	completed  132  /  333 epochs
	completed  165  /  333 epochs
	completed  198  /  333 epochs
	completed  231  /  333 epochs
	completed  264  /  333 epochs
	completed  297  /  333 epochs
	completed  330  /  333 epochs


In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def rmse(y_true, y_pred) -> float:
    """Root mean squared error computed on log(1 + value) of both inputs.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like), aligned with ``y_true``.

    Returns:
        The square root of the mean squared difference between
        ``log(1 + y_true)`` and ``log(1 + y_pred)``.
    """
    # Transform each side on its own line: if a NaN or warning shows up, the
    # traceback points at the specific input that caused it.
    left = np.log1p(y_true)
    right = np.log1p(y_pred)
    # Take the square root explicitly instead of passing squared=False, which
    # scikit-learn deprecated and later removed from mean_squared_error.
    return float(np.sqrt(mean_squared_error(y_true=left, y_pred=right)))

# Seeded 80/20 hold-out split of the selected features and the target.
X_train, X_test, y_train, y_test = train_test_split(
    df[numerical_columns],
    df[TARGET],
    test_size=0.20,
    random_state=2024,
)
print('train/test sizes: {}/{}'.format(X_train.shape[0], X_test.shape[0]))

train/test sizes: 1168/292


Let's run through several models using the same train/test split and see which produces the best (lowest) score.

In [10]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# MLP activation; alternatives tried: 'identity', 'logistic', 'tanh'.
ACTIVATION = 'relu'
# Tree split criterion; alternatives tried: 'squared_error', 'friedman_mse', 'absolute_error'.
CRITERION = 'poisson'

# Candidate regressors keyed by display name. Every argument is spelled out
# so the configuration is visible at a glance.
MODELS = {
    'support vector': SVR(
        kernel='rbf', degree=3, gamma='scale', coef0=0.0,
        tol=1e-3, C=1.0, epsilon=0.1,
        shrinking=True, cache_size=200, verbose=False, max_iter=-1,
    ),
    'ridge +': Ridge(
        tol=1e-4, random_state=2024, max_iter=10000, positive=True, solver='lbfgs',
    ),
    'neural network': MLPRegressor(
        hidden_layer_sizes=(400, 200, 100),
        activation=ACTIVATION, solver='adam', alpha=1e-3, batch_size='auto',
        learning_rate='adaptive', learning_rate_init=1e-2, power_t=0.5,
        max_iter=1000, shuffle=True, random_state=2024,
        tol=1e-5, verbose=False, warm_start=False,
        momentum=0.8, nesterovs_momentum=True, early_stopping=False,
        beta_1=0.9, beta_2=0.999, epsilon=1e-08,
        n_iter_no_change=20, max_fun=15000,
    ),
    'tree': DecisionTreeRegressor(
        criterion=CRITERION, splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
        max_features=None, random_state=2024, max_leaf_nodes=None,
        min_impurity_decrease=0.0, ccp_alpha=0.0,
    ),
}

# Fit each candidate on the hold-out split from above and print its score
# (lower is better) next to the model's name.
for name, model in MODELS.items():
    model.fit(X_train, y_train)
    holdout_pred = model.predict(X_test)
    print('{:5.4f} {}'.format(rmse(y_true=y_test, y_pred=holdout_pred), name))

0.4025 support vector
0.1836 ridge +
0.1961 neural network
0.2212 tree


Now let's do the same thing but over a range of train/test splits to see if our scores are at all stable.

In [11]:
from plotly import express

# Collects one (mean score, model name) tuple per model.
mean_scores = []
for name, model in MODELS.items():
    scores = []
    # Refit and score the same model on ten differently seeded 80/20 splits.
    for seed in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            df[numerical_columns], df[TARGET], test_size=0.20, random_state=seed,
        )
        model.fit(X=X_train, y=y_train)
        scores.append(rmse(y_true=y_test, y_pred=model.predict(X=X_test)))

    # Per-split score alongside its running (expanding) mean.
    scores_df = pd.DataFrame({'score': scores})
    scores_df['mean'] = scores_df['score'].expanding().mean()

    title = 'score: {:5.4f} model: {}'.format(sum(scores) / len(scores), name)
    express.line(scores_df, y=['score', 'mean'], title=title).show()

    # The last running mean is the mean over all ten splits.
    mean_scores.append((scores_df['mean'].iloc[-1], name))

The score is not especially stable from split to split, but the mean score does a good job of ordering the models. Given how variable our data is, we should probably average over several splits when choosing the model for our submission.

In [12]:
from sklearn.tree import DecisionTreeRegressor

SUBMISSION = '/kaggle/working/submission.csv'

# mean_scores holds (mean score, name) tuples, so an ascending sort puts the
# lowest-scoring (best) model first.
best_score, best_model_name = sorted(mean_scores)[0]

# Refit the winner on the full training frame, then predict the test frame.
# Despite its name, y_tree_pred holds predictions from whichever model won.
best_model = MODELS[best_model_name]
best_model.fit(X=df[numerical_columns], y=df[TARGET])
y_tree_pred = best_model.predict(X=test_df[numerical_columns])

submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_tree_pred})
submission_df.to_csv(SUBMISSION, index=False)

In [13]:
# Show how strongly each feature drives the winning model, when the model
# exposes that information.
winning_model = MODELS[best_model_name]
if hasattr(winning_model, 'feature_importances_'):
    # Tree-based estimators expose importances directly.
    feature_weights = winning_model.feature_importances_
elif hasattr(winning_model, 'coef_'):
    # Linear estimators (e.g. Ridge) expose fitted coefficients instead;
    # previously a winning Ridge failed here with an AttributeError.
    feature_weights = winning_model.coef_
else:
    # Neither attribute is available, so there is nothing to chart.
    feature_weights = None

if feature_weights is None:
    print('{} does not expose feature importances or coefficients'.format(best_model_name))
else:
    # The title must be text, so pass the model's name rather than the
    # estimator object. .show() is required because a figure created inside
    # an if/else body is not displayed automatically by the notebook.
    express.histogram(
        x=winning_model.feature_names_in_,
        y=feature_weights,
        title=best_model_name,
    ).show()