In [1]:
import pandas as pd

# Kaggle input locations of the competition files.
TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

# The competition set keeps its 'Id' column: it is written into the submission file later.
test_df = pd.read_csv(TEST)
# 'Id' is removed from the training frame so that it is not picked up as a numeric feature.
df = pd.read_csv(TRAIN).drop('Id', axis=1)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
TARGET = 'SalePrice'

# Candidate features: every non-target training column whose dtype is exactly int64 or float64.
dtype_names = df.drop(columns=[TARGET]).dtypes.map(str)
keys = [key for key, dtype_name in dtype_names.items() if dtype_name in {'int64', 'float64'}]

# Count the missing values of each candidate once, in the competition test set.
test_missing = test_df[keys].isna().sum()

# COLUMNS: candidates that are fully populated in the test set.
COLUMNS = [key for key in keys if test_missing[key] == 0]
# knn_columns: candidates with between one and four missing test values.
knn_columns = [key for key in keys if 0 < test_missing[key] < 5]

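The competition scores submissions by the RMSE between the logarithm of the predicted price and the logarithm of the observed price, so we need an RMSE metric computed on log-transformed values.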

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred) -> float:
    """Return the root-mean-squared error between log(1 + y_true) and log(1 + y_pred).

    The computation uses numpy only, so it does not depend on the ``squared``
    keyword of ``sklearn.metrics.mean_squared_error`` (deprecated in
    scikit-learn 1.4 and removed in 1.6).

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values, 1-D or 2-D (one column per output).
    y_pred : array-like of numbers
        Predicted values with the same number of samples and outputs.

    Returns
    -------
    float
        RMSE of the log-transformed values; for 2-D input, the unweighted
        mean of the per-column RMSE values.

    Raises
    ------
    ValueError
        If the inputs are empty, their shapes disagree, or either
        log-transformed input contains NaN or infinity (for example because
        a value is less than or equal to -1).
    """
    # Each side is transformed separately so that a failure caused by
    # non-finite values can be attributed to the offending argument.
    with np.errstate(divide='ignore', invalid='ignore'):
        left = np.log1p(np.asarray(y_true, dtype=float))
        right = np.log1p(np.asarray(y_pred, dtype=float))

    # Treat 1-D input as a single output column so (n,) and (n, 1) agree.
    if left.ndim < 2:
        left = left.reshape(-1, 1)
    if right.ndim < 2:
        right = right.reshape(-1, 1)

    if left.size == 0 or right.size == 0:
        raise ValueError('rmse requires at least one sample')
    if left.shape != right.shape:
        raise ValueError(
            'y_true and y_pred have inconsistent shapes: {} vs {}'.format(left.shape, right.shape))
    if not np.isfinite(left).all():
        raise ValueError('log(1 + y_true) contains NaN or infinity')
    if not np.isfinite(right).all():
        raise ValueError('log(1 + y_pred) contains NaN or infinity')

    per_output_mse = np.mean((left - right) ** 2, axis=0)
    return float(np.mean(np.sqrt(per_output_mse)))

In [4]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the training rows as a hold-out set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    random_state=2024,
    test_size=0.20,
)
print('train/test sizes: {}/{}'.format(X_train.shape[0], X_test.shape[0]))

train/test sizes: 1168/292


In [5]:
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

ACTIVATION = 'identity'
CRITERION = 'absolute_error'

# Ridge regression fitted with the 'lbfgs' solver and positive=True.
ridge_positive = Ridge(
    tol=1e-4,
    random_state=2024,
    max_iter=10000,
    positive=True,
    solver='lbfgs',
)

# Multi-layer perceptron with hidden layers of 400, 200 and 100 units, trained with 'adam'.
neural_network = MLPRegressor(
    hidden_layer_sizes=(400, 200, 100,),
    activation=ACTIVATION,
    solver='adam',
    alpha=1e-3,
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=1e-2,
    power_t=0.5,
    max_iter=1000,
    shuffle=True,
    random_state=2024,
    tol=1e-5,
    verbose=False,
    warm_start=False,
    momentum=0.8,
    nesterovs_momentum=True,
    early_stopping=False,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    n_iter_no_change=20,
    max_fun=15000,
)

# Single decision tree with no depth or leaf-count limit, split on CRITERION.
decision_tree = DecisionTreeRegressor(
    criterion=CRITERION,
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=2024,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
)

# Display name -> estimator; the insertion order is the order in which models are evaluated.
MODELS = {
    'ridge +': ridge_positive,
    'neural network': neural_network,
    'tree': decision_tree,
}


# Feature set: the fully-populated numeric columns plus three basement-area columns,
# with any missing values filled in by a KNN imputer.
columns = COLUMNS + ['BsmtFinSF1',  'BsmtUnfSF', 'TotalBsmtSF']
imputer = KNNImputer()
knn_df = imputer.fit_transform(X=df[columns])

# Score every model on two different 80/20 splits and report the mean metric per model.
for name, model in MODELS.items():
    scores = []
    for random_state in range(2):
        X_train, X_test, y_train, y_test = train_test_split(knn_df, df[TARGET], test_size=0.20, random_state=random_state)
        try:
            model.fit(X=X_train, y=y_train)
            score = rmse(y_true=y_test, y_pred=model.predict(X=X_test))
        except ValueError as error:
            # Best effort: a split that cannot be fitted or scored is skipped, but the
            # failure is reported instead of being dropped silently.
            print('skipped : {} (random_state={}): {}'.format(name, random_state, error))
        else:
            scores.append(score)

    if scores:
        print('{:5.4f} : {}'.format(sum(scores)/len(scores), name))
    else:
        # Every split failed: there is nothing to average (this used to raise ZeroDivisionError).
        print('   nan : {} (no split produced a valid score)'.format(name))

0.2100 : ridge +
0.2383 : neural network
0.2106 : tree


In [6]:
SUBMISSION = '/kaggle/working/submission.csv'

# Model used for the final predictions (a key of MODELS).
name = 'ridge +'

# Fit the imputer on the training and competition rows together, then transform each set.
combined_features = pd.concat(objs=[df, test_df])[columns]
imputer = KNNImputer().fit(X=combined_features)
train_features = imputer.transform(X=df[columns])
test_features = imputer.transform(X=test_df[columns])

# Refit the chosen model on the full training data and predict the competition set.
final_model = MODELS[name].fit(X=train_features, y=df[TARGET])
# Despite its name, y_tree_pred holds the predictions of whichever model `name` selects.
y_tree_pred = final_model.predict(X=test_features)

submission = pd.DataFrame(data={'Id': test_df['Id'], 'SalePrice': y_tree_pred})
submission.to_csv(path_or_buf=SUBMISSION, index=False)