In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
cat_features = [c for c in df if not pd.api.types.is_numeric_dtype(df[c])]

In [5]:
df.drop(cat_features, axis=1, inplace=True)

In [6]:
imputer = KNNImputer()

In [7]:
X, y = df.drop("SalePrice", axis=1), df["SalePrice"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
X_train_knn = imputer.fit_transform(X_train)
X_test_knn =  imputer.transform(X_test)

In [10]:
X_train_knn = pd.DataFrame(X_train_knn, columns=X_train.columns)

In [11]:
for label, content in X_train_knn.items():
    if content.isna().sum() > 1:
        print(label)

The above block returns nothing because KNN imputer has filled in the missing values

In [39]:
pipe = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5, add_indicator=False)),
    ("regr", CatBoostRegressor(verbose=False))
])

In [40]:
param_grid = {
    "imputer__n_neighbors": [1, 4, 7, 10],
    "imputer__add_indicator": [True, False]
}

In [41]:
gs_regr = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="r2")

In [None]:
gs_regr.fit(X_train, y_train)

I'm going to try put this into a ColumnTransformer and see if I can do a complete workflow

In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")

In [24]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [25]:
df.drop("Id", axis=1, inplace=True)

In [26]:
cat_labels = [c for c in df if not pd.api.types.is_numeric_dtype(df[c])]
num_labes = [c for c in df if pd.api.types.is_numeric_dtype(df[c]) and c != "SalePrice"]

In [27]:
cat_imputer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_imputer = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5, add_indicator=False))
])

In [28]:
preprocessor = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_labels),
    ("num_imputer", num_imputer, num_labes)
], remainder="passthrough")

In [29]:
X, y = df.drop("SalePrice", axis=1), df["SalePrice"]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 79), (1022,), (438, 79), (438,))

In [31]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
709,20,RL,,7162,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,12,2008,WD,Abnorml
1251,120,RL,,3136,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,3,2006,WD,Normal
249,50,RL,,159000,Pave,,IR2,Low,AllPub,CulDSac,...,0,0,,,Shed,500,6,2007,WD,Normal
1125,20,RL,60.0,10434,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,11,2009,WD,Normal
465,120,RM,,3072,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,5,2006,WD,Normal


In [32]:
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

In [33]:
regr = CatBoostRegressor(verbose=False)

In [34]:
regr.fit(X_train_pre, y_train, verbose=False, eval_set=(X_test_pre, y_test), plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [35]:
regr.score(X_test_pre, y_test)

0.858406389841462

In [36]:
y_preds = regr.predict(X_test_pre)

In [37]:
rmsle = mean_squared_log_error(y_test, y_preds)

In [38]:
print(f"RMSLE score of {rmsle:.2f}%")

RMSLE score of 0.02%


Let me try a GridSearchCv for a better value