In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
cat_features = [c for c in df if not pd.api.types.is_numeric_dtype(df[c])]

In [5]:
df.drop(cat_features, axis=1, inplace=True)

In [6]:
imputer = KNNImputer()

In [7]:
X, y = df.drop("SalePrice", axis=1), df["SalePrice"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
X_train_knn = imputer.fit_transform(X_train)
X_test_knn =  imputer.transform(X_test)

In [10]:
X_train_knn = pd.DataFrame(X_train_knn, columns=X_train.columns)

In [11]:
for label, content in X_train_knn.items():
    if content.isna().sum() > 1:
        print(label)

The above block returns nothing because KNN imputer has filled in the missing values

In [39]:
pipe = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5, add_indicator=False)),
    ("regr", CatBoostRegressor(verbose=False))
])

In [40]:
param_grid = {
    "imputer__n_neighbors": [1, 4, 7, 10],
    "imputer__add_indicator": [True, False]
}

In [41]:
gs_regr = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="r2")

In [None]:
gs_regr.fit(X_train, y_train)

I'm going to try put this into a ColumnTransformer and see if I can do a complete workflow

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.drop("Id", axis=1, inplace=True)

In [5]:
cat_labels = [c for c in df if not pd.api.types.is_numeric_dtype(df[c])]
num_labes = [c for c in df if pd.api.types.is_numeric_dtype(df[c]) and c != "SalePrice"]

In [25]:
cat_imputer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_imputer = Pipeline([
    ("imputer", KNNImputer())
])

In [26]:
preprocessor = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_labels),
    ("num_imputer", num_imputer, num_labes)
], remainder="passthrough")

In [27]:
X, y = df.drop("SalePrice", axis=1), df["SalePrice"]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 79), (1022,), (438, 79), (438,))

In [29]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
838,20,RL,75.0,9525,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2008,WD,Normal
722,20,RL,70.0,8120,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,7,2009,WD,Normal
1342,60,RL,,9375,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
117,20,RL,74.0,8536,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,4,2007,New,Partial
151,20,RL,107.0,13891,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,1,2008,New,Partial


In [30]:
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

In [31]:
regr = CatBoostRegressor(verbose=False)

In [32]:
regr.fit(X_train_pre, y_train, verbose=False, eval_set=(X_test_pre, y_test), plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [33]:
regr.score(X_test_pre, y_test)

0.8453969611002491

In [34]:
y_preds = regr.predict(X_test_pre)

In [35]:
rmsle = mean_squared_log_error(y_test, y_preds)

In [36]:
print(f"RMSLE score of {rmsle:.2f}%")

RMSLE score of 0.01%


### Let me try a GridSearchCv for a better value

In [44]:
preprocess_regr = Pipeline([
    ("preprocessor", preprocessor),
    ("regr", CatBoostRegressor(verbose=False))
])

In [45]:
preprocess_regr.fit(X_train, y_train);

In [46]:
preprocess_y_preds = preprocess_regr.predict(X_test)

In [47]:
preprocess_rmsle = mean_squared_log_error(y_test, preprocess_y_preds)

In [48]:
print(f"RMSLE score of {preprocess_rmsle:.2f}%")

RMSLE score of 0.02%


In [49]:
preprocess_regr.score(X_test, y_test)

0.8359183240892957

The score is already really good. 

In [50]:
grid_params = {
    "preprocessor__cat_imputer__imputer__strategy": ["constant", "most_frequent"],
    "preprocessor__num_imputer__imputer__n_neighbors": [1, 3, 5, 7, 10],
    "preprocessor__num_imputer__imputer__add_indicator": [True, False],
    "preprocessor__num_imputer__imputer__weights": ["distance", "uniform"]
}

In [51]:
gs_regr = GridSearchCV(preprocess_regr, grid_params, cv=5, n_jobs=-1, scoring="neg_mean_squared_error")

In [52]:
gs_regr.fit(X_train, y_train);

In [33]:
gs_regr.best_params_

{'preprocessor__cat_imputer__imputer__strategy': 'most_frequent',
 'preprocessor__num_imputer__imputer__add_indicator': False,
 'preprocessor__num_imputer__imputer__n_neighbors': 3}

In [34]:
gs_regr.score(X_test, y_test)

0.9124035024162036

I can't get GridSearchCv to work when the model is independant of the preprocessor. I guess I can make the model with the preprocessor, see what gives me the better score, reinstantiate the model and use something like, Optuna to adjust the hyperparameters.