In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import (
  is_numeric_dtype,
  is_integer_dtype,
  is_float_dtype,
)

import sklearn
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import (
  MinMaxScaler,
  StandardScaler,
  RobustScaler,
  LabelEncoder,
  # OrdinalEncoder,
  TargetEncoder,
)
from category_encoders import (
  BinaryEncoder,
  OrdinalEncoder,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import (
  mean_squared_error as mse,
  mean_absolute_error as mae,
)

import matplotlib.pyplot as plt
import seaborn as sns

# Global configuration: sklearn transformers are asked for pandas output, and a
# seeded NumPy random generator is created for the notebook.
RANDOM_SEED = 424
sklearn.set_config(transform_output="pandas")
rng = np.random.default_rng(seed=RANDOM_SEED)

In [3]:
# Load the dataset with the column dtypes stored in a companion CSV.
# The two date columns are parsed via `parse_dates`, so they are removed from
# the explicit dtype mapping first.
date_cols = ["DtPayAcc", "DtIns"]
df_dtypes = pd.read_csv("data/df_dtypes.csv", index_col="Column")
df_dtypes_dict = df_dtypes["Dtype"].to_dict()
for date_col in date_cols:
  # pop() without a default raises KeyError for an absent key, just like `del`
  df_dtypes_dict.pop(date_col)

df = pd.read_csv(
  "data/df.csv",
  index_col="HouseId",
  dtype=df_dtypes_dict,
  parse_dates=date_cols,
)
# every "Другой" value in the frame is turned into a missing value
df = df.replace({"Другой": pd.NA})

In [None]:
# Remove four columns from the frame.
# The result is reassigned instead of using `inplace=True`, so the cell reads as
# an explicit "old frame -> new frame" step rather than a hidden mutation.
cols_to_drop = ["CemeteryNrby", "NarcoDispNrby", "Latitude", "Longitude"]
df = df.drop(columns=cols_to_drop)

1. Sort the columns (`remainder="passthrough"` in `ColumnTransformer` requires an identical column order).
2. Encode the categorical columns with simple label encoding (`KNNImputer` requires numeric input).
3. Scale the numerical columns (as `KNNImputer` requires).
4. Impute the missing values with `KNNImputer`.
5. Scale the numerically encoded categoricals back to their original range.
6. Round the numerically encoded categoricals to the nearest possible value.
7. Encode the numerically encoded categoricals with their final encoders.

In [44]:
# --- categorical features, grouped by the encoding planned for them ---
# target encoding
cat_tar_enc_cols = ["CompanyName"]
# ordinal encoding
cat_ord_enc_cols = ["District", "HouseCatg"]
# binary encoding
cat_bin_enc_cols = [
  # NOTE(review): an earlier note here said "too many NAs, drop it", yet the
  # columns are still listed and therefore kept — confirm the intent.
  "FacadeMainMatrl",  # "Другой" - 42%
  "FacadeAuxlMatrl",  # "Другой" - 27%
  "FacadeType",
  "HouseType",
]
cat_cols = [*cat_tar_enc_cols, *cat_ord_enc_cols, *cat_bin_enc_cols]

# --- numerical features (to be scaled) ---
# every numeric-dtype column except the three removed below, sorted by name
num_cols = [col for col, dtype in df.dtypes.items() if is_numeric_dtype(dtype)]
for excluded_col in ("SoldFlatsArea", "SoldFlatsRubl", "SqMeterCost"):
  # list.remove raises ValueError if the column was not among the numeric ones
  num_cols.remove(excluded_col)
num_cols = sorted(num_cols)

# --- boolean-like features (not scaled) ---
# a numeric column counts as boolean when all of its values lie within [0, 1]
bool_cols = sorted(
  col for col in num_cols if 0 <= df[col].min() and df[col].max() <= 1
)
num_cols = [col for col in num_cols if col not in bool_cols]

# target column also should be scaled (logarithm)
target_col = ["SqMeterCost"]

In [45]:
# Fix the column order: target first, then categorical, numerical and boolean
# features. Columns that appear in none of these lists are left out of `df`.
sorted_cols = [*target_col, *cat_cols, *num_cols, *bool_cols]
df = df.loc[:, sorted_cols]

In [46]:
# Preprocessing pipeline: label-encode categoricals -> min-max scale -> KNN-impute.

# category_encoders' OrdinalEncoder defaults to handle_missing="value", which
# encodes NaN as just another integer category. The imputer at the end of the
# pipeline would then find nothing to impute in the categorical columns.
# "return_nan" keeps missing categories as NaN through the encoding step.
first_enc = OrdinalEncoder(handle_missing="return_nan")

# Min-max scale the encoded categorical and the numerical columns; all other
# columns are passed through unchanged and keep their plain names.
# NOTE(review): the passed-through columns (including the unscaled target) are
# still handed to KNNImputer below — confirm that they are meant to take part
# in the neighbour search.
scaler = ColumnTransformer(
  [("scaler", MinMaxScaler(), cat_cols + num_cols)],
  remainder="passthrough",
  verbose_feature_names_out=False,
)

pipe = Pipeline(
  [
    ("first_enc", first_enc),
    ("scaler", scaler),
    ("imputer", KNNImputer(missing_values=np.nan)),
  ]
)

piped_df = pipe.fit_transform(df)

In [47]:
def crossval(
  model,
  n_folds: int,
  X: pd.DataFrame,
  y: pd.Series,
  metrics=None,
  random_state=None,
) -> np.ndarray:
  """Evaluate `model` with shuffled K-fold cross-validation.

  For every fold the model is fitted on the training rows and scored on the
  held-out rows with each metric. The same `model` instance is refitted on
  every fold.

  Parameters
  ----------
  model
    Object exposing `fit(X, y)` and `predict(X)`.
  n_folds
    Number of folds passed to `KFold`.
  X, y
    Features and target; rows are selected positionally via `.iloc`.
  metrics
    Optional list of dicts with keys `"name"` (field name in the result) and
    `"func"` (callable invoked as `func(y_true, y_pred)`). When empty or
    None, MSE and MAE are used.
  random_state
    Forwarded to `KFold` to make the shuffled split reproducible. The default
    (None) keeps the previous behaviour of a different split on every call.

  Returns
  -------
  np.ndarray
    Structured array of length `n_folds` with one float field per metric.
  """
  if not metrics:
    metrics = [{"name": "mse", "func": mse}, {"name": "mae", "func": mae}]

  kfold = KFold(n_folds, shuffle=True, random_state=random_state)
  cvscores = np.zeros(
    shape=n_folds, dtype=[(metric["name"], "float") for metric in metrics]
  )

  for i, (train, test) in enumerate(kfold.split(X, y)):
    model.fit(X.iloc[train], y.iloc[train])
    y_pred = model.predict(X.iloc[test])
    y_true = y.iloc[test]
    for metric in metrics:
      cvscores[metric["name"]][i] = metric["func"](y_true, y_pred)

  return cvscores