In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import (
  is_object_dtype,
  is_numeric_dtype,
)

import sklearn
from sklearn.base import BaseEstimator
from sklearn.preprocessing import (
  MinMaxScaler,
  StandardScaler,
  RobustScaler,
  QuantileTransformer,
)
from sklearn.impute import SimpleImputer
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
  KFold,
  GridSearchCV,
)
from sklearn.metrics import (
  mean_squared_error as mse,
  mean_absolute_error as mae,
)
from sklearn.decomposition import PCA, NMF

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
  RandomForestRegressor,
  GradientBoostingRegressor,
)

import seaborn as sns

from model import *


sklearn.set_config(transform_output="pandas")
rng = np.random.default_rng(424)

# import warnings

# warnings.filterwarnings("ignore", category=DeprecationWarning)

In [11]:
# read dataframe according to the provided dtypes
dtypes = pd.read_csv("data/df_dtypes.csv", index_col="Column")
dtypes_dict = dtypes.to_dict()["Dtype"]
df = (
  pd.read_csv(
    "data/df.csv",
    dtype=dtypes_dict,
  )
  .replace({"Другой": pd.NA})
  .drop(columns="HouseId")
)
targ = "SqMeterCost"

In [12]:
month_ord = [
  "Январь",
  "Февраль",
  "Март",
  "Апрель",
  "Май",
  "Июнь",
  "Июль",
  "Август",
  "Сентябрь",
  "Октябрь",
  "Ноябрь",
  "Декабрь",
]
month_map = {month: idx for idx, month in enumerate(month_ord)}

if "Year" in df.columns and "Month" in df.columns:
  df["MonthYear"] = (df.Year - df.Year.min()) * 12 + df.Month.map(month_map)
  df.drop(columns=["Year", "Month"], inplace=True)

In [13]:
df[targ] = np.log(df[targ])

In [14]:
numr_cols = df.dtypes[df.dtypes.map(is_numeric_dtype)].index.to_list()
bool_cols_pred = lambda col: df[col].min() == 0 and df[col].max() == 1
bool_cols = list(filter(bool_cols_pred, numr_cols))
numr_cols = list(set(numr_cols) - set(bool_cols))
catg_cols = df.dtypes[df.dtypes.map(is_object_dtype)].index.to_list()

numr_cols.sort()
bool_cols.sort()
catg_cols.sort()
cols = catg_cols + bool_cols + numr_cols

nrby_cols = list(filter(lambda col: col.startswith("Nrby"), df.columns))
reach_cols = list(filter(lambda col: col.startswith("Reach"), df.columns))
dist_cols = list(filter(lambda col: col.startswith("Dist"), df.columns))
dist_cols.remove("District")
infr_cols = list(filter(lambda col: col.startswith("Infr"), df.columns))
placement_cols = (
  nrby_cols + reach_cols + dist_cols + infr_cols + ["Latitude", "Longitude"]
)

In [15]:
valid_X = df[
  (df["Month"].isin(["Июнь", "Июль", "Август"])) & (df["Year"] == 2022)
]
train_X = df.drop(valid_X.index)

KeyError: 'Month'

In [None]:
valid_y = valid_X.pop(targ)
train_y = train_X.pop(targ)

In [50]:
def prepare(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
  y = np.log(train[targ])
  X = train.drop(columns=targ)
  
  X = X[cols]

  scaler = MinMaxScaler()
  X.loc[:, numr_cols] = scaler.fit_transform(X[numr_cols])
  X.loc[:, "IfVideoSurvei"] = X["IfVideoSurvei"].fillna(value=0)

  

  placement_imputer = KNNImputer()
  X.loc[:, placement_cols] = placement_imputer.fit_transform(
    X[placement_cols]
  )
  for col in nrby_cols + reach_cols:
    X[col] = np.round(X[col])

  imputer = KNNImputer()
  X.loc[:, bool_cols + numr_cols] = imputer.fit_transform(
    X[bool_cols + numr_cols]
  )
  for col in bool_cols:
    X[col] = np.round(X[col])

  catg_imputers = {}
  lhs = X.drop(columns=catg_cols)
  for col in catg_cols:
    clf = RandomForestClassifier()
    rhs = X[col].dropna()
    clf.fit(lhs.loc[rhs.index], rhs)
    catg_imputers[col] = clf

    na_index = X[col][X[col].isna()].index
    if len(na_index) != 0:
      X.loc[na_index, col] = clf.predict(lhs.loc[na_index])

  if "Year" in X.columns and "Month" in X.columns:
    X["MonthYear"] = (X.Year - X.Year.min()) * 12 + X.Month.map(month_map)
    X.MonthYear /= X.MonthYear.max()
    X.drop(columns=["Year", "Month"], inplace=True)
    catg_cols.remove("Month")
    numr_cols.remove("Year")

  encoder = TargetEncoder()
  X.loc[:, catg_cols] = encoder.fit_transform(X[catg_cols], y)
  for col in catg_cols:
    X[col] = X[col].astype("Float32")
    
  return X, y

In [51]:
train_X, train_y = prepare(train)
valid_X, valid_y = prepare(valid)

In [58]:
model = LinearRegression()
model.fit(train_X, train_y)

In [59]:
pred_y = model.predict(valid_X)

In [60]:
pred_y

array([11.39437382, 11.50834923, 11.47903817, 11.50114635, 11.55669332,
       11.54310168, 11.59940234, 11.60402393, 11.60791888, 11.64673294,
       11.68482036, 11.67122872, 12.24442229, 12.17208774, 11.61878311,
       11.65083867, 11.6766024 , 11.71490218, 11.74066591, 11.7576112 ,
       11.78182553, 11.80032021, 11.82167471, 11.84743844, 11.85368267,
       11.88573823, 11.91150196, 11.93130707, 11.99768337, 12.28682039,
       12.3081749 , 11.83019507, 12.09794087, 11.56740503, 11.62870613,
       11.67481537, 11.69616987, 11.69553206, 11.71688657, 11.94068861,
       11.97772604, 12.00691601, 12.00475213, 11.91060475, 11.89774458,
       11.91909908, 11.94045359, 11.98033986, 11.8396015 , 11.87948777,
       11.92436066, 11.94571517, 11.77621817, 11.82109106, 11.8346101 ,
       11.86380007, 11.89587377, 11.93576004, 11.98063293, 12.00198744,
       12.02334194, 12.04709728, 12.06845179, 12.13503438, 12.15638888,
       12.13943392, 12.16726429, 12.18214293, 12.2102737 , 12.26

In [62]:
mae(np.exp(valid_y), np.exp(pred_y))

9957.10994285542