In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from transformers import *

In [2]:
# Raw inputs. Before these tables can be merged they still need English
# column names, conversion to proper dtypes, imputation and encoding.
houses, sells = map(
  pd.read_csv, ("data/raw_houses.csv", "data/raw_sells.csv")
)

In [3]:
def apply_dtype_trfmer(
  X: pd.DataFrame, features: pd.DataFrame
) -> pd.DataFrame:
  """Send each column of ``X`` through the converter for its declared dtype.

  Args:
    X: Frame whose columns are to be converted.
    features: Feature table with a ``"Dtype"`` column. Its index labels
      are used as the column selectors for ``X``; rows whose ``"Dtype"``
      is ``"int"``, ``"float"``, ``"datetime"`` or ``"boolean"`` pick the
      matching converter, any other value picks none.

  Returns:
    The result of fitting the column transformer on ``X`` and applying it,
    with pandas output requested. Columns selected by no converter are
    passed through.

  Note:
    Per the scikit-learn ``ColumnTransformer`` documentation, output
    columns follow the order of the transformer list, with passed-through
    columns appended last, so the column order of ``X`` is not
    necessarily kept.
  """
  declared = features["Dtype"]

  def columns_declared_as(label: str) -> pd.Index:
    # Index labels of `features` whose declared dtype equals `label`.
    return features.index[declared == label]

  # Resolve every selection first, then instantiate the converters.
  int_cols = columns_declared_as("int")
  float_cols = columns_declared_as("float")
  datetime_cols = columns_declared_as("datetime")
  boolean_cols = columns_declared_as("boolean")

  # Original author's note: these converters parse inconsistent floats and
  # booleans and cast to the appropriate dtypes (they are defined outside
  # this notebook).
  steps = [
    ("int_trfm", IntTransformer(), int_cols),
    ("float_trfm", FloatTransformer(), float_cols),
    ("datetime_trfm", DatetimeTransformer(), datetime_cols),
    ("boolean_trfm", BooleanTransformer(), boolean_cols),
  ]
  converter = ColumnTransformer(
    steps,
    remainder="passthrough",
    verbose_feature_names_out=False,
  ).set_output(transform="pandas")
  return converter.fit_transform(X)

In [4]:
# Feature dictionary for the houses table, keyed by the old column name.
ft_houses = pd.read_csv("data/features_houses.csv", index_col="Old")

# Flip the raw table and use its first row as the header.
# NOTE(review): presumably the raw file stores one feature per row - confirm.
houses = houses.T
houses.columns = houses.iloc[0]
# The positional index is assigned while the header row is still present,
# so after that row is dropped below the index starts at 1.
houses.index = pd.RangeIndex(len(houses))
houses = houses.iloc[1:].rename(columns=ft_houses["New"])

# Re-key the feature dictionary by the new names; the old names stay
# available as a regular column.
ft_houses = ft_houses.reset_index().set_index("New")
houses = apply_dtype_trfmer(houses, ft_houses)

In [5]:
# Feature dictionary for the sells table, keyed by the old column name.
ft_sells = pd.read_csv("data/features_sells.csv", index_col="Old")

# Old -> New column names; this must run while `ft_sells` is still indexed
# by the old names.
sells = sells.rename(ft_sells["New"], axis="columns")

# Re-key the dictionary by the new names so its index matches the renamed
# columns, then convert dtypes.
ft_sells = ft_sells.reset_index().set_index("New")
sells = apply_dtype_trfmer(X=sells, features=ft_sells)