In [1]:
import pandas as pd
import numpy as np
from preproc import apply_dtype_tfrm, INT_DTYPE, FLOAT_DTYPE

In [2]:
# Load the two raw exports. Downstream cells rename the columns (to English),
# cast them to proper dtypes, impute/encode, and finally merge both tables.
raw_houses, raw_sells = (
  pd.read_csv("data/raw_houses.csv"),
  pd.read_csv("data/raw_sells.csv"),
)

In [3]:
# The raw export is transposed (houses are columns), so flip it back and use
# the first transposed row as the header.
houses = raw_houses.T
houses.columns = houses.iloc[0]
houses.index = pd.RangeIndex(len(houses))

# Rename columns through the Old -> New mapping; the header row itself is
# dropped from the data at the same time.
ft_houses = pd.read_csv("data/features_houses.csv", index_col="Old")
houses = (
  houses.iloc[1:]
  .rename(columns=ft_houses["New"])
  .reset_index(drop=True)
)

# Re-key the feature table by the new names before handing it to the dtype
# transform (presumably it looks columns up by their new name — see preproc).
ft_houses = ft_houses.reset_index().set_index("New")
houses = apply_dtype_tfrm(houses, ft_houses)

In [4]:
# Rename columns through the Old -> New mapping stored next to the data.
ft_sells = pd.read_csv("data/features_sells.csv", index_col="Old")
sells = raw_sells.rename(ft_sells["New"], axis="columns")

# Re-key the feature table by the new names, then cast sells to proper dtypes
# (presumably apply_dtype_tfrm looks columns up by their new name — see preproc).
ft_sells = ft_sells.reset_index().set_index("New")
sells = apply_dtype_tfrm(sells, ft_sells)

In [5]:
# Columns of the houses table that are not carried further.
# Names absent from the frame are skipped silently (errors="ignore").
houses_drops = [
  "SqMeterCost",
  "SoldNFlats",
  "SoldFlatsArea",
  "SoldFlatsRubl",
  "SoldNParkSlots",
  "SoldNNonresid",
  "SoldPercent",
  "SeaView",
  "DtInfo",
  "DtPayAcc",
  "DtIns",
  # "Latitude" and "Longitude" are deliberately kept.
  "NrbyCemetery",
  "NrbyNarcoDisp",
]
houses = houses.drop(columns=houses_drops, errors="ignore")

# Columns of the sells table that are not carried further.
sells_drops = [
  "SoldNFlats",
  "SoldNParkSlots",
  "SoldParkSlotsArea",
  "SoldParkSlotsRubl",
  "SoldNNonresid",
  "SoldNonresidArea",
  "SoldNonresidRubl",
  "RowIndex",
  "Region",
  # "InfoMonth" is deliberately kept: a later cell splits it into Month/Year.
]
sells = sells.drop(columns=sells_drops, errors="ignore")

In [6]:
# Keep only Vladivostok (VDK) real estate. Once filtered, the Settlement
# column is constant, so it is dropped. The membership check keeps the cell
# re-runnable after the column is gone.
if "Settlement" in sells.columns:
  in_vladivostok = sells["Settlement"] == "Владивосток"
  sells = sells.loc[in_vladivostok].drop(columns=["Settlement"])

In [7]:
# Avoid division by zero: a zero sold area is turned into a missing value so
# that the division below propagates NaN/NA instead of producing inf.
# The vectorised replace (same idiom as used on SqMeterCost after the merge)
# also copes with already-missing entries, which an element-wise
# `x == 0.0` check cannot do for pd.NA.
sells["SoldFlatsArea"] = (
  sells["SoldFlatsArea"].replace({0.0: np.nan}).astype(FLOAT_DTYPE)
)

# Target variable: price per square metre of the flats sold.
sells["SqMeterCost"] = (
  sells["SoldFlatsRubl"] / sells["SoldFlatsArea"]
).astype(FLOAT_DTYPE)

# Key the table by house and drop rows without a computable target.
sells = sells.set_index("HouseId")
sells = sells.dropna(axis=0, subset=["SqMeterCost"])

In [8]:
# Inspect the distinct "<month> <year>" labels before splitting them.
pd.unique(sells["InfoMonth"])

array(['Апрель 2021', 'Июль 2021', 'Август 2021', 'Сентябрь 2021',
       'Октябрь 2021', 'Ноябрь 2021', 'Декабрь 2021', 'Январь 2022',
       'Февраль 2022', 'Март 2022', 'Апрель 2022', 'Май 2022',
       'Июнь 2022', 'Июль 2022', 'Август 2022', 'Февраль 2021',
       'Март 2021', 'Июнь 2021', 'Январь 2021', 'Май 2021'], dtype=object)

In [9]:
# Split "InfoMonth" ("<month name> <year>") into separate Month / Year columns.
# The membership check keeps the cell re-runnable after InfoMonth is dropped.
if "InfoMonth" in sells.columns:
  # str.extract always yields both columns, so this also works on an empty
  # frame (unpacking `zip(*...)` would raise there). Tokens beyond the first
  # two are ignored; values with fewer than two tokens become missing.
  sells[["Month", "Year"]] = sells["InfoMonth"].str.extract(
    r"^\s*(?P<Month>\S+)\s+(?P<Year>\S+)"
  )
  sells["Year"] = sells["Year"].astype(INT_DTYPE)
  sells = sells.drop(columns="InfoMonth")

In [10]:
# need to merge the two tables using sells[["HouseId", "HouseName"]] and houses[["Name"]]
# some houses["Name"] values have the form "{Name or Address} {HouseId}",
# but a few of them don't have a {HouseId}
# when present, HouseId >= 3062, so it can be cleanly separated from the first part
def pop_house_id(name: str, min_id: int = 3062) -> tuple[str, int]:
  """Split a trailing house id off a house name.

  Names are expected to look like "{Name or Address} {HouseId}", but the id
  may be absent. The last space-separated token is treated as the id only if
  it parses as an integer that is at least ``min_id``; smaller numbers are
  taken to be part of the address (e.g. a building number).

  Args:
    name: Raw house name. Non-string values (missing names) are returned
      unchanged with a missing id.
    min_id: Smallest value accepted as a house id.

  Returns:
    ``(name_without_id, house_id)`` when a valid id is found, otherwise
    ``(name, pd.NA)`` — note the second item is ``pd.NA``, not an int, in
    that case.
  """
  # Missing names (NaN / pd.NA / None) have nothing to split.
  if not isinstance(name, str):
    return name, pd.NA

  head, sep, tail = name.rpartition(" ")
  if not sep:
    # Single token: there is no room for a separate id.
    return name, pd.NA

  try:
    house_id = int(tail)
  except ValueError:
    # Last token is not a number, so it belongs to the name itself.
    return name, pd.NA

  if house_id < min_id:
    return name, pd.NA

  return head, house_id


# Split houses["Name"] into HouseName / HouseId using pop_house_id.
# Background on creating several columns from one mapped column:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
# The membership check keeps the cell re-runnable after Name is dropped.
if "Name" in houses.columns:
  # Building a two-column frame (rather than unpacking `zip(*...)`) keeps
  # this working when `houses` is empty.
  houses[["HouseName", "HouseId"]] = pd.DataFrame(
    houses["Name"].map(pop_house_id).tolist(),
    columns=["HouseName", "HouseId"],
    index=houses.index,
  )
  houses["HouseId"] = houses["HouseId"].astype(INT_DTYPE)
  houses = houses.drop(columns=["Name"], errors="ignore")

In [11]:
# Map houses whose HouseId is missing to valid sells.HouseId values, keyed by
# HouseName. A name listed with several ids yields one copy of the house row
# per id, i.e. identical house features are attached to each of those ids.
na_mapping = {
  "Садгород-357": [44295],
  "Садгород-295": [44295],
  # "Времена года" is left unmapped (the original draft had it commented out
  # with -1) — TODO(review): confirm no sells.HouseId corresponds to it.
  "Восточный ЛУЧ-5": [
    37381,
    37701,
    37703,
    37704,
    37705,
    34275,
    37333,
    36352,
  ],
  "Новые горизонты": [40959, 42989],
  "Басаргина, д. 2": [41333],
  "Басаргина, д. 2, б/с 2 10 эт": [41422],
  "Басаргина, д. 2, б/с 2 18 эт": [41487],
  "Борисенко, д. 100, лит. Е": [38128, 38129],
  "Изумрудный, 1оч": [13283, 13284, 13285, 37526, 37527],
}

missing_house_id = houses["HouseId"].isna()
if missing_house_id.any():
  filled_rows = []
  for house_name, house_ids in na_mapping.items():
    # Only rows that actually lack an id are candidates: a row with the same
    # name that already carries an id must not be cloned under another id.
    unmatched = houses[missing_house_id & (houses["HouseName"] == house_name)]
    if unmatched.empty:
      continue
    filled_rows.extend(
      unmatched.assign(HouseId=house_id) for house_id in house_ids
    )

  # Append everything in one concat instead of growing the frame per name.
  houses = pd.concat([houses, *filled_rows], ignore_index=True)
  # Drops the id-less originals, including names absent from the mapping.
  houses = houses.dropna(axis=0, subset=["HouseId"])

In [12]:
# Drop helper columns before merging; names that are already gone are skipped.
sells = sells.drop(
  columns=["HouseName", "ProjectId", "SoldFlatsArea", "SoldFlatsRubl"],
  errors="ignore",
)

# Without HouseName some house rows may have become exact duplicates; keep one
# of each and renumber the index.
houses = houses.drop(columns="HouseName", errors="ignore").drop_duplicates(
  ignore_index=True
)

In [13]:
# Join sells with houses on "HouseId" (the only key the two tables share) and
# use it as the index of the result. Any other clashing column name coming
# from `houses` would get the "_right" suffix.
df = sells.merge(houses, on="HouseId", suffixes=["", "_right"]).set_index(
  "HouseId", drop=True
)

# A zero price per square metre is treated as missing; rows without a target
# are removed.
df["SqMeterCost"] = df["SqMeterCost"].replace({0.0: np.nan})
df = df.dropna(axis=0, subset="SqMeterCost")

In [14]:
# Persist the merged dataset.
df.to_csv("data/df.csv")

# Persist the column dtypes next to it as a two-column table (Column, Dtype).
# rename / rename_axis return new objects, so nothing derived from `df` is
# mutated in place: the Index of `df.dtypes` may be the very object backing
# `df.columns`, and assigning to its `.name` could rename that axis too.
df_dtypes = df.dtypes.rename("Dtype").rename_axis("Column")
df_dtypes.to_csv("data/df_dtypes.csv")