# Property price prediction - 2nd approach

## Preparing libraries

In [1]:
import os

import pandas as pd
import numpy as np
np.random.seed(0)

import qgrid
import missingno as msno

from sklearn.metrics import mean_absolute_error
from tqdm import tqdm 
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score, cross_validate

import xgboost as xgb
import gc
import eli5
import mlflow

## MLflow

In [2]:
def get_or_create_experiment(name):
    """Return the MLflow experiment called ``name``, creating it if it is missing.

    The lookup is done with ``mlflow.get_experiment_by_name``; when that
    returns ``None`` the experiment is created and looked up a second time so
    that the same kind of object is returned in both cases.
    """
    found = mlflow.get_experiment_by_name(name)
    if found is not None:
        return found

    # Nothing registered under this name yet: create it, then fetch it again.
    mlflow.create_experiment(name)
    return mlflow.get_experiment_by_name(name)

def _eid(name):
    """Return the ``experiment_id`` of the experiment ``name`` (created on demand)."""
    experiment = get_or_create_experiment(name)
    return experiment.experiment_id

In [3]:
# Make sure the experiment exists; the returned object is shown as the cell output.
get_or_create_experiment("dwsolution_property")

<Experiment: artifact_location='file:///home/jovyan/dwsolutions/property_warsaw/notebooks/mlruns/1', experiment_id='1', lifecycle_stage='active', name='dwsolution_property', tags={}>

## Reading data

In [3]:
ls ../input/

big_train_warsaw_property.h5  [0m[01;36mtrain_data.h5[0m@
test_warsaw_property.h5       train_warsaw_property.h5


In [4]:
# Read the train and test files and stack them into one frame, so the same
# transformations can later be applied to both at once.
frames = [
    pd.read_hdf(path)
    for path in ("../input/train_warsaw_property.h5", "../input/test_warsaw_property.h5")
]
df = pd.concat(frames)

# The individual frames are no longer needed; release them.
del frames
gc.collect()

df.shape

(60222, 53)

In [5]:
# Column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60222 entries, 27731 to 92766
Data columns (total 53 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           60222 non-null  int64  
 1   location                     60222 non-null  object 
 2   is_private                   60222 non-null  int8   
 3   piekarnik                    60222 non-null  bool   
 4   garaż                        60222 non-null  bool   
 5   monitoring / ochrona         60222 non-null  bool   
 6   rolety antywłamaniowe        60222 non-null  bool   
 7   kuchenka                     60222 non-null  bool   
 8   taras                        60222 non-null  bool   
 9   balkon                       60222 non-null  bool   
 10  ogródek                      60222 non-null  bool   
 11  dwupoziomowe                 60222 non-null  bool   
 12  system alarmowy              60222 non-null  bool   
 13  pom. użytkow

## Data analysis

In [6]:
# Raw `area` values are strings such as "42,93 m²" (decimal comma, unit suffix).
df.area

27731    42,93 m²
40398    43,17 m²
35812       46 m²
48052    48,60 m²
36421       77 m²
           ...   
92751      100 m²
92754       49 m²
92756      150 m²
92765      142 m²
92766    69,45 m²
Name: area, Length: 60222, dtype: object

In [8]:
# Preview of the numeric area: keep the text before the unit, drop spaces,
# switch the decimal comma to a point, then cast.
(
    df["area"]
    .map(lambda raw: raw.split("m")[0].replace(" ", "").replace(",", "."))
    .astype("float")
)

27731     42.93
40398     43.17
35812     46.00
48052     48.60
36421     77.00
          ...  
92751    100.00
92754     49.00
92756    150.00
92765    142.00
92766     69.45
Name: area, Length: 60222, dtype: float64

In [9]:
# Raw `floor` labels mix digit strings with words ("parter", "poddasze", "suterena") and "> 10".
df["floor"].value_counts()

1           12759
parter      10107
2           10069
3            8983
4            5895
5            1805
6            1159
7             825
8             638
10            540
> 10          533
9             430
poddasze       89
suterena       16
Name: floor, dtype: int64

In [10]:
# Frequency of each building-material label (encoded later as `build_material_cat`).
df["materiał budynku"].value_counts()

cegła              14846
inne                3469
wielka płyta        3394
pustak              2961
silikat             2047
żelbet               655
beton komórkowy      413
beton                346
drewno                47
keramzyt              16
Name: materiał budynku, dtype: int64

In [11]:
# Frequency of each window label (encoded later as `window_cat`).
df["okna"].value_counts()

plastikowe    35873
drewniane      4211
aluminiowe      317
Name: okna, dtype: int64

In [12]:
# Frequency of each finishing-state label (encoded later as `property_completion_cat`).
df["stan wykończenia"].value_counts()

do zamieszkania    16049
do wykończenia     13839
do remontu          3158
Name: stan wykończenia, dtype: int64

In [13]:
# Frequency of each building-type label (encoded later as `property_type_cat`).
df["rodzaj zabudowy"].value_counts()

blok                27282
apartamentowiec     12047
kamienica            7190
dom wolnostojący     1124
szeregowiec          1067
plomba                101
loft                   43
Name: rodzaj zabudowy, dtype: int64

In [14]:
# Frequency of each ownership-form label (encoded later as `own_property_cat`).
df["forma własności"].value_counts()

pełna własność               34411
spółdzielcze własnościowe     3034
spółdzielcze wł. z kw         1901
udział                         124
Name: forma własności, dtype: int64

In [15]:
# Raw values look like " (z 4)"; a few are implausibly large (e.g. 2017, 52019).
df["floors_in_building"].value_counts()

 (z 4)        15665
 (z 3)        11688
 (z 2)         7423
 (z 5)         4725
 (z 1)         3096
 (z 10)        2752
 (z 6)         2156
 (z 7)         1837
 (z 8)         1666
 (z 11)         920
 (z 9)          482
 (z 12)         308
 (z 15)         202
 (z 13)         196
 (z 14)         190
 (z 16)         186
 (z 17)         168
 (z 18)          48
 (z 21)          40
 (z 19)          18
 (z 26)           9
 (z 30)           9
 (z 32)           9
 (z 44)           7
 (z 28)           5
 (z 27)           5
 (z 22)           5
 (z 52)           5
 (z 24)           4
 (z 25)           4
 (z 23)           4
 (z 36)           3
 (z 2017)         2
 (z 38)           2
 (z 43)           1
 (z 62)           1
 (z 50)           1
 (z 1999)         1
 (z 52019)        1
 (z 51)           1
 (z 49)           1
 (z 20)           1
 (z 142)          1
Name: floors_in_building, dtype: int64

In [16]:
# Keep whatever follows the last "z" (minus the closing parenthesis) and cast
# it to float; missing values are encoded as -1.
df["floors_in_building_num"] = (
    df["floors_in_building"]
    .map(lambda raw: -1 if str(raw) == "nan" else str(raw).rsplit("z", 1)[-1].replace(")", ""))
    .astype("float")
)
df["floors_in_building_num"].value_counts()

 4.0        15665
 3.0        11688
 2.0         7423
-1.0         6374
 5.0         4725
 1.0         3096
 10.0        2752
 6.0         2156
 7.0         1837
 8.0         1666
 11.0         920
 9.0          482
 12.0         308
 15.0         202
 13.0         196
 14.0         190
 16.0         186
 17.0         168
 18.0          48
 21.0          40
 19.0          18
 32.0           9
 26.0           9
 30.0           9
 44.0           7
 52.0           5
 27.0           5
 28.0           5
 22.0           5
 25.0           4
 24.0           4
 23.0           4
 36.0           3
 2017.0         2
 38.0           2
 20.0           1
 49.0           1
 62.0           1
 43.0           1
 1999.0         1
 142.0          1
 50.0           1
 52019.0        1
 51.0           1
Name: floors_in_building_num, dtype: int64

In [17]:
# Collapse every value that is not below 20 (including the outliers seen above)
# into the single bucket 25.
df["floors_in_building_num_norm"] = df["floors_in_building_num"].map(
    lambda n_floors: 25 if not n_floors < 20 else n_floors
)
df["floors_in_building_num_norm"].value_counts()

 4.0     15665
 3.0     11688
 2.0      7423
-1.0      6374
 5.0      4725
 1.0      3096
 10.0     2752
 6.0      2156
 7.0      1837
 8.0      1666
 11.0      920
 9.0       482
 12.0      308
 15.0      202
 13.0      196
 14.0      190
 16.0      186
 17.0      168
 25.0      122
 18.0       48
 19.0       18
Name: floors_in_building_num_norm, dtype: int64

## Feature engineering

In [18]:
def feature_engineering(df, eur_to_pln=4):
    """Derive model features from the raw listing columns.

    New columns are assigned directly onto the frame that is passed in until
    the one-hot step rebinds ``df`` through ``pd.concat``; use the returned
    frame, and pass a copy if the input must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``area``, ``price``,
        ``location``, ``floor``, ``floors_in_building``, ``rok budowy``,
        ``czynsz`` and the six categorical columns listed in ``cat_feats``.
    eur_to_pln : float, default 4
        Multiplier applied to ``czynsz`` values that contain ``eur``
        (presumably a rough EUR -> PLN rate; the value 4 was hard-coded
        before).

    Returns
    -------
    pandas.DataFrame
        The input columns plus the derived numeric, label-encoded and
        one-hot encoded columns.
    """

    # "42,93 m²" -> 42.93: text before the unit, no spaces, decimal point.
    df["area_num"] = df["area"].map(lambda x: x.split("m")[0].replace(" ", "").replace(",", ".")).astype("float")
    # Cap the area at its 99th percentile.
    area_num_99 = np.percentile(df["area_num"], 99)
    df["area_norm"] = df["area_num"].map(lambda x: x if x <= area_num_99 else area_num_99)

    df["area_num_log"] = np.log(df["area_num"])
    df["price_m2"] = df["price"] / df["area_num"]

    # loc0..loc4: i-th element of `location`, or -1 when it has fewer elements.
    for i in range(5):
        df["loc{}".format(i)] = df["location"].map(lambda x: x[i] if len(x) > i else -1)

    # Median price and price per m2 for each value of loc0/loc1/loc2.
    # NOTE(review): these are computed from `price` over every row passed in,
    # so they carry target information - confirm this is intended for validation.
    agg_funcs = ["median"]
    for grp_feat in ["price", "price_m2"]:
        for loc_num in ["loc0", "loc1", "loc2"]:
            loc_grp = df[[grp_feat, loc_num]].groupby(loc_num).agg(agg_funcs).to_dict()
            for agg_func in agg_funcs:
                df["{0}_{1}_{2}".format(loc_num, grp_feat, agg_func)] = df[loc_num].map(loc_grp[(grp_feat, agg_func)])

    df["price_median"] = df["area_norm"] * df["loc1_price_m2_median"]

    # Word labels get fixed codes, digit strings are cast as-is, missing -> -10.
    floors_dict = {"parter": 0, "> 10": 11, "poddasze": -2, "suterena": -1}
    df["floor_num"] = df["floor"].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype("int")

    # " (z 4)" -> 4.0; missing -> -1; everything not below 20 goes into bucket 25.
    df["floors_in_building_num"] = df["floors_in_building"].map(lambda x: str(x).split("z")[-1].replace(")", "") if str(x) != "nan" else -1).astype("float")
    df["floors_in_building_num_norm"] = df["floors_in_building_num"].map(lambda x: x if x < 20 else 25)

    df["build_year"] = df["rok budowy"].fillna(-1).astype("int")

    def parse_rental(raw):
        # Missing rent is encoded as -1.
        if str(raw) == "nan":
            return -1.0
        text = str(raw).split("zł")[0].replace(" ", "").replace(",", ".")
        if "eur" in text:
            # Convert to a number first, then scale. Multiplying the string
            # itself would repeat its digits ("100" * 4 == "100100100100").
            return float(text.replace("eur", "")) * eur_to_pln
        return float(text)

    df["rental"] = df["czynsz"].map(parse_rental).astype("float")

    # categorical: raw column name -> name of the encoded feature
    cat_feats = {
        "materiał budynku": "build_material_cat",
        "okna": "window_cat",
        "stan wykończenia": "property_completion_cat",
        "rodzaj zabudowy": "property_type_cat",
        "ogrzewanie": "property_heating_cat",
        "forma własności": "own_property_cat"
    }

    for feat_name, feat_new_name in cat_feats.items():
        # Integer label encoding.
        df[feat_new_name] = df[feat_name].factorize()[0]

        # One-hot encoding, columns prefixed with the encoded feature name.
        df_dummies = pd.get_dummies(df[feat_name])
        df_dummies.columns = ["{0}_{1}".format(feat_new_name, x) for x in df_dummies.columns]
        df = pd.concat([df, df_dummies], axis=1)

    return df

In [19]:
# Work on a copy: feature_engineering assigns new columns onto the frame it receives.
df_fe = feature_engineering(df.copy())

## Preparing model

In [20]:
# Identifier and target-like columns must never be fed to the model.
black_list = ["id", "price", "price_m2", "price_median"]

# Every remaining numeric or boolean column becomes a feature.
candidate_cols = df_fe.select_dtypes(["number", "boolean"]).columns
feats = [col for col in candidate_cols if col not in black_list]

print(feats)
len(feats)

['is_private', 'piekarnik', 'garaż', 'monitoring / ochrona', 'rolety antywłamaniowe', 'kuchenka', 'taras', 'balkon', 'ogródek', 'dwupoziomowe', 'system alarmowy', 'pom. użytkowe', 'klimatyzacja', 'tarasy', 'teren zamknięty', 'internet', 'winda', 'telefon', 'pralka', 'piwnica', 'ochrona', 'telewizja kablowa', 'telewizor', 'lodówka', 'domofon / wideofon', 'oddzielna kuchnia', 'zmywarka', 'garaż/miejsce parkingowe', 'meble', 'drzwi / okna antywłamaniowe', 'plan zagospodarowania:', 'rooms', 'is_primary_market', 'floors_in_building_num', 'floors_in_building_num_norm', 'area_num', 'area_norm', 'area_num_log', 'loc0_price_median', 'loc1_price_median', 'loc2_price_median', 'loc0_price_m2_median', 'loc1_price_m2_median', 'loc2_price_m2_median', 'floor_num', 'build_year', 'rental', 'build_material_cat', 'build_material_cat_beton', 'build_material_cat_beton komórkowy', 'build_material_cat_cegła', 'build_material_cat_drewno', 'build_material_cat_inne', 'build_material_cat_keramzyt', 'build_materia

86

In [21]:
# Rows with a price are used for fitting; rows without one get predictions later.
has_price = df_fe["price"].notnull()
train = df_fe[has_price]
test = df_fe[~has_price].copy()  # copied because a `price` column is assigned to it later

X_train, y_train = train[feats].values, train["price"].values
X_test = test[feats].values

print(X_train.shape, X_test.shape)

(13947, 86) (46275, 86)


In [22]:
# Directory for the eli5 report written by the training cell.
!mkdir -p ../outputs

In [23]:
with mlflow.start_run(experiment_id=_eid("dwsolution_property"), run_name="xgboost") as run:

    # Model definition and the parameters that describe this run.
    model = xgb.XGBRegressor(max_depth=8, n_estimators=150, random_state=0)
    mlflow.log_params(model.get_params())
    for param_name, param_value in (
        ("model", str(model).split("(")[0]),
        ("feats", feats),
        ("X_train.shape", X_train.shape),
    ):
        mlflow.log_param(param_name, param_value)

    # Fit on the full training set; this fitted model is reused for prediction.
    model.fit(X_train, y_train)

    # artifacts: feature weights rendered by eli5, saved as HTML and attached to the run
    weights_html = eli5.show_weights(model, feature_names=feats)
    with open("../outputs/eli5.html", "w") as html_file:
        html_file.write("<html>{}</html>".format(weights_html.data))
    mlflow.log_artifact("../outputs/eli5.html", "plot")

    # metrics: mean and standard deviation of every entry returned by cross_validate
    scoring = ["neg_mean_absolute_error", "neg_mean_squared_error",  "neg_median_absolute_error", "r2"]
    result = cross_validate(model, X_train, y_train, scoring=scoring, return_train_score=True, return_estimator=False)
    for key_template, reducer in (("avg_{}", np.mean), ("std_{}", np.std)):
        mlflow.log_metrics({key_template.format(key): reducer(result[key]) for key in result})

"done"

'done'

In [24]:
# `../output` is not created anywhere in this notebook (only `../outputs` is),
# so make sure it exists; otherwise `to_csv` fails on a fresh checkout.
os.makedirs("../output", exist_ok=True)

# Predict the missing prices and write the id/price pairs to CSV.
test["price"] = model.predict(X_test)
test[["id", "price"]].to_csv("../output/xgb_starter2.csv", index=False)