In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [2]:
data = pd.read_pickle("../data_preprocessed/data.pkl")

In [3]:
data['date'] = pd.to_numeric(pd.to_datetime(data['date']))
# TODO separate days, months as features (more expensive in some months??)

cols_cat = [
 'district',
 'parking']

for c in cols_cat:
    if c in data:
        data[c] = pd.Categorical(data[c])

data.head(3)

Unnamed: 0,price_id,flat_id,price,date,flat_id.1,ad_id,title,date_posted,date_scraped,location,...,num_bathrooms,flat_area,text,description,photos_links,page_address,district,private_owner,parking,price_per_m
4,5,4,390000,1610928000000000000,4,1004683230600910484697409,Nowa kawalerka do wprowadzenia w chmurach 15p,2019-04-04,2021-01-18,"Praga Południe, Warszawa",...,1,26,Garaż,AGENCJE NIERUCHOMOSCI PROSZĘ NIE DZWONIĆ! BEZP...,['https://i.ebayimg.com/00/s/NjAwWDgwMA==/z/m4...,https://www.gumtree.pl/a-mieszkania-i-domy-spr...,praga_pld,1,garage,15000
11,12,11,500000,1611705600000000000,11,1005929025950911462330809,Za gotówkę ! Praga Południe gratka dla inwesto...,2019-09-22,2021-01-27,"Praga Południe, Warszawa",...,1,43,Brak,Mieszkanie dobrze rozplanowane 2 pokoje z osob...,['https://i.ebayimg.com/00/s/ODAwWDU5NA==/z/hn...,https://www.gumtree.pl/a-mieszkania-i-domy-spr...,praga_pld,1,none,11627
12,13,12,590000,1611878400000000000,12,1005939166310911450918709,"50m, Praga Południe (będzie Metro), 2 pokoje d...",2019-09-23,2021-01-29,"Praga Południe, Warszawa",...,1,50,Brak,Zakup tylko za gotówkę - grunt podlega regulac...,['https://i.ebayimg.com/00/s/ODAwWDYwMA==/z/Sn...,https://www.gumtree.pl/a-mieszkania-i-domy-spr...,praga_pld,1,none,11800


In [4]:
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='category')

# cat_transformer = OrdinalEncoder()
cat_transformer = OneHotEncoder()
# Word to vec model - embedding for each description - LSTM --> encoding for the entire sentence, embedding for the whole desc as a new feature
# Pick features: 

scaler = StandardScaler()  # I can use a different one (max scaling ... )
transformer = make_column_transformer(
    (scaler, num_selector),
    (cat_transformer, cat_selector),
)

In [5]:
cols = [
#  'date',
 'num_rooms',
 'num_bathrooms',
 'flat_area',
 'district',
 'private_owner',
 'parking']

In [6]:
y = data['price'].copy()
X = data[cols].copy()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
# make sure it's stratified

X_train_raw = X_train.copy()
X_test_raw = X_test.copy()

print(X_train.shape)

(319622, 6)


In [8]:
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [9]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(319622, 26)
(319622,)
(136982, 26)


In [10]:
X_train_baseline = transformer.fit_transform(X_train_raw[['flat_area', 'district']])
X_test_baseline = transformer.transform(X_test_raw[['flat_area', 'district']])

# Regression

In [11]:
regr = LinearRegression()

regr.fit(X_train_baseline, y_train)
y_pred_regr = regr.predict(X_test_baseline)

print(f"Score: {regr.score(X_test_baseline, y_test)}")
print(f"Regression: {mean_absolute_error(y_test, y_pred_regr)}")

Score: 0.7147678768034906
Regression: 136435.62996296302


# Decision Tree

In [12]:
regr_tree = DecisionTreeRegressor()

regr_tree.fit(X_train, y_train)
y_pred_tree = regr_tree.predict(X_test)

print(f"Score: {regr_tree.score(X_test, y_test)}")
print(f"Tree: {mean_absolute_error(y_test, y_pred_tree)}")

Score: 0.9048254098903495
Tree: 72924.98491280308


# RandomForestRegressor

In [13]:
regr_forest = RandomForestRegressor()
regr_forest.fit(X_train, y_train)

y_pred_forest = regr_forest.predict(X_test)

print(f"Score: {regr_forest.score(X_test, y_test)}")
print(f"Forest: {mean_absolute_error(y_test, y_pred_forest)}")

# SVM

In [None]:
# svm = SVR()
# svm.fit(X_train, y_train)

# y_pred_svm = svm.predict(X_test)

# svm.score(X_test, y_test)

In [None]:
# mean_absolute_error(y_test, y_pred_svm)

# KNN

In [None]:
# knn = KNeighborsRegressor()
# knn.fit(X_train, y_train)

# y_pred_knn = knn.predict(X_test)

# print(f"Score: {knn.score(X_test, y_test)}")
# print(f"KNN: {mean_absolute_error(y_test, y_pred_knn)}")

# Comparison

In [None]:
print(f"Regression: {mean_absolute_error(y_test, y_pred_regr)}")
print(f"Tree: {mean_absolute_error(y_test, y_pred_tree)}")
print(f"Forest: {mean_absolute_error(y_test, y_pred_forest)}")
# print(f"KNN: {mean_absolute_error(y_test, y_pred_knn)}")

Regression: 132170.3150164895
Tree: 70984.78393634623
Forest: 70994.06417752705


In [None]:
X_test_raw['y_true'] = y_test
X_test_raw['y_pred_regr'] = y_pred_regr
X_test_raw['y_pred_tree'] = y_pred_tree
X_test_raw['y_pred_forest'] = y_pred_forest
# X_test_raw['y_pred_knn'] = y_pred_knn

# X_test_raw['diff'] = abs(X_test_raw['y_true'] - X_test_raw['y_pred'])
X_test_raw['diff_prc_regr'] = abs(X_test_raw['y_true'] - X_test_raw['y_pred_regr'])/X_test_raw['y_true']*100
X_test_raw['diff_prc_tree'] = abs(X_test_raw['y_true'] - X_test_raw['y_pred_tree'])/X_test_raw['y_true']*100
X_test_raw['diff_prc_forest'] = abs(X_test_raw['y_true'] - X_test_raw['y_pred_forest'])/X_test_raw['y_true']*100
# X_test_raw['diff_prc_knn'] = abs(X_test_raw['y_true'] - X_test_raw['y_pred_knn'])/X_test_raw['y_true']*100

pd.set_option('display.float_format', lambda x: '%.3f' % x)

X_test_raw_sorted = X_test_raw.sort_values(by=['diff_prc_tree'], ascending=False)

X_test_raw_sorted.head()

Unnamed: 0,num_rooms,num_bathrooms,flat_area,district,private_owner,parking,y_true,y_pred_regr,y_pred_tree,y_pred_forest,diff_prc_regr,diff_prc_tree,diff_prc_forest
25934,2,1,41,srodmiescie,0,garage,58000,689441.273,618419.714,620406.316,1088.692,966.241,969.666
381951,2,1,40,srodmiescie,1,street,57500,677203.969,609020.408,608810.004,1077.746,959.166,958.8
381943,2,1,40,srodmiescie,1,street,57500,677203.969,609020.408,608810.004,1077.746,959.166,958.8
84983,3,2,90,ochota,0,garage,90000,1121091.027,920000.0,920573.68,1145.657,922.222,922.86
300096,2,1,34,mokotow,1,none,49900,492060.959,507209.302,506871.994,886.094,916.452,915.776


In [None]:
list(X_test_raw_sorted)

['num_rooms',
 'num_bathrooms',
 'flat_area',
 'district',
 'private_owner',
 'parking',
 'y_true',
 'y_pred_regr',
 'y_pred_tree',
 'y_pred_forest',
 'diff_prc_regr',
 'diff_prc_tree',
 'diff_prc_forest']

In [None]:
districts = X_test_raw['district'].unique()

for district in districts:
    df_dist = X_test_raw.loc[X_test_raw['district'] == district]

    q_25 = df_dist['y_true'].quantile(0.25) 
    q_50 = df_dist['y_true'].quantile(0.5) 
    q_75 = df_dist['y_true'].quantile(0.75) 
    max_price = max(df_dist['y_true'])

    for q in [q_25, q_50, q_75, max_price]:
        df_dist_price = df_dist.loc[df_dist['y_true'] < q]
        df_dist_price = df_dist_price[['diff_prc_regr', 'diff_prc_tree','diff_prc_forest']]

        stats_unformatted = dict(df_dist_price.mean(axis=0))
        stats = {k: int(v) for k, v in stats_unformatted.items()}

        print(f"{district}, max price: {q}, flats: {len(df_dist_price)}")
        print(stats)
        print("_"*60)

wola, max price: 475000.0, flats: 5914
{'diff_prc_regr': 13, 'diff_prc_tree': 13, 'diff_prc_forest': 13}
____________________________________________________________
wola, max price: 599000.0, flats: 11882
{'diff_prc_regr': 13, 'diff_prc_tree': 12, 'diff_prc_forest': 12}
____________________________________________________________
wola, max price: 752379.0, flats: 17822
{'diff_prc_regr': 14, 'diff_prc_tree': 11, 'diff_prc_forest': 11}
____________________________________________________________
wola, max price: 2224000, flats: 23778
{'diff_prc_regr': 15, 'diff_prc_tree': 11, 'diff_prc_forest': 11}
____________________________________________________________
praga_pld, max price: 475000.0, flats: 3195
{'diff_prc_regr': 21, 'diff_prc_tree': 11, 'diff_prc_forest': 12}
____________________________________________________________
praga_pld, max price: 580000.0, flats: 6352
{'diff_prc_regr': 18, 'diff_prc_tree': 11, 'diff_prc_forest': 11}
_____________________________________________________