In [55]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocess

In [56]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
%autoreload 2
from transformers.preprocessing import CountyTransformer, NoTransformer, RealestateTypeTransformer, PriceOutlierRemoval, PriceMedianMultiplierExtractor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

## Clean input, remove outliers

In [58]:
cleanse_pipe = Pipeline(
    [('group_county', CountyTransformer()),
     ('type', RealestateTypeTransformer()),
     ('z_score_outlier_drop', PriceOutlierRemoval()),
     ('price_median_multiplier', PriceMedianMultiplierExtractor())])
cleaned_data = cleanse_pipe.fit_transform(train_data)

action  estate_type  
RENT    APT                 330.000
        BUSINESS            500.000
        COTT_HOUSE          255.650
        GARAGE               70.000
        HOUSE               487.400
        PART_OF_HOUSE       420.000
        TERR_HOUSE          505.645
SALE    APT               63800.000
        BUSINESS           2162.000
        COTT_HOUSE        35000.000
        FARM              44000.000
        GARAGE             5000.000
        HOUSE             67000.000
        PART_OF_HOUSE     89900.000
        TERR_HOUSE       107000.000
Name: Hind, dtype: float64


In [59]:
cleaned_data = cleaned_data.drop(columns=['Tüüp', 'Kuupäev'])

In [60]:
cleaned_data

Unnamed: 0,id,Tube,Üldpind,Seisukord,Ehitusaasta,Korrus,Korruseid,Maakond,Hind,action,estate_type,year,ad_year,price_median_multiplier
0,586578,3,56.2,Renoveeritud,1967,2,4,harju maakond,34995.00,SALE,APT,13,13,0.548511
1,292379,3,76.9,Uus,2008,2,3,harju maakond,89760.00,SALE,APT,11,11,1.406897
2,864117,3,66.3,Uus,2015,2,6,harju maakond,185000.00,SALE,APT,15,15,2.899687
3,687362,2,46.3,Uus,1990,4,5,tartu maakond,58000.00,SALE,APT,13,13,0.909091
4,497410,1,23.3,Renoveeritud,1930,1,2,tartu maakond,27000.00,SALE,APT,13,13,0.423197
5,974842,2,44.4,Renoveeritud,1980,2,5,harju maakond,67000.00,SALE,APT,15,15,1.050157
6,432344,2,185.0,Uus,2004,1,1,harju maakond,148000.00,SALE,BUSINESS,12,12,68.455134
7,847700,1,34.0,Keskmine,1991,7,9,harju maakond,209.00,RENT,APT,14,14,0.633333
8,490978,4,110.3,Uus,2008,3,6,harju maakond,139000.00,SALE,APT,12,12,2.178683
9,971527,3,65.9,San. remont tehtud,1993,10,12,harju maakond,85000.00,SALE,APT,15,15,1.332288


In [61]:
data = pd.read_csv("data/traindata.csv")
strat_split = train_test_split

##  Interpret data

In [62]:
price = cleaned_data['price_median_multiplier']
cleaned_data = cleaned_data.drop(columns=['price_median_multiplier'])
cleaned_data = cleaned_data.drop(columns=['Hind'])

In [64]:
cleaned_data
price

0          0.548511
1          1.406897
2          2.899687
3          0.909091
4          0.423197
5          1.050157
6         68.455134
7          0.633333
8          2.178683
9          1.332288
10         2.097179
11         0.327586
12         0.387333
13         0.214734
14         0.705329
15         1.082090
16         1.332288
17         1.666667
18         2.005846
19         0.757576
20         1.097022
21         2.741379
22         1.346395
23         1.487461
24         0.205359
25         0.757576
26         1.292258
27         1.060606
28         1.212121
29         0.688088
            ...    
179971     0.395691
179972     0.086207
179973     0.970219
179974     1.252351
179975     0.590909
179976     1.920063
179977     1.212121
179978     0.901254
179979     3.918495
179980     0.313480
179981     0.039185
179982     0.239812
179983     1.905094
179984     1.144201
179985     1.051837
179986     2.034483
179987     1.793132
179988     1.666667
179989     1.175549


In [65]:
cat_list = [['post-USSR constructure',  'Vajab san. remonti','Vajab renoveerimist', 
       'Renoveeritud', 'San. remont tehtud', 'Keskmine', 'Valmis', 'Uus']]

interpret_transformer = ColumnTransformer(
    [('condition', OrdinalEncoder(categories=cat_list), ['Seisukord']),
     ('onehot', OneHotEncoder(), ['action', 'estate_type', 'Maakond'])], sparse_threshold=0, remainder=StandardScaler())

X = interpret_transformer.fit_transform(cleaned_data)

In [66]:
reg = LinearRegression().fit(X, price)

In [67]:
reg.score(X, price)

0.1524724070703446

# Test

In [44]:
test_data = pd.read_csv("data/testData.csv")

In [45]:
cleanse_pipe.transform(test_data)

KeyError: 'Hind'