# Data Wrangling and explorative data analysis

## Imports

In [279]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt
import seaborn as sns


## CSV Import

In [280]:
source_path = Path('data/original/immoscout.csv')
source_df = pd.read_csv(source_path)
clean_df = source_df.copy()

  source_df = pd.read_csv(source_path)


In [281]:
source_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,...,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,type,Space extracted,rooms
0,0,0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,penthouse,100.0,5.0
1,1,1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,terrace-house,156.0,5.0
2,2,2,,,,,,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",...,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,penthouse,93.0,5.0
3,3,3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,detached-house,154.0,5.0
4,4,4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,...,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,flat,142.0,5.0


### Delete Unnamed Columns

In [282]:
clean_df = clean_df.loc[:, ~clean_df.columns.str.contains('^Unnamed')]

### Delete Mostly Empty Columns

In [283]:
mostly_empty_columns = [
    "detail_responsive#municipality",
    "detail_responsive#surface_living",
    "detail_responsive#floor",
    "detail_responsive#available_from",
    "Gemeinde",
    "Wohnfläche",
    "Stockwerk",
    "Nutzfläche",
    "Verfügbarkeit",
    "Grundstücksfläche",
    "detail_responsive#surface_property",
    "detail_responsive#surface_usable",
    "Commune",
    "Surface habitable",
    "Surface du terrain",
    "Surface utile",
    "Disponibilité",
    "Étage",
    "Comune",
    "Superficie abitabile",
    "Disponibilità",
    "Gross return",
    "Piano",
    "Superficie del terreno",
    "Superficie utile"
]

clean_df = clean_df.drop(mostly_empty_columns, axis=1)
clean_df

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,type,Space extracted,rooms
0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,penthouse,100.0,5.0
1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,terrace-house,156.0,5.0
2,,,,,,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,3.549010,6.05,37.0,3092.0,30364.0,33493.0,720000.0,penthouse,93.0,5.0
3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,detached-house,154.0,5.0
4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,1.708126,6.30,65.0,349.0,941.0,1355.0,995000.0,flat,142.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13373,Uster,70 m²,,,On request,"Tägerackerstrasse 138610 Uster, ZH","2.5 rooms, 70 m²«Wunderschöne Attikawohnung an...",DescriptionIn einem der schönsten Quartiere Us...,https://www.immoscout24.ch//en/d/flat-buy-uste...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,1.567019,4.52,181.0,3218.0,12931.0,16330.0,1101000.0,flat,70.0,5.0
13374,Volketswil,,284 m²,,On request,"8604 Volketswil, ZH",8 rooms«Charmantes 2-Familienhaus im idyllisch...,DescriptionWillkommen im Herzen von Volketswil...,https://www.immoscout24.ch//en/d/semi-detached...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,2.368364,4.23,74.0,3406.0,7644.0,11124.0,1750000.0,semi-detached-house,,8.0
13375,Seuzach,150 m²,160 m²,,On request,"8472 Seuzach, ZH","6.5 rooms, 150 m²«EIN PARADIES IM GRÜNEN FÜR G...",DescriptionWir verkaufen dieses unglaublich to...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,1.991008,4.33,56.0,212.0,358.0,626.0,1415000.0,terrace-house,150.0,5.0
13376,Zell (ZH),145 m²,853 m²,140 m²,Immediately,"Vordere Bähntalstrasse 18483 Kollbrunn, ZH","6 rooms, 145 m²«Freistehendes 6.0 Zimmer . Ein...",Description####Freistehendes 6.0 Zimmer . Einf...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,1.257283,4.35,66.0,167.0,624.0,857.0,1465000.0,detached-house,145.0,6.0


### Take Informations from Column "details"

In [284]:
clean_df["rooms_from_details"] = (clean_df["details"].str.extract(r'(\d+) rooms')).astype(float)
clean_df["space_from_details"] = (clean_df["details"].str.extract(r'(\d+) m²')).astype(float)

### Put the Information from column "details" into "rooms" and "Space extracted" if they are nan or 0.0

In [285]:
clean_df["rooms"] = clean_df["rooms"].mask(clean_df["rooms"] == 0.0, clean_df["rooms_from_details"])
clean_df["rooms"] = clean_df["rooms"].fillna(clean_df["rooms_from_details"])

clean_df["Space extracted"] = clean_df["Space extracted"].fillna(clean_df["space_from_details"])

### Put the Information from "merged" columns into the other columns if they are nan or 0.0

In [286]:
clean_df["Floor"] = clean_df["Floor"].fillna(clean_df["Floor_merged"])
clean_df["Living space"] = clean_df["Living space"].fillna(clean_df["Living_space_merged"])
clean_df["Floor space"] = clean_df["Floor space"].fillna(clean_df["Floor_space_merged"])
clean_df["Plot area"] = clean_df["Plot area"].fillna(clean_df["Plot_area_merged"])
clean_df["Availability"] = clean_df["Availability"].fillna(clean_df["Availability_merged"])

In [287]:
columns_to_delete = [
    "Floor_merged",
    "Living_space_merged",
    "Floor_space_merged",
    "Plot_area_merged",
    "Availability_merged"
]

clean_df = clean_df.drop(columns_to_delete, axis=1)

### Move m2 to column header

In [288]:
m2_columns = ["Living space", "Floor space", "Plot area"]

for col in m2_columns:
    clean_df[col + "_m2"] = clean_df[col].str[:-3]

### Extract floor information

In [289]:
floor_col = clean_df["Floor"]
floor_col = floor_col.str.replace("Ground", "0.")
floor_col = floor_col.str[:-7]

clean_df["floor"] = floor_col

### Remove redundant columns

In [290]:
redundant_columns = [
    "Municipality",
    "Living space",
    "Floor space",
    "Plot area",
    "location_parsed",
    "details",
    "price",
    "details_structured",
    "index",
    "lat",
    "lon",
    "space_from_details",
    "rooms_from_details",
    "floor"
]

clean_df = clean_df.drop(redundant_columns, axis=1)

### Make naming more consistent

In [291]:
name_mapping = [
    ["Space extracted", "living_space_m2"],
    ["Floor_space_merged_m2", "floor_space_m2"],
    ["Plot_area_merged_m2", "plot_area_m2"],
    ["Availability_merged", "availability"],
    ["Municipality_merged", "municipality"],
    ["price_cleaned", "price"]
]

for name in name_mapping:
    if name[0] not in clean_df.columns:
        continue
    clean_df[name[1]] = clean_df[name[0]].copy()
    clean_df = clean_df.drop(name[0], axis=1)

### Column which are numeric as float

In [292]:
clean_df["Floor space_m2"] = clean_df["Floor space_m2"].astype(float)
clean_df["Plot area_m2"] = clean_df["Plot area_m2"].str.replace(",", ".").astype(float)

### Divide categorical "type" variable into dummy variables

In [293]:
clean_df["type"].unique()

array(['penthouse', 'terrace-house', 'detached-house', 'flat',
       'stepped-house', 'farmhouse', 'semi-detached-house',
       'stepped-apartment', 'duplex-maisonette', 'attic-flat', 'loft',
       'chalet', 'villa', 'attic-room', 'secondary-suite', 'castle',
       'detached-secondary-suite', 'studio',
       'furnished-residential-property', 'rustico', 'single-room'],
      dtype=object)

In [294]:
clean_df = pd.get_dummies(clean_df, columns=['type'])

### Make a new df with only numeric data, for the correlation matrix

In [295]:
#only numeric df
numeric_df = clean_df.select_dtypes(include=['float64', 'int64', "uint8"])
numeric_df

Unnamed: 0,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,NoisePollutionRoadL,NoisePollutionRoadM,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.511176,0.286451,0.090908,47.415927,8.085840,0.000000,0.0,0.0,0.058298,0.067048,...,1,0,0,0,0,0,0,0,0,0
1,0.511176,0.286451,0.090908,47.415927,8.085840,0.000000,0.0,0.0,0.058298,0.067048,...,0,0,0,0,0,0,0,0,1,0
2,0.163362,0.095877,0.001911,47.397416,8.043150,0.000000,0.0,0.0,0.334957,0.381257,...,1,0,0,0,0,0,0,0,0,0
3,0.511176,0.286451,0.090908,47.415927,8.085840,0.000000,0.0,0.0,0.058298,0.067048,...,0,0,0,0,0,0,0,0,0,0
4,0.333865,0.279276,0.145835,47.404870,8.052781,0.000000,0.0,0.0,0.133498,0.132933,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13373,0.089304,0.000000,0.000000,47.341095,8.719508,0.000000,0.0,0.0,0.306013,0.248198,...,0,0,0,0,0,0,0,0,0,0
13374,0.120618,0.054523,0.000000,47.387817,8.692746,0.000000,0.0,0.0,0.330695,0.376048,...,0,0,0,1,0,0,0,0,0,0
13375,0.007178,0.000000,0.000000,47.545958,8.708230,0.000000,0.0,0.0,0.164688,0.258562,...,0,0,0,0,0,0,0,0,1,0
13376,0.249841,0.112081,0.000000,47.433114,8.762768,0.000000,0.0,0.0,0.203866,0.269818,...,0,0,0,0,0,0,0,0,0,0


In [296]:
corrMatrix = numeric_df.corr(method = "pearson")
corrMatrix = corrMatrix.where(np.tril(np.ones(corrMatrix.shape)).astype(np.bool)) #only lower triangle --> no duplicates
corrMatrix

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  corrMatrix = corrMatrix.where(np.tril(np.ones(corrMatrix.shape)).astype(np.bool)) #only lower triangle --> no duplicates


Unnamed: 0,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,NoisePollutionRoadL,NoisePollutionRoadM,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
ForestDensityL,1.000000,,,,,,,,,,...,,,,,,,,,,
ForestDensityM,0.911491,1.000000,,,,,,,,,...,,,,,,,,,,
ForestDensityS,0.710585,0.867609,1.000000,,,,,,,,...,,,,,,,,,,
Latitude,-0.291478,-0.287520,-0.227517,1.000000,,,,,,,...,,,,,,,,,,
Longitude,0.273650,0.203794,0.138276,0.058665,1.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
type_stepped-apartment,-0.018379,-0.014465,-0.004462,0.054592,0.008223,-0.012829,-0.012335,-0.008849,0.002007,0.000306,...,-0.021374,-0.007186,-0.000889,-0.022128,-0.000889,1.000000,,,,
type_stepped-house,0.026723,0.017659,0.020498,0.024310,-0.005025,-0.005131,0.000797,0.008599,-0.021506,-0.014233,...,-0.014635,-0.004920,-0.000609,-0.015151,-0.000609,-0.007241,1.000000,,,
type_studio,0.015922,0.018866,0.018865,-0.022263,-0.023972,-0.001872,-0.009201,-0.009533,-0.031229,-0.029519,...,-0.009347,-0.003142,-0.000389,-0.009677,-0.000389,-0.004625,-0.003166,1.000000,,
type_terrace-house,-0.033460,-0.034042,-0.022782,0.057873,-0.021312,0.021318,0.020566,0.014079,0.005222,-0.000929,...,-0.042049,-0.014136,-0.001749,-0.043531,-0.001749,-0.020805,-0.014245,-0.009098,1.000000,


### Print all strong correlation (over 0.7 or under -0.7)

In [297]:
high_correlation = []

for i in range(len(corrMatrix)):
    for j in range(len(corrMatrix)):
        if abs(corrMatrix.iloc[i,j] > 0.7) and corrMatrix.iloc[i,j] != 1 and corrMatrix.iloc[i,j] not in high_correlation:
            high_correlation.append([corrMatrix.index[i], corrMatrix.columns[j], corrMatrix.iloc[i,j]])

high_correlation

  if abs(corrMatrix.iloc[i,j] > 0.7) and corrMatrix.iloc[i,j] != 1 and corrMatrix.iloc[i,j] not in high_correlation:


[['ForestDensityM', 'ForestDensityL', 0.9114909095381499],
 ['ForestDensityS', 'ForestDensityL', 0.710585008742777],
 ['ForestDensityS', 'ForestDensityM', 0.8676090226762789],
 ['NoisePollutionRailwayM', 'NoisePollutionRailwayL', 0.8343790570503911],
 ['NoisePollutionRailwayS', 'NoisePollutionRailwayM', 0.7864027031797696],
 ['NoisePollutionRoadM', 'NoisePollutionRoadL', 0.9040623034427968],
 ['NoisePollutionRoadS', 'NoisePollutionRoadM', 0.8430971070480482],
 ['PopulationDensityM', 'PopulationDensityL', 0.9202544032244113],
 ['PopulationDensityS', 'PopulationDensityL', 0.72993618794267],
 ['PopulationDensityS', 'PopulationDensityM', 0.8658392151152488],
 ['RiversAndLakesM', 'RiversAndLakesL', 0.8335660032373315],
 ['WorkplaceDensityL', 'PopulationDensityL', 0.8014977199828],
 ['WorkplaceDensityL', 'PopulationDensityM', 0.716422074016624],
 ['WorkplaceDensityM', 'PopulationDensityL', 0.7289775256184324],
 ['WorkplaceDensityM', 'PopulationDensityM', 0.7267595628767942],
 ['WorkplaceDens

### For the Models: If two columns have a high correlation --> have the same effect on the output --> So we can drop one of them

In [298]:
model_df = numeric_df.copy()

columns_to_drop = ["ForestDensityS", "ForestDensityL",
                   "NoisePollutionRailwayS", "NoisePollutionRailwayL",
                   "NoisePollutionRoadS", "NoisePollutionRoadL",
                   "PopulationDensityS", "PopulationDensityL",
                   "RiversAndLakesL",
                   "WorkplaceDensityS", "WorkplaceDensityM", "WorkplaceDensityL",
                   "gde_private_apartments",
                   "gde_workers_sector2", "gde_workers_sector3",
                   "gde_workers_total",
                   "gde_area_settlement_percentage",
                   "Zip"]

model_df = model_df.drop(columns_to_drop, axis=1)
model_df

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,1,0,0,0,0,0,0,0,0,0
1,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,0,0,0,0,0,0,0,0,1,0
2,0.095877,47.397416,8.043150,0.0,0.381257,0.393783,0.188229,0.000000,0.909587,11.354420,...,1,0,0,0,0,0,0,0,0,0
3,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,0,0,0,0,0,0,0,0,0,0
4,0.279276,47.404870,8.052781,0.0,0.132933,0.136984,0.141473,0.091805,1.460245,33.137090,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13373,0.000000,47.341095,8.719508,0.0,0.248198,0.398929,0.000000,0.000000,1.032783,40.749825,...,0,0,0,0,0,0,0,0,0,0
13374,0.054523,47.387817,8.692746,0.0,0.376048,0.332250,0.016351,0.000000,2.107159,37.473233,...,0,0,0,1,0,0,0,0,0,0
13375,0.000000,47.545958,8.708230,0.0,0.258562,0.374555,0.000000,0.000000,1.007680,59.067358,...,0,0,0,0,0,0,0,0,1,0
13376,0.112081,47.433114,8.762768,0.0,0.269818,0.264024,0.000000,0.000000,2.848786,42.823529,...,0,0,0,0,0,0,0,0,0,0


### We can impute the missing values, so we have more data for the model

In [299]:
model_df.isna().sum()

ForestDensityM                             0
Latitude                                   0
Longitude                                  0
NoisePollutionRailwayM                     0
NoisePollutionRoadM                        0
PopulationDensityM                         0
RiversAndLakesM                            0
RiversAndLakesS                            0
distanceToTrainStation                     0
gde_area_agriculture_percentage            0
gde_area_forest_percentage                 0
gde_area_nonproductive_percentage          0
gde_average_house_hold                     0
gde_empty_apartments                       0
gde_foreigners_percentage                  0
gde_new_homes_per_1000                     0
gde_politics_bdp                        4910
gde_politics_cvp                         317
gde_politics_evp                        3776
gde_politics_fdp                         152
gde_politics_glp                        2127
gde_politics_gps                         472
gde_politi

In [300]:
#impute missing values with knn imputer
imputer = KNNImputer(n_neighbors=5, weights="distance")
model_df = pd.DataFrame(imputer.fit_transform(model_df), columns=model_df.columns)
model_df.isna().sum()

ForestDensityM                         0
Latitude                               0
Longitude                              0
NoisePollutionRailwayM                 0
NoisePollutionRoadM                    0
PopulationDensityM                     0
RiversAndLakesM                        0
RiversAndLakesS                        0
distanceToTrainStation                 0
gde_area_agriculture_percentage        0
gde_area_forest_percentage             0
gde_area_nonproductive_percentage      0
gde_average_house_hold                 0
gde_empty_apartments                   0
gde_foreigners_percentage              0
gde_new_homes_per_1000                 0
gde_politics_bdp                       0
gde_politics_cvp                       0
gde_politics_evp                       0
gde_politics_fdp                       0
gde_politics_glp                       0
gde_politics_gps                       0
gde_politics_pda                       0
gde_politics_rights                    0
gde_politics_sp 

### Check for every columns if there are absurd values, which are not possible
### e.g. negative prices, negative living space, negative plot area, etc.

In [301]:
#make a new df, rows are the columns of model_df and the columns are "min", "max", "mean", "median", "0.25 quantile", "0.75 quantile". Round to 2 decimals
stats_df = pd.DataFrame(columns=["min", "0.25 quantile", "mean", "median", "0.75 quantile", "max"])
stats_df["min"] = model_df.min()
stats_df["0.25 quantile"] = model_df.quantile(0.25)
stats_df["mean"] = model_df.mean()
stats_df["median"] = model_df.median()
stats_df["0.75 quantile"] = model_df.quantile(0.75)
stats_df["max"] = model_df.max()
stats_df = stats_df.round(2)
stats_df = stats_df[~stats_df.index.str.startswith("type_")]
stats_df

Unnamed: 0,min,0.25 quantile,mean,median,0.75 quantile,max
ForestDensityM,0.0,0.0,0.13,0.05,0.22,1.0
Latitude,45.83,46.24,46.74,46.71,47.27,47.79
Longitude,5.97,7.07,7.9,7.85,8.87,10.4
NoisePollutionRailwayM,0.0,0.0,0.01,0.0,0.0,0.33
NoisePollutionRoadM,0.0,0.18,0.26,0.26,0.35,0.7
PopulationDensityM,0.0,0.12,0.25,0.21,0.35,0.96
RiversAndLakesM,0.0,0.0,0.01,0.0,0.0,0.5
RiversAndLakesS,0.0,0.0,0.0,0.0,0.0,0.37
distanceToTrainStation,0.01,0.47,1.85,1.05,2.48,26.48
gde_area_agriculture_percentage,0.0,16.48,34.12,33.94,48.29,87.78


In [302]:
#living_space_m2, Floor space_m2, area_m2 and price has weird min values, lets check them
model_df[model_df["living_space_m2"] <= 20]

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
2733,0.0,46.749443,6.809191,0.0,0.136482,0.119243,0.0,0.0,4.911552,66.895522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3721,0.387136,46.166641,6.170931,0.0,0.06758,0.165785,0.0,0.0,3.631949,33.23124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6498,0.0,47.509946,9.433765,0.050272,0.187745,0.078518,0.26847,0.0,0.077649,32.209106,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8672,0.264163,45.989107,8.82056,0.0,0.424122,0.052492,0.035137,0.0,3.746328,16.475973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10553,0.007802,46.529557,6.560561,0.0,0.369676,0.5762,0.0,0.0,1.534805,27.41652,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11483,0.584757,46.194115,7.17534,0.0,0.163184,0.062924,0.0,0.0,4.05956,27.005944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11573,0.018012,46.096017,7.230466,0.0,0.17588,0.163368,0.0,0.0,2.25986,14.552992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11585,0.490431,46.080593,7.212619,0.0,0.40809,0.282512,0.047702,0.137663,0.276993,14.552992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11588,0.018012,46.096017,7.230466,0.0,0.17588,0.163368,0.0,0.0,2.25986,14.552992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11598,0.018012,46.096017,7.230466,0.0,0.17588,0.163368,0.0,0.0,2.25986,14.552992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [303]:
model_df[model_df["Floor space_m2"] <= 20]

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
406,0.0,47.472135,8.212604,0.006292,0.302899,0.494174,0.0,0.0,1.009865,23.577236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
806,0.129813,47.263667,8.341495,0.0,0.151749,0.302618,0.0,0.0,1.45036,58.292683,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1705,0.022386,47.126617,7.554488,0.03598,0.088057,0.088065,0.003203,0.0,0.324405,59.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2226,0.249285,46.68832,7.844555,0.0,0.218806,0.434169,0.0,0.0,0.822428,18.136558,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2461,0.0,47.471452,7.597517,0.0,0.377034,0.549405,0.0,0.0,0.586432,46.891892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2493,0.022341,47.493984,7.579411,0.0,0.365176,0.228695,0.0,0.0,2.455293,26.398852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3435,0.019433,46.949743,7.076806,0.0,0.212027,0.140434,0.0,0.0,3.603116,70.319635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4046,0.084771,46.836294,9.287939,0.0,0.161202,0.239891,0.0,0.0,3.637985,32.765114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4052,0.323842,46.826946,9.289007,0.0,0.086729,0.115259,0.0,0.0,2.713248,32.765114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4054,0.323842,46.826946,9.289007,0.0,0.086729,0.115259,0.0,0.0,2.713248,32.765114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [304]:
model_df[model_df["Plot area_m2"] <= 20]

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
1608,0.0,47.21536,7.798781,0.0,0.154537,0.396858,0.0,0.0,1.084594,31.962942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1743,0.495966,46.788609,7.773376,0.0,0.114005,0.008566,0.0,0.0,11.575589,45.270891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2371,0.0,47.459639,7.591916,0.0,0.210248,0.433087,0.0,0.0,1.107954,32.371134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3094,0.0,46.770626,6.968549,0.0,0.266895,0.076904,0.000171,0.0,3.402048,68.89107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3721,0.387136,46.166641,6.170931,0.0,0.06758,0.165785,0.0,0.0,3.631949,33.23124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3793,0.037032,46.219504,6.108298,0.0,0.553817,0.693733,0.008822,0.0,1.117236,13.185379,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6961,0.352797,46.494355,8.951573,0.0,0.231488,0.068663,0.0,0.0,10.425314,24.825676,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7447,0.709,46.154143,8.637526,0.0,0.011573,0.002857,0.0,0.0,0.933199,4.745235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7459,0.261749,46.179199,8.7141,0.0,0.202832,0.081593,0.033091,0.0,0.756732,4.745235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7630,0.291551,46.066757,8.922198,0.0,0.29735,0.27827,0.0,0.0,1.28424,8.795411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [305]:
model_df[model_df["price"] <= 1000]

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
3929,0.387136,46.166641,6.170931,0.0,0.06758,0.165785,0.0,0.0,3.631949,33.23124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5044,0.300538,47.053017,6.753855,0.0,0.14918,0.214639,0.0,0.0,0.794293,53.816132,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10553,0.007802,46.529557,6.560561,0.0,0.369676,0.5762,0.0,0.0,1.534805,27.41652,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10713,0.0,46.487053,6.421055,0.0,0.296677,0.312241,0.0,0.0,1.401095,66.801619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [306]:
#remove the rows with the weird min values --> Als Gruppe noch besprechen
model_df = model_df[model_df["living_space_m2"] > 20]
model_df = model_df[model_df["Floor space_m2"] > 20]
model_df = model_df[model_df["Plot area_m2"] > 20]
model_df = model_df[model_df["price"] > 1000]
model_df

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.095877,47.397416,8.043150,0.0,0.381257,0.393783,0.188229,0.000000,0.909587,11.354420,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.286451,47.415927,8.085840,0.0,0.067048,0.209530,0.001811,0.011871,3.038467,30.676329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.279276,47.404870,8.052781,0.0,0.132933,0.136984,0.141473,0.091805,1.460245,33.137090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13373,0.000000,47.341095,8.719508,0.0,0.248198,0.398929,0.000000,0.000000,1.032783,40.749825,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13374,0.054523,47.387817,8.692746,0.0,0.376048,0.332250,0.016351,0.000000,2.107159,37.473233,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13375,0.000000,47.545958,8.708230,0.0,0.258562,0.374555,0.000000,0.000000,1.007680,59.067358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13376,0.112081,47.433114,8.762768,0.0,0.269818,0.264024,0.000000,0.000000,2.848786,42.823529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
target_path = Path('./data/clean/immoscout.csv')
model_df.to_csv(target_path, index=False)