# Data Wrangling and explorative data analysis

## Imports

In [30]:
import pandas as pd
import numpy as np
from pathlib import Path

## CSV Import

In [31]:
# Location of the cleaned dataset, relative to the current working directory.
source_path = Path('./data/clean/immoscout.csv')
# Read the CSV into the base frame; pandas infers the header and dtypes.
model_df = pd.read_csv(filepath_or_buffer=source_path)

In [32]:
# Preview the first five rows of the imported frame (head() defaults to n=5).
model_df.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.095877,47.397416,8.04315,0.0,0.381257,0.393783,0.188229,0.0,0.909587,11.35442,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.279276,47.40487,8.052781,0.0,0.132933,0.136984,0.141473,0.091805,1.460245,33.13709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Scaling

#### Min-Max Scaling (sensitive to outliers)

In [33]:
# Work on a copy so the imported frame stays untouched.
model_df_min_max = model_df.copy()

# Min-max scaling: (x - min) / (max - min) per column.
# The "price" column and every "type_"-prefixed column keep their raw values.
for name in model_df_min_max.columns:
    if name == "price" or name.startswith("type_"):
        continue
    series = model_df_min_max[name]
    lower = series.min()
    value_range = series.max() - lower
    model_df_min_max[name] = (series - lower) / value_range

model_df_min_max.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.09593,0.799258,0.468164,0.0,0.542063,0.411418,0.373727,0.0,0.033815,0.129345,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.279429,0.803051,0.470341,0.0,0.189001,0.143119,0.280894,0.249448,0.054619,0.377484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Mean normalization (centers each feature on its mean and divides by its range; this is a linear rescaling, so the shape of the distribution is preserved)

In [34]:
# Work on a copy so the imported frame stays untouched.
model_df_norm = model_df.copy()

# Mean normalization: (x - mean) / (max - min) per column.
# The "price" column and every "type_"-prefixed column keep their raw values.
for name in model_df_norm.columns:
    if name == "price" or name.startswith("type_"):
        continue
    series = model_df_norm[name]
    value_range = series.max() - series.min()
    model_df_norm[name] = (series - series.mean()) / value_range


model_df_norm.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.154214,0.344111,0.041019,-0.037293,-0.279852,-0.040884,-0.0257,0.019323,0.044947,-0.039162,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.154214,0.344111,0.041019,-0.037293,-0.279852,-0.040884,-0.0257,0.019323,0.044947,-0.039162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.036464,0.334692,0.031373,-0.037293,0.166884,0.151621,0.344431,-0.012932,-0.035485,-0.259269,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.154214,0.344111,0.041019,-0.037293,-0.279852,-0.040884,-0.0257,0.019323,0.044947,-0.039162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.147035,0.338485,0.033549,-0.037293,-0.186178,-0.116678,0.251598,0.236516,-0.01468,-0.01113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Standardization (best method to use if the feature is normally distributed)

In [35]:
# Work on a copy so the imported frame stays untouched.
model_df_std = model_df.copy()

# Standardization (z-score): (x - mean) / std per column.
# Series.std() uses the sample standard deviation (ddof=1) by default.
# The "price" column and every "type_"-prefixed column keep their raw values.
for name in model_df_std.columns:
    if name == "price" or name.startswith("type_"):
        continue
    series = model_df_std[name]
    centered = series - series.mean()
    model_df_std[name] = centered / series.std()

model_df_std.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.890942,1.241835,0.190878,-0.371997,-1.604797,-0.235707,-0.278192,0.272861,0.502105,-0.178024,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.890942,1.241835,0.190878,-0.371997,-1.604797,-0.235707,-0.278192,0.272861,0.502105,-0.178024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.210666,1.207846,0.145988,-0.371997,0.956986,0.874142,3.728371,-0.182613,-0.396397,-1.178597,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.890942,1.241835,0.190878,-0.371997,-1.604797,-0.235707,-0.278192,0.272861,0.502105,-0.178024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.849468,1.221534,0.156115,-0.371997,-1.067625,-0.672683,2.723484,3.339819,-0.16399,-0.050595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Robust Scaling (robust to outliers)

In [36]:
# Work on a copy so the imported frame stays untouched.
model_df_robust = model_df.copy()

# Robust scaling: (x - median) / IQR per column, where IQR = Q3 - Q1.
# The "price" column and every "type_"-prefixed column keep their raw values.
for col in model_df_robust.columns:
    if col == "price" or col.startswith("type_"):
        continue

    values = model_df_robust[col]
    iqr = values.quantile(0.75) - values.quantile(0.25)

    if iqr == 0:
        # A zero interquartile range (at least half of the values identical)
        # makes the division produce inf/NaN. This is what happened for
        # "RiversAndLakesS" and "rooms", which used to be skipped by name.
        # Such columns are left unscaled instead of being hard-coded.
        continue

    # NOTE: a very small but non-zero IQR still yields very large scaled
    # values (see "RiversAndLakesM" in the preview); review such columns.
    model_df_robust[col] = (values - values.median()) / iqr

model_df_robust.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.217974,0.66892,0.106747,0.0,0.71514,0.80289,1248.917696,0.0,-0.068507,-0.709981,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.045734,0.676133,0.112097,0.0,-0.744323,-0.303501,938.690859,0.091805,0.205637,-0.025255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Output file for each scaled variant of the data set.
scaled_outputs = {
    "./data/model/immoscout_norm.csv": model_df_norm,
    "./data/model/immoscout_min_max.csv": model_df_min_max,
    "./data/model/immoscout_std.csv": model_df_std,
    "./data/model/immoscout_robust.csv": model_df_robust,
}

for raw_path, scaled_df in scaled_outputs.items():
    target_path = Path(raw_path)
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    # index=False: do not write the pandas row index as an extra column.
    scaled_df.to_csv(target_path, index=False)