# Data Wrangling and Exploratory Data Analysis

## Imports

In [41]:
import pandas as pd
from pathlib import Path

## CSV Import

In [42]:
# Cleaned Immoscout dataset, addressed relative to the notebook's working directory.
source_path = Path('./data/clean/immoscout.csv')
# Base frame; each scaling cell further down works on its own copy of it.
model_df = pd.read_csv(filepath_or_buffer=source_path)

In [43]:
# Preview the first five rows of the imported frame.
model_df.head(n=5)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,1,0,0,0,0,0,0,0,0,0
1,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,0,0,0,0,0,0,0,0,1,0
2,0.095877,47.397416,8.04315,0.0,0.381257,0.393783,0.188229,0.0,0.909587,11.35442,...,1,0,0,0,0,0,0,0,0,0
3,0.286451,47.415927,8.08584,0.0,0.067048,0.20953,0.001811,0.011871,3.038467,30.676329,...,0,0,0,0,0,0,0,0,0,0
4,0.279276,47.40487,8.052781,0.0,0.132933,0.136984,0.141473,0.091805,1.460245,33.13709,...,0,0,0,0,0,0,0,0,0,0


### Scaling

#### Min-Max Scaling (sensitive to outliers)

In [44]:
model_df_min_max = model_df.copy()

# Min-max scaling: x' = (x - min) / (max - min), computed per column.
# The target ("price") and the "type_*" dummy columns are left untouched.
min_max_cols = [
    col
    for col in model_df_min_max.columns
    if col != "price" and not col.startswith("type_")
]
min_max_features = model_df_min_max[min_max_cols]
min_max_low = min_max_features.min()
min_max_span = min_max_features.max() - min_max_low
# A constant column has a span of 0 and would become NaN (0 / 0). Dividing by 1
# instead maps it to 0, the convention scikit-learn's MinMaxScaler follows.
min_max_span = min_max_span.mask(min_max_span == 0, 1.0)
model_df_min_max[min_max_cols] = (min_max_features - min_max_low) / min_max_span

model_df_min_max.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,1,0,0,0,0,0,0,0,0,0
1,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,0,0,0,0,0,0,0,0,1,0
2,0.09593,0.799258,0.468164,0.0,0.542063,0.411418,0.373727,0.0,0.033815,0.129345,...,1,0,0,0,0,0,0,0,0,0
3,0.286608,0.808677,0.477811,0.0,0.095327,0.218913,0.003596,0.032255,0.114247,0.349452,...,0,0,0,0,0,0,0,0,0,0
4,0.279429,0.803051,0.470341,0.0,0.189001,0.143119,0.280894,0.249448,0.054619,0.377484,...,0,0,0,0,0,0,0,0,0,0


#### Mean normalization (centers each feature on its mean and divides by its range; a linear rescaling, so the shape of the distribution is unchanged)

In [45]:
model_df_norm = model_df.copy()

# Mean normalization: x' = (x - mean) / (max - min), computed per column.
# The target ("price") and the "type_*" dummy columns are left untouched.
norm_cols = [
    col
    for col in model_df_norm.columns
    if col != "price" and not col.startswith("type_")
]
norm_features = model_df_norm[norm_cols]
norm_span = norm_features.max() - norm_features.min()
# A constant column has a span of 0 and would become NaN (0 / 0). Dividing by 1
# instead leaves it centred at 0.
norm_span = norm_span.mask(norm_span == 0, 1.0)
model_df_norm[norm_cols] = (norm_features - norm_features.mean()) / norm_span

model_df_norm.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.153933,0.344257,0.040907,-0.037157,-0.279696,-0.040741,-0.025666,0.019334,0.044746,-0.039176,...,1,0,0,0,0,0,0,0,0,0
1,0.153933,0.344257,0.040907,-0.037157,-0.279696,-0.040741,-0.025666,0.019334,0.044746,-0.039176,...,0,0,0,0,0,0,0,0,1,0
2,-0.036745,0.334839,0.03126,-0.037157,0.16704,0.151764,0.344464,-0.012922,-0.035686,-0.259283,...,1,0,0,0,0,0,0,0,0,0
3,0.153933,0.344257,0.040907,-0.037157,-0.279696,-0.040741,-0.025666,0.019334,0.044746,-0.039176,...,0,0,0,0,0,0,0,0,0,0
4,0.146755,0.338632,0.033436,-0.037157,-0.186021,-0.116536,0.251632,0.236526,-0.014882,-0.011144,...,0,0,0,0,0,0,0,0,0,0


#### Standardization (best method to use if the feature is normally distributed)

In [46]:
model_df_std = model_df.copy()

# Standardization (z-score): x' = (x - mean) / std, computed per column with
# pandas' default sample standard deviation (ddof=1).
# The target ("price") and the "type_*" dummy columns are left untouched.
std_cols = [
    col
    for col in model_df_std.columns
    if col != "price" and not col.startswith("type_")
]
std_features = model_df_std[std_cols]
std_scale = std_features.std()
# A constant column has a standard deviation of 0 and would become NaN (0 / 0).
# Dividing by 1 instead leaves it centred at 0, as scikit-learn's StandardScaler does.
std_scale = std_scale.mask(std_scale == 0, 1.0)
model_df_std[std_cols] = (std_features - std_features.mean()) / std_scale

model_df_std.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.88824,1.242636,0.190171,-0.371367,-1.604049,-0.234829,-0.278031,0.273363,0.498842,-0.178127,...,1,0,0,0,0,0,0,0,0,0
1,0.88824,1.242636,0.190171,-0.371367,-1.604049,-0.234829,-0.278031,0.273363,0.498842,-0.178127,...,0,0,0,0,0,0,0,0,1,0
2,-0.212026,1.208638,0.145324,-0.371367,0.957971,0.874749,3.731423,-0.182707,-0.397846,-1.178926,...,1,0,0,0,0,0,0,0,0,0
3,0.88824,1.242636,0.190171,-0.371367,-1.604049,-0.234829,-0.278031,0.273363,0.498842,-0.178127,...,0,0,0,0,0,0,0,0,0,0
4,0.846816,1.222329,0.155442,-0.371367,-1.066828,-0.671698,2.725811,3.344332,-0.165908,-0.050669,...,0,0,0,0,0,0,0,0,0,0


#### Robust Scaling (robust to outliers)

In [47]:
model_df_robust = model_df.copy()

# Robust scaling: x' = (x - median) / IQR with IQR = Q3 - Q1, computed per column.
# The target ("price") and the "type_*" dummy columns are left untouched.
robust_cols = [
    col
    for col in model_df_robust.columns
    if col != "price" and not col.startswith("type_")
]
robust_features = model_df_robust[robust_cols]
robust_iqr = robust_features.quantile(0.75) - robust_features.quantile(0.25)
# An IQR of 0 is what produced the inf values that previously forced
# "RiversAndLakesS" and "rooms" to be skipped by name. Guarding the divisor
# handles those two and any other zero-IQR column: such columns are only
# centred on their median (scale 1), as scikit-learn's RobustScaler does.
robust_iqr = robust_iqr.mask(robust_iqr == 0, 1.0)
# NOTE(review): a tiny but non-zero IQR still inflates values (RiversAndLakesM
# reaches the thousands); consider a different scaler for such features.
model_df_robust[robust_cols] = (robust_features - robust_features.median()) / robust_iqr

model_df_robust.head()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,1,0,0,0,0,0,0,0,0,0
1,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,1,0
2,0.21574,0.66892,0.106747,0.0,0.715909,0.80415,1893.128856,0.0,-0.071577,-0.709981,...,1,0,0,0,0,0,0,0,0,0
3,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,0,0
4,1.035876,0.676133,0.112097,0.0,-0.74281,-0.302241,1422.882194,0.091805,0.202417,-0.025255,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Write each scaled variant to its own CSV, without the DataFrame index.
# Insertion order matters only in that `target_path` ends up pointing at the
# robust file, exactly as before.
scaled_frames = {
    "./data/model/immoscout_norm.csv": model_df_norm,
    "./data/model/immoscout_min_max.csv": model_df_min_max,
    "./data/model/immoscout_std.csv": model_df_std,
    "./data/model/immoscout_robust.csv": model_df_robust,
}

for csv_name, scaled_df in scaled_frames.items():
    target_path = Path(csv_name)
    # to_csv does not create missing directories; make sure the folder exists
    # so the notebook also runs on a fresh checkout.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    scaled_df.to_csv(target_path, index=False)

<font color='red'>

#### Fragen die wir klären müssen:
- Müssen alle Features gleich normalisiert werden, oder können verschiedene Features unterschiedlich normalisiert werden?
- Dummy-Spalten ebenfalls normalisieren? (wurde hier jetzt nicht gemacht)
- Können wir Daten löschen, die "unmöglich" sind? (z. B. Zeilen, in denen "price" 1.- ist)

</font>