# Data Wrangling and Exploratory Data Analysis

### Imports

In [19]:
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.preprocessing import MinMaxScaler as Scaler
from joblib import dump

import warnings
# NOTE(review): with no category/message argument this silences every warning
# for the rest of the kernel session, including pandas/scikit-learn
# deprecation and performance warnings. Consider narrowing it to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

### CSV Import

In [20]:
# Location of the cleaned dataset, relative to the notebook's working directory.
source_path = Path('./data/clean/immoscout_v2.csv')

# Read the whole CSV into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=source_path)

In [21]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,100.0,553.6,4.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,156.0,222.0,2.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,93.0,1781.6,2.0,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,154.0,370.0,2.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,142.0,584.0,0.0,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Scaling
Using min-max scaling (scikit-learn's `MinMaxScaler`): every column except `price_cleaned` is rescaled to the range [0, 1].

In [22]:
# Work on a copy so the loaded frame `df` keeps its original values.
scaled_df = df.copy()
scaler = Scaler()

# Every column except 'price_cleaned' is rescaled; that column is left as-is.
all_columns = scaled_df.columns
columns = [name for name in all_columns if name != 'price_cleaned']

# Fit the scaler on the selected columns and write the scaled values back
# in a single step.
scaled_df[columns] = scaler.fit_transform(scaled_df[columns])

scaled_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,0.009716,0.001532,0.285714,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.015504,0.000587,0.214286,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.008992,0.005032,0.214286,0.182127,0.09593,0.001911,0.799258,0.468164,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.015297,0.001009,0.214286,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.014057,0.001619,0.142857,0.372216,0.279429,0.145835,0.803051,0.470341,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Enhancing the dataset with polynomial features: powers 2 to 6 of the existing numeric feature columns

In [23]:
# Start from every numeric column of the scaled frame.
numeric_df = scaled_df.select_dtypes(include=np.number)

# Leave out the 'price_cleaned' column and every column whose name
# contains 'type_'.
excluded_columns = [
    name
    for name in numeric_df.columns
    if 'type_' in name or name == 'price_cleaned'
]
numeric_df = numeric_df.drop(columns=excluded_columns)

# Names of the columns that remain after the exclusions.
numeric_columns = numeric_df.columns

In [24]:
# Sanity check: how many numeric columns will receive power terms below.
len(numeric_columns)

48

In [25]:
# Build every power term (exponents 2..6 of each selected numeric column) in
# one frame and attach it with a single concat. Inserting the columns one at a
# time fragments the DataFrame's internal blocks and makes pandas emit a
# PerformanceWarning; a single concat avoids both.
power_features = pd.DataFrame(
    {
        f'{col}_{p}': numeric_df[col] ** p
        for col in numeric_columns
        for p in range(2, 7)
    },
    index=scaled_df.index,
)

# If a generated name already exists in the scaled frame, the freshly computed
# power column replaces it (it is appended at the end rather than kept in its
# old position).
clean_df = pd.concat(
    [
        scaled_df.drop(columns=power_features.columns, errors='ignore'),
        power_features,
    ],
    axis=1,
)


In [26]:
# Preview the first five rows, now including the generated power columns.
clean_df.head(n=5)

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_workers_total_2,gde_workers_total_3,gde_workers_total_4,gde_workers_total_5,gde_workers_total_6,rooms_2,rooms_3,rooms_4,rooms_5,rooms_6
0,0.009716,0.001532,0.285714,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
1,0.015504,0.000587,0.214286,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
2,0.008992,0.005032,0.214286,0.182127,0.09593,0.001911,0.799258,0.468164,0.0,0.0,...,0.005103043,0.0003645388,2.604104e-05,1.860257e-06,1.328885e-07,0.020408,0.002915,0.000416,5.9e-05,8e-06
3,0.015297,0.001009,0.214286,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
4,0.014057,0.001619,0.142857,0.372216,0.279429,0.145835,0.803051,0.470341,0.0,0.0,...,8.15219e-06,2.327617e-08,6.64582e-11,1.897517e-13,5.417799e-16,0.020408,0.002915,0.000416,5.9e-05,8e-06


### Export the object

In [27]:
target_path = Path("./data/model/train_data.pkl")

# Create the output folder if it does not exist yet; without it the dump
# below fails with FileNotFoundError (for example on a fresh checkout).
target_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted scaler together with the engineered dataset in a single
# file, so both can be loaded from the same artifact.
dump({
    'scaler': scaler,
    'dataset': clean_df
}, target_path)

['data\\model\\train_data.pkl']