# Data wrangling and exploratory data analysis

### Imports

In [50]:
import pandas as pd
import numpy as np

from pathlib import Path
from joblib import load, dump

import warnings
warnings.filterwarnings('ignore')

### Data import (joblib pickle)

In [51]:
# Load the cleaned-data bundle (presumably written by the preceding
# notebook step - confirm) and pull out the DataFrame to work on.
source_path = Path('./data/kaggle/01_clean_data.pkl')
# NOTE(review): joblib.load unpickles the file; only load artifacts that
# come from a trusted source.
source_data = load(source_path)

# The bundle is indexed like a dict; the 'imputer' entry is reused by the
# export cell at the end of this notebook.
df = source_data['dataset']

In [52]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,220.0,733.0,2.494495,0.164382,0.10003,0.063548,47.37211,8.075858,0.003811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,230.0,702.0,2.512848,0.260855,0.170434,0.083253,47.371558,8.07311,0.002623,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,131.0,299.010402,2.585125,0.434114,0.357984,0.125505,47.413754,8.082318,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,140.0,206.0,3.145155,0.14819,0.07661,0.0,47.373327,8.076892,0.005193,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,156.0,222.0,2.38458,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Scaling
Using min-max scaling, with the scaler loaded from the training data.

In [53]:
# Fetch the scaler that is stored alongside the training data
# (presumably already fitted there - confirm) so it can be applied here.
source_path = Path('data/model/02_train_data.pkl')
scaler = load(source_path)['scaler']

# Work on a copy so the unscaled `df` stays available.
scaled_df = df.copy()
columns = scaled_df.columns

# Transform every column and write the result back under the same names.
scaled_values = scaler.transform(scaled_df[columns])
scaled_df[columns] = scaled_values

scaled_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,0.02677,0.002072,0.231946,0.183264,0.100085,0.063548,0.786382,0.475555,0.014734,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.028021,0.001983,0.232602,0.29082,0.170527,0.083253,0.786102,0.474934,0.010139,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015637,0.000835,0.235183,0.483981,0.35818,0.125505,0.807571,0.477015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.016763,0.00057,0.255184,0.165213,0.076652,0.0,0.787002,0.475789,0.020076,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.018764,0.000616,0.228021,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Enhancing the dataset with polynomial features (powers 2 to 6 of each numeric column)

In [54]:
# Columns that will receive power features: every numeric column whose name
# does not contain 'type_' (this excludes the `type_unified_*` indicator
# columns seen in the preview).
numeric_columns = [
    name
    for name in scaled_df.select_dtypes(include=np.number).columns
    if 'type_' not in name
]

# Separate frame holding only those columns.
numeric_df = scaled_df[numeric_columns]

In [55]:
# Add polynomial features: for every column in `numeric_columns`, append its
# powers 2 through 6 as new columns named '<column>_<power>'.
#
# All power columns are built first and attached with a single concat.
# Inserting them one by one fragments the DataFrame; pandas reports that as a
# PerformanceWarning, which the warnings filter at the top of this notebook
# would otherwise hide.
power_features = {
    f'{col}_{p}': numeric_df[col]**p
    for col in numeric_columns
    for p in range(2, 7)
}
powers_df = pd.DataFrame(power_features, index=numeric_df.index)

# A generated name that already exists in `scaled_df` must overwrite that
# column in place (as plain column assignment does) instead of being appended
# as a duplicate column.
existing = [name for name in powers_df.columns if name in scaled_df.columns]

# concat returns a new frame, so `scaled_df` itself is left untouched.
clean_df = pd.concat([scaled_df, powers_df.drop(columns=existing)], axis=1)
for name in existing:
    clean_df[name] = powers_df[name]


### Export (joblib pickle)

In [56]:
# Bundle the feature-enhanced dataset with the imputer from the source bundle
# and the scaler used above, then persist everything as one artifact.
model_bundle = {
    'dataset': clean_df,
    'imputer': source_data['imputer'],
    'scaler': scaler,
}

target_path = Path('./data/kaggle/02_model_data.pkl')
dump(model_bundle, target_path)

['data/kaggle/02_model_data.pkl']