# Data wrangling and exploratory data analysis

### Imports

In [4]:
import warnings
import __main__ as main
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import load, dump

from helpers.is_interactive import is_interactive

warnings.filterwarnings('ignore')

### Run dependency notebooks

In [5]:
# Re-run the upstream notebooks, in this order, but only when the project
# helper reports that `main` is an interactive session.
# NOTE(review): `-p` comes after the file name, so IPython passes it on to the
# notebook as an argument rather than treating it as a %run option; its meaning
# is defined by those notebooks - TODO confirm.
if is_interactive(main):
    %run 01_0_data_wrangling.ipynb -p
    %run 01_2_data_wrangling_kaggle.ipynb -p
    %run 02_0_scaling.ipynb -p

Running previous notebooks...


### Import the cleaned dataset (pickle)

In [6]:
# Pickled bundle under data/kaggle; besides 'dataset' it also carries the
# 'imputers' and 'base_imputer' entries that the export cell writes out again.
source_path = Path('./data/kaggle/01_clean_data.pkl')
source_data = load(source_path)

# The frame that is scaled and extended in the following cells.
df = source_data['dataset']

In [7]:
# Display the last rows of the loaded frame as a quick sanity check.
df.tail()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
24551,36.0,36.0,2.0,0.186881,0.018827,0.0,47.204125,9.353275,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24552,40.0,40.0,3.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24553,86.0,2803.0,0.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24554,127.0,127.0,2.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24555,151.268598,610.0,0.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Scaling
Using the min-max method: the scaler saved with the training data is applied to this dataset.

In [8]:
# The scaler is read from the training-data bundle rather than created here,
# so this dataset goes through the same transformation as the training set.
source_path = Path('data/model/02_train_data.pkl')
scaler = load(source_path)['scaler']

# Scale a copy; the unscaled `df` stays available for inspection.
scaled_df = df.copy()
columns = scaled_df.columns

# All columns go through the scaler and are written back under their labels.
scaled_values = scaler.transform(scaled_df[columns])
scaled_df[columns] = scaled_values

scaled_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,0.02677,0.002089,0.142857,0.183264,0.100085,0.063548,0.786382,0.475555,0.014734,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.028021,0.002,0.142857,0.29082,0.170527,0.083253,0.786102,0.474934,0.010139,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015637,0.000373,0.142857,0.483981,0.35818,0.125505,0.807571,0.477015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.016763,0.000587,0.241071,0.165213,0.076652,0.0,0.787002,0.475789,0.020076,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.018764,0.000633,0.241071,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Enhancing the dataset with power features (powers 2 to 6 of each numeric, non-`type_` column)

In [9]:
# Start from the numeric part of the scaled frame ...
numeric_df = scaled_df.select_dtypes(include=np.number)

# ... and leave out every column whose name contains 'type_'; those columns
# are not used as a base for the power features.
numeric_df = numeric_df.drop(
    columns=[name for name in numeric_df.columns if 'type_' in name]
)

# Names of the remaining columns, in frame order.
numeric_columns = numeric_df.columns.tolist()

In [10]:
# Build every power feature first and attach them all with a single concat.
# Assigning the columns one at a time (five per feature) fragments the frame
# internally; pandas reports that with a PerformanceWarning, which the global
# warnings filter set in the imports cell would silence.
power_features = [
    (numeric_df[col] ** p).rename(f'{col}_{p}')
    for col in numeric_columns
    for p in range(2, 7)  # powers 2 through 6
]

# concat returns a new frame, so `scaled_df` is left untouched. The new columns
# follow the original ones in the order <col>_2 ... <col>_6 for each feature.
# NOTE(review): assumes no existing column is already named '<col>_<p>'.
clean_df = pd.concat([scaled_df, *power_features], axis=1)


### Export the model dataset (pickle)

In [11]:
# Destination of the bundle written by this notebook.
target_path = Path('./data/kaggle/02_model_data.pkl')

# The extended frame plus the preprocessing objects: the imputers are passed
# through unchanged from the loaded bundle, the scaler is the one applied above.
export_bundle = {
    'dataset': clean_df,
    'imputers': source_data['imputers'],
    'base_imputer': source_data['base_imputer'],
    'scaler': scaler,
}

dump(export_bundle, target_path)

['data/kaggle/02_model_data.pkl']