# Feature scaling and polynomial feature generation

### Imports

In [1]:
import warnings
import __main__ as main

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.preprocessing import MinMaxScaler as Scaler

from helpers.is_interactive import is_interactive
from helpers.paths import Paths

warnings.filterwarnings('ignore')

### Run dependency notebooks

In [2]:
# Execute the upstream data-wrangling notebook before continuing; presumably
# it produces the artifact that is loaded further down — TODO confirm.
# NOTE(review): is_interactive is a project helper; presumably it is True only
# when this notebook runs in a live session rather than being %run itself — confirm.
if is_interactive(main):
    # Keep the magic on a line of its own: everything after the filename is
    # handed to the target as arguments, so a trailing comment would be too.
    # NOTE(review): `-p` follows the filename, so it is presumably forwarded to
    # the notebook rather than enabling the %run profiler — confirm.
    %run 01_0_data_wrangling.ipynb -p

Running previous notebooks...


### Data Import

In [3]:
# Load the bundle stored at Paths.DATA_WRANGLING_DATA.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files that
# come from a trusted source.
source_data = load(Paths.DATA_WRANGLING_DATA)

# The bundle is indexed by key; 'dataset' is the frame processed in this notebook.
df = source_data['dataset']

In [4]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,Living space,Plot area,price_cleaned,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,...,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa,NoisePollutionRailway,NoisePollutionRoad,PopulationDensity,RiversAndLakes,WorkplaceDensity,ForestDensity
0,100.0,1282.323307,1150000.0,4.0,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,...,0.0,0.0,0.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
1,156.0,222.0,1420000.0,2.75,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,...,0.0,0.0,1.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
2,93.0,1198.982216,720000.0,2.0,47.397416,8.04315,5000.0,0.909587,11.35442,32.197891,...,0.0,0.0,0.0,0.0,0.0,0.33793,0.451622,0.114168,0.167442,0.08705
3,154.0,370.0,1430000.0,0.0,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,...,0.0,0.0,0.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
4,142.0,462.537377,995000.0,0.0,47.40487,8.052781,5022.0,1.460245,33.13709,49.705635,...,0.0,0.0,0.0,0.0,0.0,0.16745,0.177506,0.114288,0.046822,0.252992


### Scaling
All columns except the target `price_cleaned` are rescaled to the range [0, 1] using min-max scaling.

In [5]:
# Work on a copy so that `df` keeps its unscaled values.
scaled_df = df.copy()
scaler = Scaler()

# Every column except the target gets rescaled; 'price_cleaned' keeps its
# original units.
all_columns = scaled_df.columns
columns = [name for name in all_columns if name != 'price_cleaned']

# Learn the per-column ranges and apply them in one step.
scaled_df[columns] = scaler.fit_transform(scaled_df[columns])

scaled_df.head()

Unnamed: 0,Living space,Plot area,price_cleaned,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,...,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa,NoisePollutionRailway,NoisePollutionRoad,PopulationDensity,RiversAndLakes,WorkplaceDensity,ForestDensity
0,0.0125,0.003654,1150000.0,0.363636,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.0,0.0,0.0,0.0,0.0,0.112937,0.236697,0.069723,0.069131,0.30677
1,0.0195,0.000633,1420000.0,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.0,0.0,1.0,0.0,0.0,0.112937,0.236697,0.069723,0.069131,0.30677
2,0.011625,0.003417,720000.0,0.272727,0.799258,0.468164,0.462,0.034049,0.129342,0.366948,...,0.0,0.0,0.0,0.0,0.0,0.499547,0.480581,0.249137,0.191605,0.090163
3,0.01925,0.001054,1430000.0,0.181818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.0,0.0,0.0,0.0,0.0,0.112937,0.236697,0.069723,0.069131,0.30677
4,0.01775,0.001318,995000.0,0.181818,0.803051,0.470341,0.464541,0.054848,0.377475,0.566478,...,0.0,0.0,0.0,0.0,0.0,0.247534,0.188117,0.2494,0.053579,0.262039


### Enhancing the dataset with polynomial features (powers 2 to 6 of each numeric column)

In [6]:
# Choose the columns that will be expanded with power features: numeric
# columns only, leaving out the target and any column whose name contains 'type_'.
feature_names = [
    name
    for name in scaled_df.select_dtypes(include=np.number).columns
    if name != 'price_cleaned' and 'type_' not in name
]

numeric_df = scaled_df[feature_names]
numeric_columns = numeric_df.columns

In [7]:
# Number of columns selected for power-feature expansion.
len(numeric_columns)

36

In [8]:
# Derive powers 2..6 of every selected numeric column and append them to the
# scaled frame under the name '<column>_<power>'.
#
# All new columns are built first and attached with a single concat. Inserting
# them one at a time fragments the frame, which pandas reports with a
# PerformanceWarning (not visible here because warnings are silenced).
# Column order matches the per-column, ascending-power order of a nested loop.
power_columns = [
    (numeric_df[col] ** p).rename(f'{col}_{p}')
    for col in numeric_columns
    for p in range(2, 7)
]

# concat returns a new frame, so `scaled_df` itself is left unchanged.
clean_df = pd.concat([scaled_df, *power_columns], axis=1)


In [9]:
# Preview the frame including the generated power columns.
clean_df.head()

Unnamed: 0,Living space,Plot area,price_cleaned,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,...,WorkplaceDensity_2,WorkplaceDensity_3,WorkplaceDensity_4,WorkplaceDensity_5,WorkplaceDensity_6,ForestDensity_2,ForestDensity_3,ForestDensity_4,ForestDensity_5,ForestDensity_6
0,0.0125,0.003654,1150000.0,0.363636,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
1,0.0195,0.000633,1420000.0,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
2,0.011625,0.003417,720000.0,0.272727,0.799258,0.468164,0.462,0.034049,0.129342,0.366948,...,0.036712,0.007034,0.001348,0.0002582432,4.948057e-05,0.008129,0.000733,6.6e-05,6e-06,5.372472e-07
3,0.01925,0.001054,1430000.0,0.181818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
4,0.01775,0.001318,995000.0,0.181818,0.803051,0.470341,0.464541,0.054848,0.377475,0.566478,...,0.002871,0.000154,8e-06,4.415364e-07,2.3657e-08,0.068665,0.017993,0.004715,0.001235,0.000323743


### Export the object

In [11]:
# Collect everything later stages need into one bundle: the fitted scaler,
# the imputers carried over unchanged from the loaded bundle, and the
# expanded dataset.
export_bundle = {
    'scaler': scaler,
    'base_imputer': source_data['base_imputer'],
    'imputers': source_data['imputers'],
    'dataset': clean_df,
}

# Write the bundle to disk; the returned list of file names is the cell output.
dump(export_bundle, Paths.REGRESSOR_SCALING_DATA)

['data/regressor/02_0_scaling.dump']