# Scaling

### Imports

In [1]:
import warnings
import __main__ as main

import numpy as np
import pandas as pd
from joblib import load, dump

from helpers.is_interactive import is_interactive
from helpers.paths import Paths

warnings.filterwarnings('ignore')

### Run dependency notebooks

In [2]:
# Execute the upstream notebooks before this one, in pipeline order.
# NOTE(review): `is_interactive` is a project helper; presumably it is true only
# when this notebook is the one being run directly (not itself pulled in through
# %run by a later notebook) -- confirm against helpers/is_interactive.
if is_interactive(main):
    # Per the %run syntax, anything after the file name is passed to the executed
    # file as an argument, so the trailing `-p` is forwarded to each notebook.
    # NOTE(review): what the notebooks do with `-p` is not visible here -- confirm.
    %run 01_0_data_wrangling.ipynb -p
    %run 01_1_data_wrangling_kaggle.ipynb -p
    %run 02_0_scaling.ipynb -p

Running previous notebooks...


### Import the wrangled Kaggle data (joblib dump)

In [3]:
# Load the joblib dump stored at Paths.KAGGLE_DATA_WRANGLING_DATA. The whole
# mapping is kept because its 'imputers' and 'base_imputer' entries are written
# back out unchanged in the export cell at the end of this notebook.
source_data = load(Paths.KAGGLE_DATA_WRANGLING_DATA)

# The frame that gets scaled and extended below.
df = source_data['dataset']

In [4]:
# Show the last rows of the loaded (still unscaled) frame as a quick sanity check.
df.tail()

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa,NoisePollutionRailway,NoisePollutionRoad,PopulationDensity,RiversAndLakes,WorkplaceDensity,ForestDensity
24551,36.0,36.0,2.0,47.204125,9.353275,9658.0,9.851342,52.092864,27.252973,17.486276,...,0.0,0.0,0.0,0.0,0.0,0.115615,0.091479,0.0,0.052014,0.068569
24552,40.0,40.0,3.0,47.203747,9.350731,9658.0,9.753366,52.092864,27.252973,17.486276,...,0.0,0.0,0.0,0.0,0.0,0.197714,0.072111,0.0,0.060707,0.052283
24553,86.0,2803.0,0.0,47.203747,9.350731,9658.0,9.753366,52.092864,27.252973,17.486276,...,0.0,0.0,0.0,0.0,0.0,0.197714,0.072111,0.0,0.060707,0.052283
24554,127.0,127.0,2.0,47.203747,9.350731,9658.0,9.753366,52.092864,27.252973,17.486276,...,0.0,0.0,0.0,0.0,0.0,0.197714,0.072111,0.0,0.060707,0.052283
24555,151.268598,610.0,0.0,47.203747,9.350731,9658.0,9.753366,52.092864,27.252973,17.486276,...,0.0,0.0,0.0,0.0,0.0,0.197714,0.072111,0.0,0.060707,0.052283


### Scaling
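The scaler stored with the training data is loaded and applied here (it is not re-fitted), so this dataset is scaled exactly like the training dataset.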

In [5]:
# Work on a copy so the unscaled frame `df` stays untouched.
scaled_df = df.copy()

# Reuse the scaler stored in the regressor scaling dump. It is only applied
# here (transform); it is never re-fitted on this data.
source_path = Paths.REGRESSOR_SCALING_DATA
scaler = load(source_path)['scaler']

columns = scaled_df.columns

# The transform is applied positionally to the array it receives, and warnings
# are silenced at the top of this notebook, so a frame whose columns are in a
# different order than at fit time could be scaled with the wrong per-column
# parameters without any visible hint. When the scaler exposes the column names
# it was fitted on (scikit-learn sets `feature_names_in_` when fitted on a
# DataFrame) and they are exactly this frame's columns, feed them in fit order.
# In every other case keep the previous behaviour and let the scaler validate
# its input itself.
fit_columns = getattr(scaler, 'feature_names_in_', None)
if (
    fit_columns is not None
    and len(fit_columns) == len(columns)
    and set(fit_columns) == set(columns)
):
    transform_columns = list(fit_columns)
else:
    transform_columns = columns

# Assignment is by label, so the column order of `scaled_df` itself is unchanged.
scaled_df[transform_columns] = scaler.transform(scaled_df[transform_columns])

scaled_df.head()

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa,NoisePollutionRailway,NoisePollutionRoad,PopulationDensity,RiversAndLakes,WorkplaceDensity,ForestDensity
0,0.0275,0.002089,0.181818,0.786382,0.475555,0.465927,0.018088,0.316723,0.505921,0.012766,...,0.0,0.0,0.0,1.0,0.003553,0.329225,0.261738,0.077476,0.091757,0.113229
1,0.02875,0.002,0.181818,0.786102,0.474934,0.465927,0.025796,0.316723,0.505921,0.012766,...,0.0,0.0,0.0,0.0,0.002445,0.288135,0.191195,0.075463,0.045578,0.177648
2,0.016375,0.000373,0.181818,0.807571,0.477015,0.464657,0.101284,0.349443,0.586349,0.051803,...,1.0,0.0,0.0,0.0,0.0,0.280342,0.189881,0.141335,0.099931,0.316806
3,0.0175,0.000587,0.306818,0.787002,0.475789,0.465927,0.016606,0.316723,0.505921,0.012766,...,0.0,0.0,1.0,0.0,0.004842,0.370899,0.348118,0.040112,0.185281,0.077613
4,0.0195,0.000633,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.0,0.0,1.0,0.0,0.0,0.112937,0.236697,0.069723,0.069131,0.30677


### Enhancing the dataset with additional columns holding powers (2 to 6) of the existing numeric columns

In [6]:
# Pick the columns that will receive power features: every numeric column of
# the scaled frame, minus those whose name contains 'type_'.
# NOTE(review): the 'type_' columns look like 0/1 indicator columns (see the
# type_unified_* columns in the preview above), for which powers would add
# nothing -- confirm that this is the intent of the filter.
numeric_df = scaled_df.select_dtypes(include=np.number)

type_columns = [name for name in numeric_df.columns if 'type_' in name]
numeric_df = numeric_df.drop(columns=type_columns)

# Final list of column names used by the power-feature cell below.
numeric_columns = numeric_df.columns.tolist()

In [7]:
# Add the powers 2..6 of every selected numeric column as new columns named
# '<column>_<power>'.
#
# All new columns are computed first and attached with a single concat.
# Inserting them one by one fragments the DataFrame's internal blocks; pandas
# reports that as a PerformanceWarning, which is hidden here because warnings
# are silenced at the top of the notebook.
power_features = {
    f'{col}_{p}': numeric_df[col] ** p
    for col in numeric_columns      # column-major order, same as a nested loop
    for p in range(2, 7)
}

clean_df = scaled_df.copy()

# A generated name that already exists is overwritten in place (it keeps its
# position), exactly as a plain column assignment would do.
clashing = [name for name in power_features if name in clean_df.columns]
for name in clashing:
    clean_df[name] = power_features.pop(name)

# Everything else is appended on the right in generation order.
clean_df = pd.concat(
    [clean_df, pd.DataFrame(power_features, index=clean_df.index)],
    axis=1,
)


### Export the dataset (joblib dump)

In [8]:
# Bundle the extended frame with the objects needed to reproduce its
# preprocessing: the imputers carried over unchanged from the wrangling dump
# and the scaler applied above.
export_payload = dict(
    dataset=clean_df,
    imputers=source_data['imputers'],
    base_imputer=source_data['base_imputer'],
    scaler=scaler,
)

# Kept as the last expression so the cell still displays dump's return value.
dump(export_payload, Paths.KAGGLE_SCALING_DATA)

['data/kaggle/02_0_scaling.dump']