### Imports

In [13]:
import os

import numpy as np
import pandas as pd

Para normalizar um dataset:

1. Execute a célula de "Imports"
2. Troque os caminhos `INPUT` (CSV já limpo) e `ORIGINAL` (CSV bruto) na primeira célula de "Importando dataset"
3. Execute todas as células de "Importando dataset"
4. Execute todas as células de "Selecionando features numéricas"
5. Execute todas as células de "Normalizando os dados (MinMaxScaler)"

### Importando dataset

In [14]:
# Set the data paths here.
# INPUT is the cleaned dataset that gets scaled; ORIGINAL is the raw dataset,
# read only to find out which columns were numeric before cleaning.
INPUT = 'data/processed/train_clean.csv'
ORIGINAL = 'data/raw/train.csv'


def output_path_for(path, tag):
    """Return `path` with `tag` inserted right before its file extension.

    Only the final extension is touched, so a '.csv' occurring elsewhere in the
    path (e.g. in a directory name) is left alone, and a path without any
    extension still gets a distinct name instead of silently equalling the
    input path (which would make the export overwrite the input file).
    """
    root, ext = os.path.splitext(path)
    return f'{root}{tag}{ext}'


# One output file per scaler, written next to INPUT.
OUTPUT_STANDARDIZED = output_path_for(INPUT, '_standardized')
OUTPUT_NORMALIZED = output_path_for(INPUT, '_normalized')
OUTPUT_ROBUST = output_path_for(INPUT, '_robust')

In [15]:
# Load the raw dataset first, then the cleaned one (same order as the paths below).
original_df, df = (pd.read_csv(csv_path) for csv_path in (ORIGINAL, INPUT))

### Selecionando features numéricas

In [16]:
# Detach the Id column so it is not scaled as a numeric feature; it is put back
# as the first column when the normalized CSV is assembled.
# Guarded so the cell can be re-run: df.pop removes the column from df in place,
# so an unguarded second run would raise KeyError.
if 'Id' in df.columns:
    df_id = df.pop('Id')
elif 'df_id' not in globals():
    # Neither the column nor a previously detached copy exists: fail the same
    # way the plain df.pop('Id') would.
    raise KeyError('Id')

In [17]:
# Numeric feature names come from the raw dataset's dtypes. Keep only the ones
# still present in the cleaned frame, and leave out the SalePrice target so it
# is never scaled.
num_cols = [
    col
    for col in original_df.select_dtypes(include=[np.number]).columns
    if col in df.columns and col != 'SalePrice'
]

As features categóricas já foram convertidas em colunas binárias (0/1) e, portanto, já estão no intervalo [0, 1]; por isso, apenas as features numéricas originais precisam ser escalonadas.

In [18]:
# Summary statistics with one row per column (transposed for readability).
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSSubClass,1460.0,56.897260,42.300571,20.0,20.0,50.000000,70.0,190.0
LotFrontage,1460.0,70.049958,22.024023,21.0,60.0,70.049958,79.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.500000,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.000000,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.000000,6.0,9.0
...,...,...,...,...,...,...,...,...
SaleCondition_Alloca,1460.0,0.008219,0.090317,0.0,0.0,0.000000,0.0,1.0
SaleCondition_Family,1460.0,0.013699,0.116277,0.0,0.0,0.000000,0.0,1.0
SaleCondition_Normal,1460.0,0.820548,0.383862,0.0,1.0,1.000000,1.0,1.0
SaleCondition_Partial,1460.0,0.085616,0.279893,0.0,0.0,0.000000,0.0,1.0


## Padronizando os dados (StandardScaler)

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Fit the StandardScaler on the numeric features, then transform those same
# columns; the result is a NumPy array with one column per entry of num_cols.
scaler = StandardScaler().fit(df[num_cols])
df_scaled = scaler.transform(df[num_cols])

In [20]:
# Sanity check: per-column mean, then per-column standard deviation, one per line.
print(df_scaled.mean(axis=0), df_scaled.std(axis=0), sep='\n')


[-1.80069049e-16 -1.10718132e-15  3.23637616e-16  6.42408501e-16
  7.93277164e-16 -2.15839523e-15  4.49685951e-15  7.05676005e-17
 -6.08341383e-17  1.70335587e-17 -4.86673107e-17  2.55503381e-16
  3.92988534e-16  6.08341383e-18 -4.86673107e-18 -9.76996262e-16
 -3.89338485e-17  0.00000000e+00 -5.37773783e-16 -1.41743542e-16
 -8.76011592e-17  0.00000000e+00  9.79429627e-17 -1.16801546e-16
  9.79003788e-15  7.90843798e-17  1.14368180e-16  1.94669243e-17
  3.77171658e-17  3.16337519e-17 -3.16337519e-17 -1.46001932e-17
  0.00000000e+00 -4.62339451e-17  7.54343315e-17  3.56743554e-14
  2.19002898e-17]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.]


In [22]:
# Per-column extremes of the standardized features.
for label, column_values in (
    ('Min values (Numerical features): ', df_scaled.min(axis=0)),
    ('Max values (Numerical features): ', df_scaled.max(axis=0)),
):
    print(label, column_values)


Min values (Numerical features):  [-0.91879819 -2.6146489  -2.01078328 -2.38512273 -2.53676851 -2.7910231
 -1.6893685  -0.64055264 -1.02866187 -0.25947536 -1.33138557 -2.65220992
 -2.3448989  -0.80051196 -0.05756644 -2.50624854 -0.83338295  0.
 -1.07083223 -0.76865888 -2.47159407  0.         -2.35592294 -0.96143515
 -2.75286376 -2.39217395 -2.29275568 -0.79620226 -0.79732003 -0.31744801
 -0.02659934 -0.18571663  0.         -0.16059059 -1.96911145 -1.36765473
 -2.14470335]
Max values (Numerical features):  [ 2.95351641  3.58678868  6.33922307  2.28226905  2.46000248  1.30165634
  1.21784273  3.85445484  2.90109426  6.35499534  2.91821594  3.00405207
  3.03690018  2.79185977 24.01161914  3.06667769  1.21322416  0.
  2.65525305  1.31174871  2.84476893  0.          2.47026939  2.21785608
  1.32437141  1.68666952  2.82252852  3.22163537  3.731711    4.82156446
 38.19664899  7.06709147  0.         12.30841506  2.10089239  1.64520971
  3.37576873]


In [23]:
# Export the standardized data as CSV.
# Uses the same layout as the normalized export: Id first, then the scaled
# numeric features, then every column of df that was not scaled. Without this
# the file would contain only the numeric columns and could not be joined back
# to its Id or used together with the remaining columns.
df_scaled = pd.DataFrame(df_scaled, columns=num_cols, index=df.index)
df_scaled = pd.concat([df_id, df_scaled, df.drop(columns=num_cols)], axis=1)
df_scaled.to_csv(OUTPUT_STANDARDIZED, index=False)

## Normalizando os dados (MinMaxScaler)

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
# Fit the MinMaxScaler on the numeric features, then transform those same
# columns; the result is a NumPy array with one column per entry of num_cols.
scaler = MinMaxScaler().fit(df[num_cols])
data_scaled = scaler.transform(df[num_cols])

In [21]:
# Per-column mean and standard deviation after min-max scaling.
for label, column_values in (
    ('means (Numerical features): ', data_scaled.mean(axis=0)),
    ('std (Numerical features): ', data_scaled.std(axis=0)),
):
    print(label, column_values)

means (Numerical features):  [0.21704271 0.16797931 0.04308036 0.56659056 0.57191781 0.71933194
 0.58109589 0.06480329 0.07860378 0.03158027 0.24282552 0.17306538
 0.19013922 0.16803509 0.01021769 0.22258171 0.14178082 0.5216895
 0.19143836 0.35830479 0.34885845 0.37648402 0.2043379  0.7136924
 0.44178082 0.3335544  0.10997027 0.08530215 0.03977194 0.00671179
 0.031377   0.00373835 0.00280574 0.48381071 0.45393836]
std (Numerical features):  [0.24874166 0.0753989  0.04663744 0.15361365 0.13905227 0.21878666
 0.34397225 0.11281704 0.08078346 0.10940571 0.18909058 0.07177661
 0.08867722 0.21132151 0.08497627 0.0989639  0.17291096 0.1835757
 0.25135657 0.10193733 0.07342091 0.13540305 0.21481519 0.21805786
 0.18676476 0.15072751 0.14620287 0.1210847  0.11068517 0.05769151
 0.11612149 0.05442215 0.03199697 0.24570001 0.33191005]


In [22]:
# Per-column extremes after min-max scaling.
for label, column_values in (
    ('Min (Numerical features): ', data_scaled.min(axis=0)),
    ('Max (Numerical features): ', data_scaled.max(axis=0)),
):
    print(label, column_values)


Min (Numerical features):  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Max (Numerical features):  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [23]:
# Export the normalized data as CSV.
# Final column layout: Id, then the scaled numeric features, then every column
# of df that was not scaled.
df_scaled = pd.concat(
    [
        df_id,
        pd.DataFrame(data_scaled, columns=num_cols),
        df.drop(columns=num_cols),
    ],
    axis=1,
)

df_scaled.to_csv(OUTPUT_NORMALIZED, index=False)

In [24]:
# Display the assembled df_scaled frame as this cell's output.
df_scaled

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,1,0.235294,0.150685,0.033420,0.666667,0.500,0.949275,0.883333,0.122500,0.125089,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,2,0.000000,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.000000,0.173281,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500
2,3,0.235294,0.160959,0.046507,0.666667,0.500,0.934783,0.866667,0.101250,0.086109,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,4,0.294118,0.133562,0.038561,0.666667,0.500,0.311594,0.333333,0.000000,0.038271,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140000
4,5,0.235294,0.215753,0.060576,0.777778,0.500,0.927536,0.833333,0.218750,0.116052,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.235294,0.140411,0.030929,0.555556,0.500,0.920290,0.833333,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,175000
1456,1457,0.000000,0.219178,0.055505,0.555556,0.625,0.768116,0.633333,0.074375,0.139972,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,210000
1457,1458,0.294118,0.154110,0.036187,0.666667,1.000,0.500000,0.933333,0.000000,0.048724,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,266500
1458,1459,0.000000,0.160959,0.039342,0.444444,0.625,0.565217,0.766667,0.000000,0.008682,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,142125


## RobustScaler

In [29]:
from sklearn.preprocessing import RobustScaler

In [30]:
# Fit the RobustScaler on the numeric features, then transform those same
# columns; the result is a NumPy array with one column per entry of num_cols.
scaler = RobustScaler().fit(df[num_cols])
data_scaled = scaler.transform(df[num_cols])

In [31]:
# Per-column mean and standard deviation after robust scaling.
for label, column_values in (
    ('means (Numerical features): ', data_scaled.mean(axis=0)),
    ('std (Numerical features): ', data_scaled.std(axis=0)),
):
    print(label, column_values)

means (Numerical features):  [ 8.04577465e-02 -3.21204250e-16  7.67578028e-02  3.30549756e-02
  5.38407821e-01 -2.53348515e-02 -2.46871529e-01  6.11925209e-01
  6.76581281e-02  1.88296507e+01  1.26750124e-01  1.05628761e-01
  1.19447474e-01  4.66662310e-01  2.87004864e-01  3.70738510e-02
  4.07202216e-01  0.00000000e+00 -4.25223983e-01  3.69475138e-01
 -1.40387275e-01  0.00000000e+00  2.08537439e-01 -3.95189003e-01
  8.09094040e-17 -2.40549828e-01 -3.31677304e-02  5.29673899e-01
  2.32813140e-01  1.14278536e+01  1.60055672e-02  4.22501797e+00
  0.00000000e+00  1.67429359e+01  1.07305936e-01 -9.21232877e-02
  1.53156253e-01]
std (Numerical features):  [  0.90385218   1.01231008   1.0855068    0.64275727   1.00064622
   0.63740532   0.55779284   0.95530823   0.59836501  72.56816487
   0.72913701   0.81623207   0.70897737   0.58295483   4.98562777
   0.73908667   0.48861357   0.           0.53675637   0.48067504
   0.75239407   0.           0.72520939   0.62907103   0.61961493
   0.735502

In [32]:
# Per-column extremes after robust scaling.
for label, column_values in (
    ('Min (Numerical features): ', data_scaled.min(axis=0)),
    ('Max (Numerical features): ', data_scaled.max(axis=0)),
):
    print(label, column_values)

Min (Numerical features):  [-0.75       -2.64683544 -2.10596112 -1.5        -2.         -1.80434783
 -1.18918919  0.         -0.54785714  0.         -0.84401237 -2.05919003
 -1.54303279  0.          0.         -1.81526104  0.          0.
 -1.          0.         -2.          0.         -1.5        -1.
 -1.70571549 -2.         -1.9689441   0.         -0.40322581  0.
  0.          0.          0.          0.         -1.66666667 -1.
 -1.6523702 ]
Max (Numerical features):  [2.75000000e+00 3.63094233e+00 6.95802755e+00 1.50000000e+00
 3.00000000e+00 8.04347826e-01 4.32432432e-01 4.29411765e+00
 1.80357143e+00 4.80000000e+02 2.25452939e+00 2.55763240e+00
 2.27254098e+00 2.09419044e+00 1.20000000e+02 2.30361446e+00
 1.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
 2.00000000e+00 0.00000000e+00 2.00000000e+00 1.00000000e+00
 8.20600299e-01 1.00000000e+00 2.34989648e+00 2.67286822e+00
 3.20967742e+00 1.85000000e+02 2.30000000e+01 1.65000000e+02
 0.00000000e+00 1.30000000e+03 2.00000

In [33]:
# Export the RobustScaler output as CSV.
# Uses the same layout as the normalized export: Id first, then the scaled
# numeric features, then every column of df that was not scaled. Without this
# the file would contain only the numeric columns and could not be joined back
# to its Id or used together with the remaining columns.
df_scaled = pd.DataFrame(data_scaled, columns=num_cols, index=df.index)
df_scaled = pd.concat([df_id, df_scaled, df.drop(columns=num_cols)], axis=1)
df_scaled.to_csv(OUTPUT_ROBUST, index=False)