In [1]:
# Load (or reload) IPython's autoreload extension and set mode 2, so that
# edits to imported modules (e.g. the local `src` package) are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Dependencias

In [2]:
import pandas as pd 
import numpy as np 
import sys
sys.path.append("../")
import src.constants as c
from src.preprocessing import merge_similar_categories, CustomTransformer
from src.utils import save_to_pickle, read_pickle
from sklearn.preprocessing import StandardScaler

## Lectura de información

In [3]:
# Load the persisted (features, target) pairs for the v1 train and test
# splits; the pickle paths come from src.constants (imported as `c`).
# NOTE(review): X_test / y_test are not used in the cells visible in this
# section — confirm they are needed here.
X_train, y_train = read_pickle(c.train_data_v1)
X_test, y_test = read_pickle(c.test_data_v1)

In [4]:
# Load the three persisted column groupings in one pass. They are handed to
# CustomTransformer below as the continuous variables (`numericas`), the
# discrete variables (`categoricas`) and the variables to eliminate (`drop`).
numericas, categoricas, drop = (
    read_pickle(path) for path in (c.numericas, c.categoricas, c.drop)
)

## Uniendo niveles

In [5]:
# Build the project's preprocessing transformer from the column groupings
# loaded above. The keyword `continous_variables` is spelled as the
# CustomTransformer API expects it — do not "correct" it here.
transformer = CustomTransformer(
    variables_to_eliminate=drop,
    discrete_variables=categoricas,
    continous_variables=numericas
)

In [6]:
# Fit the transformer on the training split only (features and target).
transformer.fit(X_train, y_train)

In [7]:
# Inclusive bounds on the target; training rows whose target falls outside
# [TARGET_MIN, TARGET_MAX] are discarded below.
# NOTE(review): the origin of these two cut-offs is not shown in this
# notebook — document where they come from.
TARGET_MIN = 95_000.0
TARGET_MAX = 3_400_000.0

# Transform the training features in training mode, then subset the target
# with the transformer's `final_mask` (presumably the rows the transformer
# kept — confirm against CustomTransformer).
X_train_transformed = transformer.transform(X_train, training=True)
y_train_masked = y_train[transformer.final_mask]

mask = (TARGET_MIN <= y_train_masked) & (y_train_masked <= TARGET_MAX)

# .copy() makes the filtered frame own its data, so later column assignments
# on it do not write into a slice of the unfiltered frame (which is what
# triggers pandas' SettingWithCopyWarning).
X_train_transformed = X_train_transformed[mask].copy()
y_train_masked = y_train_masked[mask]

display(X_train_transformed.head())

# Share of the original training rows that survive both the transformer's
# mask and the target-range filter.
percentage = len(X_train_transformed) / len(X_train) * 100
print(f"Percentage of kept rows: {percentage :.2f}%")

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,lat,sqft_living15,sqft_lot15,zipcode_1,...,grade_5,grade_6,grade_7,floors_1,floors_2,floors_3,view_1,view_2,view_3,waterfront_1
6181,4.0,1.75,2090.0,7416.0,1050.0,1040.0,47.4107,1710.0,7527.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4688,3.0,2.5,1450.0,5175.0,1030.0,420.0,47.7082,1740.0,9250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1703,3.0,1.75,3020.0,360241.0,3020.0,0.0,47.2662,1890.0,209959.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
21200,3.0,1.75,1370.0,1990.0,1370.0,0.0,47.6434,1730.0,1990.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18874,3.0,1.75,1330.0,7216.0,1330.0,0.0,47.7199,1500.0,8000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Percentage of kept rows: 98.80%


In [8]:
# Inspect the mapping, per feature, from each tuple of merged original levels
# to its new integer code. The attribute name is spelled `dict_merged_levesl`
# (sic) in CustomTransformer, so it must be referenced with that spelling.
transformer.dict_merged_levesl

{'zipcode': {(98002, 98032, 98168): 0,
  (98001,
   98003,
   98022,
   98023,
   98030,
   98031,
   98055,
   98148,
   98178,
   98188,
   98198): 1,
  (98042, 98092, 98106): 2,
  (98038, 98058, 98108, 98146): 3,
  (98133,): 4,
  (98118, 98126): 5,
  (98010,
   98011,
   98014,
   98019,
   98028,
   98045,
   98056,
   98070,
   98125,
   98155,
   98166): 6,
  (98034, 98059, 98065, 98136): 7,
  (98072, 98103, 98107, 98117): 8,
  (98007,
   98008,
   98024,
   98027,
   98029,
   98052,
   98077,
   98115,
   98116,
   98122,
   98144,
   98177): 9,
  (98053, 98074): 10,
  (98005, 98006, 98033, 98075, 98102, 98105, 98109, 98119, 98199): 11,
  (98112,): 12,
  (98004, 98040): 13,
  (98039,): 14},
 'grade': {(1, 3, 4, 5): 0,
  (6,): 1,
  (7,): 2,
  (8,): 3,
  (9,): 4,
  (10,): 5,
  (11,): 6,
  (12, 13): 7},
 'floors': {(1.0, 3.5): 0, (1.5, 3.0): 1, (2.0,): 2, (2.5,): 3},
 'view': {(0,): 0, (1, 2): 1, (3,): 2, (4,): 3},
 'waterfront': {(0,): 0, (1,): 1}}

In [9]:
# For every feature with merged levels, print a banner with the feature name
# and render that feature's summary table stored in `transformer.dict_tables`.
rule = "=" * 120
for feature, table in transformer.dict_tables.items():
    print(rule, f"Merged levels for {feature}", rule, sep="\n")
    display(table)

Merged levels for zipcode


Unnamed: 0,mean_y,variance_y,n
"(98002, 98032, 98168)",241901.6,2184602000.0,544
"(98001, 98003, 98022, 98023, 98030, 98031, 98055, 98148, 98178, 98188, 98198)",296465.5,2176386000.0,2609
"(98042, 98092, 98106)",320584.8,2657636000.0,1115
"(98038, 98058, 98108, 98146)",358165.0,8967649000.0,1360
"(98133,)",386203.0,10030830000.0,446
"(98118, 98126)",415392.1,10035370000.0,773
"(98010, 98011, 98014, 98019, 98028, 98045, 98056, 98070, 98125, 98155, 98166)",446913.8,6658178000.0,2473
"(98034, 98059, 98065, 98136)",519906.2,11332380000.0,1424
"(98072, 98103, 98107, 98117)",576030.0,11581070000.0,1532
"(98007, 98008, 98024, 98027, 98029, 98052, 98077, 98115, 98116, 98122, 98144, 98177)",628559.9,13537740000.0,3430


Merged levels for grade


Unnamed: 0,mean_y,variance_y,n
"(1, 3, 4, 5)",244994.9,12668840000.0,244
"(6,)",302069.3,14509900000.0,1831
"(7,)",402654.6,24362860000.0,8108
"(8,)",540805.6,46882280000.0,5443
"(9,)",771699.5,99819740000.0,2353
"(10,)",1075007.0,234035500000.0,1026
"(11,)",1486759.0,490466000000.0,353
"(12, 13)",2361449.0,737541600000.0,93


Merged levels for floors


Unnamed: 0,mean_y,variance_y,n
"(1.0, 3.5)",441548.3,857349500.0,9585
"(1.5, 3.0)",563129.8,27434260000.0,2276
"(2.0,)",647090.5,185701100000.0,7443
"(2.5,)",1048870.0,734542900000.0,147


Merged levels for view


Unnamed: 0,mean_y,variance_y,n
"(0,)",496034.2,82477780000.0,17539
"(1, 2)",797624.6,205102800000.0,1171
"(3,)",962347.0,385007100000.0,455
"(4,)",1442562.0,862307100000.0,286


Merged levels for waterfront


Unnamed: 0,mean_y,variance_y,n
"(0,)",530848.9,115931200000.0,19306
"(1,)",1626434.0,1298163000000.0,145


## Estandarización

In [10]:
# Standardise the continuous columns with a scaler fitted on the training
# data, writing the scaled values back into the same columns.
# NOTE: this overwrites X_train_transformed in place and refits the scaler,
# so re-running this cell alone would scale already-scaled data; re-run the
# transform cell first.
scaler = StandardScaler()
cols = transformer.continous_variables

scaled_values = scaler.fit_transform(X_train_transformed[cols])

# Re-wrap the scaler's output with the original column labels and row index
# so the assignment aligns with the frame.
X_train_transformed[cols] = pd.DataFrame(
    scaled_values,
    columns=cols,
    index=X_train_transformed.index,
)

## Guardando información

In [11]:
# Persist the outputs of this notebook to the paths defined in src.constants:
#   - the processed training pair [features, target] as the v2 training data,
#   - the fitted CustomTransformer,
#   - the fitted StandardScaler.
save_to_pickle([X_train_transformed, y_train_masked], c.train_data_v2)
save_to_pickle(transformer, c.transformer)
save_to_pickle(scaler, c.scaler)