In [3]:
import pandas as pd
import numpy as np
import matplotlib as mpl

In [4]:
# Load the preprocessed dataset from its pickle file.
input_path = 'dataset.pkl'
df = pd.read_pickle(input_path)

In [5]:
# Fill NaNs
# Missing covered surface falls back to the total surface of the same row.
df["surface_covered_in_m2"] = df["surface_covered_in_m2"].fillna(df["surface_total_in_m2"])
# Constant defaults: floor 0, 1 room, no expenses.
df["floor"] = df["floor"].fillna(0)
df["rooms"] = df["rooms"].fillna(1)
df["expenses"] = df["expenses"].fillna(0)
# Impute coordinates with the mean of the (state, place) group first and, for
# rows still missing, with the mean of the whole state.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: in-place mutation of a column selection
# is chained assignment, which warns on pandas 2.x and does not update `df`
# once Copy-on-Write is enabled.
for coord in ("lat", "lon"):
    for group_keys in (["state_name", "place_name"], ["state_name"]):
        df[coord] = df[coord].fillna(df.groupby(group_keys)[coord].transform("mean"))
# Drop columns that are not usable downstream
df.drop(['geonames_id','properati_url'], axis=1, inplace=True)
# Discard every row that still has a missing value in any column.
df.dropna(how='any', inplace=True)

In [6]:
# Final cleanup and sanity check: drop any row with a missing value, then
# show the per-column NaN count.
df.dropna(axis=0, how='any', inplace=True)
nan_counts = df.isna().sum()
display(nan_counts)

property_type            0
place_name               0
state_name               0
lat                      0
lon                      0
price_aprox_usd          0
surface_total_in_m2      0
surface_covered_in_m2    0
price_usd_per_m2         0
floor                    0
rooms                    0
expenses                 0
description              0
title                    0
zona                     0
dtype: int64

In [7]:
# Funciones para extraer variables de Description & Title
import re
def searchReg(exp):
    """Return a boolean Series marking rows of the global `df` whose title or
    description contains the regular expression `exp` (case-insensitive)."""
    options = dict(case=False, regex=True)
    in_title = df['title'].str.contains(exp, **options)
    in_description = df['description'].str.contains(exp, **options)
    return in_title | in_description

def extractCat(exp, catName=False):
    """Build a 0/1 indicator for the regular expression `exp`.

    Returns a `(name, flags)` tuple: `name` is `catName`, or `'has_' + exp`
    when `catName` compares equal to False; `flags` is the result of
    `searchReg(exp)` cast to int.
    """
    label = 'has_' + exp if catName == False else catName
    return label, searchReg(exp).astype(int)

def test_addDmmy(termns):
    for i in termns:
        print (extractCat(i[0])[0], extractCat(i[0])[1].sum());

def addDmmy(termns):
    """Add one indicator column to the global `df` per entry of `termns`.

    Each entry is indexed as `[regex, column_name]`: the flags computed by
    `extractCat(regex)` are stored in `df[column_name]`.
    """
    for entry in termns:
        flags = extractCat(entry[0])[1]
        df[entry[1]] = flags

In [8]:
# Add dummy columns to the DataFrame. Each entry is [regex, column name]; the
# regex is searched case-insensitively in both title and description.
# Patterns containing backslashes are raw strings so that `\b` is a regex word
# boundary rather than a backspace character.
addDmmy([
    ['subte','subte'],
    ['colectivos','colectivo'],
    ['balc[oó]n','balcon'],
    # Word boundaries keep "tren" from matching inside "estrenar".
    [r'\btren(?:es)?\b','tren'],
    ['port[oó]n','porton'],
    ['(?:financiaci[oó]n)|(?:financiamiento)','financia'],
    ['parr?ill?a', 'parrilla'],
    ['(?:pileta)|(?:piscina)', 'pileta'],
    ['solarium','solarium'],
    ['lavadero|laundry','lavadero'],
    ['estacionamiento','estacionamiento'],
    ['NO (?:ES )?APTO CR[EÉ]DITO', 'no_apto_credito'],
    # Lookbehinds exclude the negated forms, which otherwise set both
    # `apto_credito` and `no_apto_credito` on the same listing.
    ['(?<!NO )(?<!NO ES )APTO CR[EÉ]DITO', 'apto_credito'],
    ['amplio living','amplioliving'],
    ['cocheras?','cocheras'],
    [' frente[ .,]', 'frente'],
    ['contrafrente', 'contrafrente'],
    ['(?:seguridad)|(?:control)', 'seguridad'],
    ['amenities|amenitys', 'amenities'],
    # Matching is case-insensitive, so a bare "SUM" would also hit words such
    # as "consumo" or "sumamente"; require it to be a standalone token.
    [r'\bSUM\b|m[uú]ltiples', 'SUM'],
    # `s?` makes the plural optional (`[s]` required it).
    ['espacios? verdes?','espaciosverdes'],
    ['jacuzzi','jacuzzi'],
    ['estrenar', 'estrenar'],
    ['profesional','aptoprofesional'],
    ['pozo','pozo'],
    # Accents are still present in title/description at this point.
    ['categor[ií]a','categoria'],
    ['reciclado','reciclado'],
    ['luminoso','luminoso'],
    ['acondicionado','acondicionado'],
    ['quincho','quincho'],
    ['escalera','escalera']
])

In [9]:
# Merge title and description into a single `texto` column for vectorizing,
# then remove the two source columns.
combined_text = df['title'] + ' ' + df['description']
df['texto'] = combined_text
df.drop(columns=['description', 'title'], inplace=True)

In [10]:
# Limpiar caracteres especiales de Texto
import re
from unicodedata import normalize

# A base character followed by one or more combining marks (U+0300-U+036F).
# The base is captured; an "n" directly followed by a lone combining tilde
# (U+0303) is excluded so that "ñ"/"Ñ" survive. Compiled once with explicit
# flags: passing `count`/`flags` to `re.sub` positionally is deprecated since
# Python 3.13.
_DIACRITICS_RE = re.compile(
    r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
    re.I,
)


def limpiar(x):
    """Strip diacritics from the string `x` while preserving "ñ"/"Ñ".

    The text is decomposed to NFD, combining marks are removed from their
    base characters, and the result is recomposed to NFC.

    Parameters:
        x: str to clean.

    Returns:
        str without accents/diaeresis, e.g. "Balcón" -> "Balcon",
        "año" -> "año".
    """
    decomposed = normalize("NFD", x)
    stripped = _DIACRITICS_RE.sub(r"\1", decomposed)
    return normalize("NFC", stripped)

# Clean every entry of the `texto` column with `limpiar`.
df['texto'] = df['texto'].apply(limpiar)

In [11]:
# Persist the resulting DataFrame as a pickle file.
output_path = 'DatasetFinalParaEntrenar.pkl'
df.to_pickle(output_path)