In [105]:
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [106]:

df = pd.read_csv('./data/train.csv')


In [107]:
def normalizar_descripciones(descripcion):
    
    descripcion = descripcion.replace(',', ' ')
    descripcion = descripcion.replace('.', ' ')
    descripcion = descripcion.replace('<', ' ')
    descripcion = descripcion.replace('>', ' ')
    descripcion = descripcion.replace('\n', ' ')
    descripcion = descripcion.replace('\strong', ' ')
    descripcion = descripcion.replace('\bold', ' ')
    descripcion = descripcion.lower()
    
    return descripcion

def contar_palabras(titulo, palabras):
    
    contadas = 0
    
    for palabra in palabras:
        spliteado = titulo.split()
        contadas += spliteado.count(palabra)
        
    return contadas

In [108]:
df.loc[df['direccion'] == 'Calle', 'direccion'] = 0
df.loc[df['direccion'] == '1', 'direccion'] = 0
df.loc[df['direccion'] == '.', 'direccion'] = 0
df.loc[df['direccion'] == '-', 'direccion'] = 0
df.loc[df['direccion'] != 0, 'direccion'] = 1

df['titulo'] = df['titulo'].fillna(value = ".")
df['titulo'] = df['titulo'].apply(normalizar_descripciones)
df['descripcion'] = df['descripcion'].fillna(value = ".")
df['descripcion'] = df['descripcion'].apply(normalizar_descripciones)

In [109]:
df['calefaccion'] = df['titulo'].apply(contar_palabras, palabras=["calefaccion", "calefacción", "calefaccionado", "aireacondicinado",
                    "acondicionado", "estufa","chimenea"]) +df['descripcion'].apply(contar_palabras, palabras=["calefaccion", 
                    "calefacción", "calefaccionado", "aireacondicinado", "acondicionado", "estufa","chimenea"])

df['suite'] = df['titulo'].apply(contar_palabras, palabras=["suite"])+df['descripcion'].apply(contar_palabras, palabras=["suite"])

df['avenida'] = df['titulo'].apply(contar_palabras, palabras=["avenida", "av", "avenidas", "bulevar", 
               "boulevard", "paseo", "vía"])+df['descripcion'].apply(contar_palabras, palabras=["avenida", "av", 
                "avenidas", "bulevar","boulevard", "paseo", "vía"])

df['gim'] = df['titulo'].apply(contar_palabras, palabras=["gimnasio", "gimnásio", "entrenamiento", "gim", "gym", "fit",
            "ejercicio", "gimnasia","atletismo", "cancha"])+df['descripcion'].apply(contar_palabras, palabras=["gimnasio",
            "gimnásio", "entrenamiento", "gim", "gym", "fit","ejercicio", "gimnasia","atletismo", "cancha"])

df['cochera'] = df['titulo'].apply(contar_palabras, palabras=["cochera", "cocheras", "garage", 
                "garages", "garaje", "garajes"])+df['descripcion'].apply(contar_palabras, palabras=["cochera", 
                "cocheras", "garage","garages", "garaje", "garajes"])

df['ubicacion'] = df['titulo'].apply(contar_palabras, palabras=["ubicacion", "ubicación", "locacion", 
                "locación", "localizacion", "localización","ubicado", "ubicada", "centro", "centrico",
                "centrica", "céntrico", "céntrica","central"])+df['descripcion'].apply(contar_palabras, palabras=["ubicacion", "ubicación", "locacion", 
                "locación", "localizacion", "localización","ubicado", "ubicada", "centro", "centrico",
                "centrica", "céntrico", "céntrica","central"])

df['balcon'] = df['titulo'].apply(contar_palabras, palabras=["balcon", "balcón", "terraza", "palco", "mirador", 
            "balconcillo","azotea", "solana"])+df['descripcion'].apply(contar_palabras, palabras=["balcon", "balcón", 
            "terraza", "palco", "mirador", "balconcillo","azotea", "solana"])

df['camaraseg'] = df['titulo'].apply(contar_palabras, palabras=["camara", "cámara", "cámaras", "camaras","seguridad",
                "guardia","seguro","protegido"]) + df['descripcion'].apply(contar_palabras, palabras=["camara", "cámara",
                "cámaras", "camaras","seguridad","guardia","seguro","protegido"])

df['parque'] = df['titulo'].apply(contar_palabras, palabras=["parque", "plaza", "plazoleta", "glorieta","jardin",
            "jardín","patio"])+df['descripcion'].apply(contar_palabras, palabras=["parque", "plaza", "plazoleta", 
            "glorieta","jardin","jardín","patio"])

df['amoblado'] = df['titulo'].apply(contar_palabras, palabras=["muebles", "amoblado", "mueble","decorado",
                "listo"]) + df['descripcion'].apply(contar_palabras, palabras=["muebles", "amoblado", "mueble",
                "decorado","listo"])

df['bañera'] = df['titulo'].apply(contar_palabras, palabras=["bañera", "hidromasaje", "hidro", "tina",
            "jacuzzi","jacuzi","yacuzi"]) + df['descripcion'].apply(contar_palabras, palabras=["bañera", "hidromasaje", 
            "hidro", "tina","jacuzzi","jacuzi","yacuzi"])


df['estreno'] = df['titulo'].apply(contar_palabras, palabras=["nuevo", "nueva", "estrenar","estreno",
            "innovador"]) + df['descripcion'].apply(contar_palabras, palabras=["nuevo", "nueva", "estrenar","estreno",
            "innovador"])

df['transporte'] = df['titulo'].apply(contar_palabras, palabras=["subte", "subterraneo", "subterráneo","metro", 
                "estacion", "estación", "tren","subestacion", "subestación", "ferrocarril","metrobús", "metrobus", 
                "trolebus","trolebús", "bus", "bús"]) + df['descripcion'].apply(contar_palabras, palabras=["subte", 
                "subterraneo", "subterráneo","metro", "estacion", "estación", "tren","subestacion", "subestación", 
                "ferrocarril","metrobús", "metrobus", "trolebus","trolebús", "bus", "bús"])

df['pileta'] = df['titulo'].apply(contar_palabras, palabras=["piscina", "pileta", "nado"])+ df['descripcion'].apply(contar_palabras, palabras=["piscina", "pileta", "nado"])
                                       
df['lujo'] = df['titulo'].apply(contar_palabras, palabras=["lujo", "delujo", "deluxe", "delúxe", "lujosa", "lujoso", 
            "lujosas", "lujosos","exclusivo","vip"]) + df['descripcion'].apply(contar_palabras, palabras=["lujo", 
            "delujo","deluxe", "delúxe", "lujosa", "lujoso", "lujosas", "lujosos","exclusivo","vip"])

df['humilde'] = df['titulo'].apply(contar_palabras, palabras=["humilde", "economico", "economica", 
            "económico", "económica", "barata", "barato", "accesible", "baratillo",
            "baratilla", "rebajado", "ganga", "asequible", "módico", "módica","credito","crédito","oferta","oferton",
            "imperdible"]) + df['descripcion'].apply(contar_palabras, palabras=["humilde", "economico", "economica", 
            "económico", "económica", "barata", "barato", "accesible", "baratillo",
            "baratilla", "rebajado", "ganga", "asequible", "módico", "módica",
            "credito","crédito","oferta","oferton","imperdible"]) 

df['ventana'] = df['titulo'].apply(contar_palabras, palabras=["ventana", "ventanas", 
            "vista", "ventanal","vistas","cristal"]) + df['descripcion'].apply(contar_palabras, palabras=["ventana",
            "ventanas", "vista", "ventanal","vistas","cristal"])

df['nuevo'] = df['titulo'].apply(contar_palabras, palabras=["reciente", "recien", "recién", "nueva", "nuevo", "nuevas", 
            "nuevos", "estrenar"]) + df['descripcion'].apply(contar_palabras, palabras=["reciente", "recien", "recién", 
            "nueva", "nuevo", "nuevas", "nuevos", "estrenar"])

df['luz'] = df['titulo'].apply(contar_palabras, palabras=["luz", "luminoso", "luminosa","claridad", "luminiscencia", 
        "luminosidad", "iluminación","iluminacion"]) + df['descripcion'].apply(contar_palabras, palabras=["luz", 
        "luminoso", "luminosa","claridad", "luminiscencia","luminosidad", "iluminación","iluminacion"])

df['bueno'] = df['titulo'].apply(contar_palabras, palabras=["bueno", "buena", "buenas", "buenos","excelente", 
            "excelentes","increible","espectacular"]) + df['descripcion'].apply(contar_palabras, palabras=["bueno", 
            "buena", "buenas", "buenos","excelente", "excelentes","increible","espectacular"])

df['contable'] = df['titulo'].apply(contar_palabras, palabras=["precio"]) + df['descripcion'].apply(contar_palabras, palabras=["precio"])

#Nuevos Feat

df['agente'] = df['descripcion'].apply(contar_palabras, palabras=["inmobiliaria", "asesoria", "asesoría", "lider", "re/max", "remax"]) \
             + df['titulo'].apply(contar_palabras, palabras=["inmobiliaria", "asesoria", "asesoría", "lider", "re/max", "remax"])

df['garante'] = df['descripcion'].apply(contar_palabras, palabras=["garante", "garantía", "fiador", "garantizador", "avalista", "garantia",
                "defensor", "garantías", "garantes", "codeudor"]) + df['titulo'].apply(contar_palabras, palabras=["garante", 
                "garantía", "fiador", "garantizador", "avalista", "garantia", "defensor", "garantías", "garantes", "codeudor"])

df['finanza'] =  df['descripcion'].apply(contar_palabras, palabras=["credito", "crédito", "prestamo", "préstamo", "cuotas", "pagos", "hipotecario"\
                 ,"amortizable", "aptocredito", "aptocrédito", "apto-credito", "apto-crédito", "aptocredito", "aptocréditp", "ahorro"]) + \
                 df['titulo'].apply(contar_palabras, palabras=["credito", "crédito", "prestamo", "préstamo", "cuotas", "pagos", "hipotecario" \
                 ,"amortizable", "aptocredito", "aptocrédito", "apto-credito", "apto-crédito", "aptocredito", "aptocrédito", "ahorro"]) 

df['turismo'] =  df['descripcion'].apply(contar_palabras, palabras=["playa", "vacaciones", "descanso", "costa", "arena", "mar", "montaña", "monte",
                                                                   "paisaje", "orilla", "rambla", "turista", "turistas", "cordillera", "sierra"])\
                 + df['titulo'].apply(contar_palabras, palabras=["playa", "vacaciones", "descanso", "costa", "arena", "mar", "montaña", "monte",
                                                                   "paisaje", "orilla", "rambla", "turista", "turistas", "cordillera", "sierra"])

df['longitud_desc'] =  df['descripcion'].apply(lambda x: len(x.split()))

#Nuevas de V4

df['longitud_titulo'] = df['titulo'].apply(lambda x: len(x))

df['cant_!'] = df['titulo'].apply(lambda x: x.count('!')) 

df['tranquilidad'] = df['descripcion'].apply(contar_palabras, palabras=["armonía", "armonia", "tranquilo", "tranqui", "tranquilidad", \
                                                                        "paz", "calma", "calmo", "quietud"])\
                 + df['titulo'].apply(contar_palabras, palabras=["armonía", "armonia", "tranquilo", "tranqui", "tranquilidad", \
                                                                        "paz", "calma", "calmo", "quietud"])

df['reparacion'] = df['descripcion'].apply(contar_palabras, palabras=["reparación", "reparacion", "reparando", "reparar", \
                            "construcción", "construccion", "construyendo", "construllendo", "mantenimiento", "averiado", "averiada",
                                                                     "refaccion","refacción","refacciones"])\
                 + df['titulo'].apply(contar_palabras, palabras=["reparación", "reparacion", "reparando", "reparar", \
                            "construcción", "construccion", "construyendo", "construllendo", "mantenimiento", "averiado", "averiada"
                                                                     "refaccion","refacción","refacciones"])

df['mascotas'] = df['descripcion'].apply(contar_palabras, palabras=["mascotas", "mascota", "perros", "perro", "perra", \
                                                "gatos", "gato", "gata", "animal", "animales"])\
                 + df['titulo'].apply(contar_palabras, palabras=["mascotas", "mascota", "perros", "perro", "perra", \
                                                "gatos", "gato", "gata", "animal", "animales"])

df['accesibilidad'] = df['descripcion'].apply(contar_palabras, palabras=["rampa", "discapacitados", "discapacitado", \
                                                "discapacitada", "lisiado", "lisiada", "silla de ruedas", "lastimado", "heridos"])\
                 + df['titulo'].apply(contar_palabras, palabras=["rampa", "discapacitados", "discapacitado", \
                                                "discapacitada", "lisiado", "lisiada", "silla de ruedas", "lastimado", "heridos"])

df['normas'] =  df['descripcion'].apply(contar_palabras, palabras=["norma", "normas", "regla", "reglas", \
                                                "prohibido", "prohibida", "denegado", "denegada", "imposible", \
                                                "ilegal", "legal", "multa", "infraccion", "infracción"])\
                 + df['titulo'].apply(contar_palabras, palabras=["norma", "normas", "regla", "reglas", \
                                                "prohibido", "prohibida", "denegado", "denegada", "imposible", \
                                                "ilegal", "legal", "multa", "infraccion", "infracción"])

df['beneficios'] = df['descripcion'].apply(contar_palabras, palabras=["gratis", "free", "incluido", "incluye", \
                                                "agregado", "gratuito", "gratuitamente", \
                                                "regalo"])\
                 + df['titulo'].apply(contar_palabras, palabras=["gratis", "free", "incluido", "incluye", \
                                                "agregado", "gratuito", "gratuitamente", \
                                                "regalo"])

df['conexion'] = df['descripcion'].apply(contar_palabras, palabras=["wifi", "wi-fi", "internet", "conexión", "conexion", \
                                                                   "señal"])\
                 + df['titulo'].apply(contar_palabras, palabras=["wifi", "wi-fi", "internet", "conexión", "conexion", \
                                                                   "señal"])

df['servicios_desc'] = df['descripcion'].apply(contar_palabras, palabras = ["servicio", "servicios"])

df['metros_desc'] = df['descripcion'].apply(contar_palabras, palabras = ["metros","m2"])

df['acabados'] = df['descripcion'].apply(contar_palabras, palabras = ["acabados","acabado","terminacion","terminación"])

df['plusvalia'] = df['descripcion'].apply(contar_palabras, palabras = ["plusvalia", "plusvalía"])

#Nuevos de V5

df['cocina'] = df['descripcion'].apply(contar_palabras, palabras=['cocina', 'cocinas'])

df['alberca'] = df['descripcion'].apply(contar_palabras, palabras=['alberca'])

df['negacion'] = df['descripcion'].apply(contar_palabras, palabras = ["no"])

df['variospisos'] = df['descripcion'].apply(contar_palabras, palabras = ["escalera","escaleras",
                    "ascensor", "elevador", "escalinata", "gradas", "escalerilla"])

df['vestidor'] = df['descripcion'].apply(contar_palabras, palabras = ["vestidor"])

In [110]:
df['metrosdescubiertos'] = abs(df['metrostotales']-df['metroscubiertos'])
df['relacionmetros'] = df['metrostotales']/df['metroscubiertos']

In [113]:
train = df

In [114]:

# preparo el modelo 
y = train.precio.values
train.drop(['id', 'precio','descripcion','titulo','fecha','tipodepropiedad','provincia','ciudad'],inplace = True,axis=1)
x = train.values
train

Unnamed: 0,direccion,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,...,metros_desc,acabados,plusvalia,cocina,alberca,negacion,variospisos,vestidor,metrosdescubiertos,relacionmetros
0,1,,2.0,1.0,2.0,80.0,80.0,23533.0,,,...,0,0,0,1,0,0,0,0,0.0,1.000000
1,1,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,19.310205,-99.227655,...,0,0,0,1,0,0,0,1,88.0,0.671642
2,1,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,,,...,0,0,0,1,0,0,0,0,22.0,1.152778
3,1,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,19.301890,-99.688015,...,0,0,0,0,0,0,0,0,4.0,1.063492
4,1,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,,,...,0,0,0,0,0,0,0,0,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239995,1,0.0,2.0,2.0,1.0,67.0,,53666.0,,,...,0,0,0,1,0,1,0,0,,
239996,1,0.0,3.0,3.0,3.0,200.0,250.0,51954.0,19.294665,-99.692916,...,0,0,0,0,0,0,0,0,50.0,1.250000
239997,1,20.0,2.0,1.0,2.0,138.0,138.0,50003995.0,,,...,0,0,0,1,0,0,0,0,0.0,1.000000
239998,1,20.0,4.0,0.0,4.0,235.0,137.0,24162.0,19.366651,-99.082246,...,0,0,0,1,0,0,0,0,98.0,0.582979


In [115]:

# divido el df 

x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42)#, stratify=y)


# creo LightGBM data containers

categorical_features = [c for c, col in enumerate(train.columns) if 'cat' in col]
train_data = lightgbm.Dataset(x, label=y, categorical_feature=categorical_features)
test_data = lightgbm.Dataset(x_test, label=y_test)



#Lo entreno

params = {
    'n_estimators' : 500,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l1', 'l2'},
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 326,
    'min_child_weight': 10000.0,
    'reg_alpha': 50,
    'reg_lambda': 0,
    'verbose': 0
}

model = lightgbm.train(params,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)



New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's l2: 4.52999e+12	valid_0's l1: 1.58335e+06
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 4.35854e+12	valid_0's l1: 1.54591e+06
[3]	valid_0's l2: 4.21002e+12	valid_0's l1: 1.51272e+06
[4]	valid_0's l2: 4.0726e+12	valid_0's l1: 1.48151e+06
[5]	valid_0's l2: 3.9442e+12	valid_0's l1: 1.45161e+06
[6]	valid_0's l2: 3.83082e+12	valid_0's l1: 1.42416e+06
[7]	valid_0's l2: 3.73479e+12	valid_0's l1: 1.4023e+06
[8]	valid_0's l2: 3.63394e+12	valid_0's l1: 1.37833e+06
[9]	valid_0's l2: 3.54246e+12	valid_0's l1: 1.35571e+06
[10]	valid_0's l2: 3.45961e+12	valid_0's l1: 1.33516e+06
[11]	valid_0's l2: 3.37981e+12	valid_0's l1: 1.31425e+06
[12]	valid_0's l2: 3.30893e+12	valid_0's l1: 1.2953e+06
[13]	valid_0's l2: 3.23964e+12	valid_0's l1: 1.27699e+06
[14]	valid_0's l2: 3.17972e+12	valid_0's l1: 1.26071e+06
[15]	valid_0's l2: 3.11933e+12	valid_0's l1: 1.24428e+06
[16]	valid_0's l2: 3.06845e+12	valid_0's l1: 1.22995e+06
[17]	valid_0's l2: 3.0173e+12	valid_0's

[155]	valid_0's l2: 1.79456e+12	valid_0's l1: 856095
[156]	valid_0's l2: 1.79285e+12	valid_0's l1: 855380
[157]	valid_0's l2: 1.7909e+12	valid_0's l1: 854721
[158]	valid_0's l2: 1.78949e+12	valid_0's l1: 854331
[159]	valid_0's l2: 1.78794e+12	valid_0's l1: 853819
[160]	valid_0's l2: 1.78654e+12	valid_0's l1: 853288
[161]	valid_0's l2: 1.78439e+12	valid_0's l1: 852953
[162]	valid_0's l2: 1.78143e+12	valid_0's l1: 852235
[163]	valid_0's l2: 1.78037e+12	valid_0's l1: 851927
[164]	valid_0's l2: 1.77897e+12	valid_0's l1: 851695
[165]	valid_0's l2: 1.77736e+12	valid_0's l1: 851428
[166]	valid_0's l2: 1.77612e+12	valid_0's l1: 851076
[167]	valid_0's l2: 1.77458e+12	valid_0's l1: 850540
[168]	valid_0's l2: 1.77359e+12	valid_0's l1: 850267
[169]	valid_0's l2: 1.77135e+12	valid_0's l1: 849880
[170]	valid_0's l2: 1.77016e+12	valid_0's l1: 849676
[171]	valid_0's l2: 1.76818e+12	valid_0's l1: 849236
[172]	valid_0's l2: 1.76642e+12	valid_0's l1: 848962
[173]	valid_0's l2: 1.76506e+12	valid_0's l1: 8

[318]	valid_0's l2: 1.62722e+12	valid_0's l1: 810345
[319]	valid_0's l2: 1.62655e+12	valid_0's l1: 810054
[320]	valid_0's l2: 1.62606e+12	valid_0's l1: 809884
[321]	valid_0's l2: 1.62546e+12	valid_0's l1: 809758
[322]	valid_0's l2: 1.62505e+12	valid_0's l1: 809715
[323]	valid_0's l2: 1.62441e+12	valid_0's l1: 809709
[324]	valid_0's l2: 1.62384e+12	valid_0's l1: 809574
[325]	valid_0's l2: 1.62328e+12	valid_0's l1: 809368
[326]	valid_0's l2: 1.62259e+12	valid_0's l1: 809160
[327]	valid_0's l2: 1.62204e+12	valid_0's l1: 808989
[328]	valid_0's l2: 1.62128e+12	valid_0's l1: 808877
[329]	valid_0's l2: 1.62036e+12	valid_0's l1: 808814
[330]	valid_0's l2: 1.61986e+12	valid_0's l1: 808674
[331]	valid_0's l2: 1.61895e+12	valid_0's l1: 808668
[332]	valid_0's l2: 1.61816e+12	valid_0's l1: 808506
[333]	valid_0's l2: 1.61752e+12	valid_0's l1: 808343
[334]	valid_0's l2: 1.61691e+12	valid_0's l1: 808325
[335]	valid_0's l2: 1.61617e+12	valid_0's l1: 808216
[336]	valid_0's l2: 1.61574e+12	valid_0's l1: 

[473]	valid_0's l2: 1.55138e+12	valid_0's l1: 790256
[474]	valid_0's l2: 1.55105e+12	valid_0's l1: 790208
[475]	valid_0's l2: 1.55072e+12	valid_0's l1: 790103
[476]	valid_0's l2: 1.55041e+12	valid_0's l1: 789839
[477]	valid_0's l2: 1.54997e+12	valid_0's l1: 789659
[478]	valid_0's l2: 1.54968e+12	valid_0's l1: 789473
[479]	valid_0's l2: 1.54945e+12	valid_0's l1: 789372
[480]	valid_0's l2: 1.54918e+12	valid_0's l1: 789244
[481]	valid_0's l2: 1.54888e+12	valid_0's l1: 789102
[482]	valid_0's l2: 1.54831e+12	valid_0's l1: 788916
[483]	valid_0's l2: 1.54804e+12	valid_0's l1: 788781
[484]	valid_0's l2: 1.54745e+12	valid_0's l1: 788581
[485]	valid_0's l2: 1.54721e+12	valid_0's l1: 788476
[486]	valid_0's l2: 1.54677e+12	valid_0's l1: 788319
[487]	valid_0's l2: 1.54628e+12	valid_0's l1: 788140
[488]	valid_0's l2: 1.54604e+12	valid_0's l1: 788117
[489]	valid_0's l2: 1.54584e+12	valid_0's l1: 788117
[490]	valid_0's l2: 1.5453e+12	valid_0's l1: 788041
[491]	valid_0's l2: 1.54482e+12	valid_0's l1: 7

In [116]:
prediccion = model.predict(x_test)

In [117]:
#Estimo MAE
print(mean_absolute_error(y_test, prediccion))

786682.3507767542
