In [10]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb

# Random seed used by the stochastic steps of this notebook (passed as
# random_state to train_test_split) so that splits are reproducible.
seed = 42

In [2]:
import nbimporter

import pre_processing
import feature_selection

import feature_generation_dangerous
import feature_generation_reliable

Importing Jupyter notebook from pre_processing.ipynb
Importing Jupyter notebook from feature_generation_reliable.ipynb
Importing Jupyter notebook from feature_generation_dangerous.ipynb
Importing Jupyter notebook from feature_selection.ipynb


In [3]:
def escribir_respuesta(ids, predicciones, ruta="respuesta.csv"):
    """Write the submission CSV with one ``id,target`` row per prediction.

    Args:
        ids: Sequence of property ids; each one is cast with ``int()``
            before being written.
        predicciones: Sequence of predicted values, positionally aligned
            with ``ids``.
        ruta: Destination path. Defaults to ``"respuesta.csv"``, the file
            this function has always written.

    Raises:
        ValueError: If ``ids`` and ``predicciones`` differ in length.
    """
    # Validate before opening the file so a mismatch never leaves a
    # half-written (or silently truncated) submission on disk.
    if len(ids) != len(predicciones):
        raise ValueError(
            f"ids and predicciones differ in length: "
            f"{len(ids)} != {len(predicciones)}"
        )
    with open(ruta, 'w') as archivo:
        archivo.write("id,target\n")
        # zip pairs values by position, so inputs whose [] operator is
        # label-based (e.g. a pandas Series with a non 0..n-1 index) also work.
        for id_, prediccion in zip(ids, predicciones):
            archivo.write(f"{int(id_)},{prediccion}\n")

# IDEA: Convertir el problema en un problema de clasificacion
Tras haber trabajado muchos días con el dataframe original, entrenando directamente un modelo de regresión para intentar predecir los precios, hemos llegado a obtener un MAE de 472k, que nos parece muy alto. Surge entonces la idea de analizar en qué casos el modelo se está equivocando por mucho.

Tras realizar el **análisis de error correspondiente**, hemos podido observar que nuestro modelo de regresión está fallando por mucho en los outliers: propiedades que generan un error absoluto de 10 millones o más. Esto se debe a datos que no tienen sentido. Para encarar este problema, proponemos la siguiente solución:

1. Antes de aplicar modelos de regresión, entrenaremos un **modelo de clasificación** cuyo objetivo será determinar si un dato es o no un outlier. El criterio para definir si un dato es o no un outlier se especifica más adelante. Si logramos una buena precisión en este modelo, tendremos nuestro dataset original dividido en dos sub-datasets: uno con datos "confiables" y otro con datos no confiables (los outliers).
2. Una vez que tengamos nuestros dos sub-datasets, aplicaremos a cada uno de ellos el proceso que veníamos aplicando al dataset original, generando features para cada uno de manera independiente y tratando el problema como dos subproblemas. La idea es que el modelo que trata con los datos confiables no se "maree" con los outliers, y que estos últimos se predigan con un modelo entrenado específicamente para tal fin.
3. Finalmente, predecimos los precios de las propiedades de test siguiendo este procedimiento y unimos todas las predicciones para obtener la predicción final.
<hr>

### IMPORTANTE:
**NO** es necesario **correr todo el notebook para poder entrenar**. Se puede **ejecutar por pasos**, ya que cada paso termina guardando los resultados obtenidos.
<hr>

# Paso 1: Entrenar modelo de clasificacion

In [5]:
# Load the train and test frames from the pre_processing notebook
# (presumably already feature-engineered, judging by the helper's name — confirm there).
train,test = pre_processing.load_featured_datasets()

In [6]:
# Reduce both frames through the feature_selection helper.
# precio=False is passed for test, presumably because test carries no price
# column — confirm in feature_selection.
train = feature_selection.get_selected_dataframe(train)
test = feature_selection.get_selected_dataframe(test, precio=False)

In [7]:
# Outlier criterion: the cut-off price is the mean price plus one standard deviation.
# The statistics are read by name: describe()[1] / describe()[2] relied on
# positional access into a label-indexed Series (deprecated in recent pandas)
# and computed the full summary twice. Series.std() uses the same sample
# standard deviation (ddof=1) that describe() reports.
mean = train['precio'].mean()
std = train['precio'].std()

In [8]:
def bin_std(x, sup):
    """Tell whether ``x`` lies strictly below the upper bound ``sup``.

    Returns 1 when ``x < sup`` and 0 otherwise (``x == sup`` included).
    """
    return 1 if x < sup else 0

In [9]:
# Label every training row: 1 when its price is strictly below mean + std,
# 0 otherwise. One vectorized comparison replaces the per-row Python lambda
# and yields the same integer labels (NaN prices compare False and map to 0).
train['target'] = (train['precio'] < mean + std).astype('int64')

In [10]:
# Class balance of the label (1 = price below mean + std, 0 = otherwise).
train['target'].value_counts()

1    206838
0     33162
Name: target, dtype: int64

In [11]:
# With this criterion we get about 33k "outlier" (i.e. "very high price") rows, which can confuse
# the regression model.

In [12]:
# Train the classification model:

In [13]:
# Remove the raw price: 'target' was derived from it, so keeping it as a
# feature would hand the label to the classifier.
train_2 = train.drop(columns=['precio'])

In [14]:
# The label is the binary 'target'; the features are every remaining column.
Y = train_2['target']
X = train_2.drop(columns=['target'])

# Hold out 20% of the rows for validation; random_state keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, random_state=seed, test_size=0.2
)

In [17]:
import lightgbm as lgb

In [33]:
# LightGBM parameters for the binary outlier classifier, evaluated with AUC.
# 'n_estimators' is set very high on purpose: together with
# 'early_stopping_round', training stops once the validation metric has not
# improved for 500 rounds.
# NOTE(review): this dict is named `lightgbm`, like the library package; the
# library itself is imported as `lgb`, so nothing is shadowed in this notebook.
lightgbm = {'objective': 'binary',
            'metric':'auc',
            'num_leaves': 75,
            'max_depth': 7,
            'min_split_gain': 0.01,
            'min_child_weight': 5.00001,
            'learning_rate': 0.05,
            'lambda_l2': 0,
            'feature_fraction': 0.7000000000000001,
            'bagging_fraction': 1.0,
            'n_estimators': 99999,
            'early_stopping_round': 500}

In [34]:
# Wrap the raw numpy arrays (column names are dropped by .values) in LightGBM datasets.
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
# The hold-out set is the only validation set, so it is what early stopping monitors.
watchlist = [d_valid]
# NOTE: despite its name, `reg` holds the binary classifier at this point.
reg = lgb.train(lightgbm, d_train, valid_sets=watchlist, verbose_eval=500)

Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.978176
[1000]	valid_0's auc: 0.978976
[1500]	valid_0's auc: 0.979177
Early stopping, best iteration is:
[1468]	valid_0's auc: 0.979189


Tenemos nuestro clasificador listo para separar.
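Vemos que hemos obtenido un **AUC de 0.979189** en validación, es decir, el modelo separa muy bien los outliers de los datos confiables. (El AUC mide la calidad del ordenamiento de los puntajes, no la precisión que se obtiene con el umbral de 0.5 que usamos más abajo.) Esto es importante, ya que el precio final predicho dependerá muchísimo de esta separación temprana.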

Vamos entonces a **clasificar los datasets originales** en dos datasets y guardarlos.

In [35]:
# Score every test row with the classifier (presumably class-1 probabilities,
# given the 'binary' objective — confirm against the LightGBM docs).
# The model was trained on bare arrays, so this assumes `test` has the same
# columns in the same order as X — TODO confirm.
test_predicted = reg.predict(test)

In [36]:
# Turn the scores into hard labels: 1 when the score is strictly above 0.5,
# 0 otherwise. A single vectorized comparison replaces np.vectorize over a
# Python lambda, which is just a per-element Python loop.
test_predicted = (np.asarray(test_predicted) > 0.5).astype('int64')

In [37]:
# Store the predicted labels on the test frame, under the same column name used for train.
test['target'] = test_predicted

In [38]:
# Predicted class balance on test (labels produced by the classifier).
test['target'].value_counts()

1    52355
0     7645
Name: target, dtype: int64

In [39]:
# Class balance on train, shown again for comparison with test
# (train labels come from the true prices, not from the classifier).
train['target'].value_counts()

1    206838
0     33162
Name: target, dtype: int64

In [40]:
# What we want now is to split the problem in two: one MODEL for the reliable data, and another model
# for the unreliable data, and then merge everything.

In [46]:
# Keep only the label column of each frame; the double brackets return
# one-column DataFrames (index preserved) rather than Series.
train_outliers = train[['target']]
test_outliers = test[['target']]

In [57]:
# Stack the train labels on top of the test labels, keeping each frame's index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and gives the same result.
outliers = pd.concat([train_outliers, test_outliers])

In [58]:
# Persist the labels so that the following steps can be run without
# retraining the classifier. The path is relative to the working directory.
outliers.to_csv('data/outliers.csv')

# Entrenamos el modelo para los datos confiables

In [19]:
# Load the train/test frames for the "reliable" subset. From here on
# `train` and `test` no longer refer to the full datasets loaded in step 1.
train,test = pre_processing.load_featured_reliable_datasets()

In [20]:
# Reduce both frames through the feature_selection helper for the reliable subset.
# precio=False is passed for test, presumably because test carries no price
# column — confirm in feature_selection.
train = feature_selection.get_selected_dataframe_reliable(train)
test = feature_selection.get_selected_dataframe_reliable(test, precio=False)

In [21]:
# Train on log(price); predictions are mapped back with exp further down.
# WARNING: this cell overwrites 'precio' in place, so running it twice applies
# the log twice — reload `train` before re-running it.
# A non-positive price has no logarithm; fail loudly instead of producing -inf/NaN.
assert not (train['precio'] <= 0).any(), "precio must be strictly positive to take its log"
# np.log transforms the whole column at once instead of calling math.log per row.
train['precio'] = np.log(train['precio'])

In [22]:
# Train the model:
# Features are every column except the (log-transformed) price, which is the target.
X = train.drop('precio', axis=1)
Y = train['precio']

# The hold-out split is disabled here, so this cell does not refresh
# X_train / X_val / Y_train / Y_val for the reliable subset.
#X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [23]:
# xgboost
# Fit on every row of X / Y (no hold-out), so this cell produces no validation
# score. `reg` is rebound here and no longer refers to the classifier from step 1.
reg = xgb.XGBRegressor(max_depth=12,n_estimators=140 ,learning_rate=0.08, verbosity=2,subsample=0.9, min_child_weight=20, n_jobs=2)
reg.fit(X.values,Y.values)

[21:51:44] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=7
[21:51:45] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=7
[21:51:47] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=7
[21:51:48] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=8
[21:51:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=9
[21:51:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=8
[21:51:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=10
[21:51:54] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra no

[21:53:33] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1750 extra nodes, 0 pruned nodes, max_depth=12
[21:53:34] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1142 extra nodes, 0 pruned nodes, max_depth=12
[21:53:36] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1096 extra nodes, 0 pruned nodes, max_depth=12
[21:53:38] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1060 extra nodes, 0 pruned nodes, max_depth=12
[21:53:40] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1456 extra nodes, 0 pruned nodes, max_depth=12
[21:53:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1722 extra nodes, 0 pruned nodes, max_depth=12
[21:53:45] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1508 extra nodes, 0 pruned nodes, max_depth=12
[21:53:48] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 

[21:55:40] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1068 extra nodes, 0 pruned nodes, max_depth=12
[21:55:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 842 extra nodes, 0 pruned nodes, max_depth=12
[21:55:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 636 extra nodes, 0 pruned nodes, max_depth=12
[21:55:45] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 352 extra nodes, 0 pruned nodes, max_depth=12
[21:55:46] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 746 extra nodes, 0 pruned nodes, max_depth=12
[21:55:48] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 820 extra nodes, 0 pruned nodes, max_depth=12
[21:55:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 708 extra nodes, 0 pruned nodes, max_depth=12
[21:55:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 root

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.08, max_delta_step=0,
             max_depth=12, min_child_weight=20, missing=None, n_estimators=140,
             n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=2)

In [None]:
# lightgbm (alternative regressor for the reliable subset)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',  # measured on log(price), since Y is log-transformed
    'max_depth': 14,
    'num_leaves': 120,
    'learning_rate': 0.05,  # 0.02 was also tried
    'verbose': 0,
    'early_stopping_round': 100}
n_estimators=20000

# Build the hold-out split from the X / Y currently in scope. Without this
# line the cell trained on whatever X_train / X_val / Y_train / Y_val were
# left in the kernel — i.e. the classifier's split with the 0/1 'target' as
# label — because the split for this subset is commented out.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
# The hold-out set is what early stopping monitors.
watchlist = [d_valid]
# Rebinds `reg`: the prediction step that follows uses whichever model was trained last.
reg = lgb.train(params, d_train, n_estimators, valid_sets=watchlist, verbose_eval=1000)

In [24]:
# prediction
# The model was trained on log(price), so exponentiate to get back to price
# units. np.exp works on the whole array at once; np.vectorize(math.exp) was a
# per-element Python loop.
pred_reliable = np.exp(reg.predict(test.values))

In [25]:
# Work on a copy so the price column is not added to the `test` feature frame.
test_a = test.copy()

In [26]:
# Attach the predicted prices, aligned by position with the rows of test.
test_a['precio'] = pred_reliable

In [27]:
# reset_index() turns the index into a regular column; only 'id' and the
# predicted price are kept. Presumably 'id' is the index name — confirm.
test_a = test_a.reset_index()[['id', 'precio']]

In [28]:
# test_a is ready: (id, predicted price) for the reliable subset
test_a.head()

Unnamed: 0,id,precio
0,51775,975057.5
1,115253,2245816.0
2,299321,1118169.0
3,173570,624323.1
4,30862,1369667.0


# Entrenamos el modelo para los datos no confiables

In [29]:
# Load the train/test frames for the "dangerous" (unreliable) subset.
# `train` and `test` are rebound again and now refer to this subset only.
train,test = pre_processing.load_featured_dangerous_datasets()

In [30]:
# Reduce both frames through the feature_selection helper for the dangerous subset.
# precio=False is passed for test, presumably because test carries no price
# column — confirm in feature_selection.
train = feature_selection.get_selected_dataframe_dangerous(train)
test = feature_selection.get_selected_dataframe_dangerous(test, precio=False)

In [31]:
# Train on log(price); predictions are mapped back with exp further down.
# WARNING: this cell overwrites 'precio' in place, so running it twice applies
# the log twice — reload `train` before re-running it.
# A non-positive price has no logarithm; fail loudly instead of producing -inf/NaN.
assert not (train['precio'] <= 0).any(), "precio must be strictly positive to take its log"
# np.log transforms the whole column at once instead of calling math.log per row.
train['precio'] = np.log(train['precio'])

In [32]:
# Train the model:
# Features are every column except the (log-transformed) price, which is the target.
X = train.drop('precio', axis=1)
Y = train['precio']

# The hold-out split is disabled here, so this cell does not refresh
# X_train / X_val / Y_train / Y_val for the dangerous subset.
#X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [33]:
# xgboost
# Fit on every row of X / Y (no hold-out), so this cell produces no validation
# score. `reg` is rebound here, replacing the previously trained model.
reg = xgb.XGBRegressor(max_depth=12,n_estimators=140 ,learning_rate=0.08, verbosity=2,subsample=0.9, min_child_weight=20, n_jobs=2)
reg.fit(X.values,Y.values)

[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[21:57:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pr

[21:57:27] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 996 extra nodes, 0 pruned nodes, max_depth=12
[21:57:27] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 932 extra nodes, 0 pruned nodes, max_depth=12
[21:57:27] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 684 extra nodes, 0 pruned nodes, max_depth=12
[21:57:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 684 extra nodes, 0 pruned nodes, max_depth=12
[21:57:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 592 extra nodes, 0 pruned nodes, max_depth=12
[21:57:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 882 extra nodes, 0 pruned nodes, max_depth=12
[21:57:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 806 extra nodes, 0 pruned nodes, max_depth=12
[21:57:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots

[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 308 extra nodes, 0 pruned nodes, max_depth=12
[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 472 extra nodes, 0 pruned nodes, max_depth=12
[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=12
[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 682 extra nodes, 0 pruned nodes, max_depth=12
[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned nodes, max_depth=12
[21:57:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 392 extra nodes, 0 pruned nodes, max_depth=12
[21:57:42] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 744 extra nodes, 0 pruned nodes, max_depth=12
[21:57:42] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.08, max_delta_step=0,
             max_depth=12, min_child_weight=20, missing=None, n_estimators=140,
             n_jobs=2, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=2)

In [49]:
# lightgbm (alternative regressor for the unreliable subset)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',  # measured on log(price), since Y is log-transformed
    'max_depth': 11,
    'num_leaves': 70,
    'learning_rate': 0.02,  # 0.05 was also tried
    'verbose': 0}
# Early stopping is left disabled here, so all n_estimators rounds are trained.
n_estimators=10000

# Build the hold-out split from the X / Y currently in scope. Without this
# line the cell trained on whatever X_train / X_val / Y_train / Y_val were
# left in the kernel from an earlier step, because the split for this subset
# is commented out.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
# The hold-out set is only used to report the validation metric.
watchlist = [d_valid]
# Rebinds `reg`: the prediction step that follows uses whichever model was trained last.
reg = lgb.train(params, d_train, n_estimators, valid_sets=watchlist, verbose_eval=1000)

In [34]:
# prediction
# The model was trained on log(price), so exponentiate to get back to price
# units. np.exp works on the whole array at once; np.vectorize(math.exp) was a
# per-element Python loop.
pred_dangerous = np.exp(reg.predict(test.values))

In [35]:
# Work on a copy so the price column is not added to the `test` feature frame.
test_b = test.copy()

In [36]:
# Attach the predicted prices, aligned by position with the rows of test.
test_b['precio'] = pred_dangerous

In [37]:
# reset_index() turns the index into a regular column; only 'id' and the
# predicted price are kept. Presumably 'id' is the index name — confirm.
test_b = test_b.reset_index()[['id', 'precio']]

In [38]:
# test_b is ready: (id, predicted price) for the unreliable subset
test_b.head()

Unnamed: 0,id,precio
0,4941,7548524.0
1,262957,6125732.0
2,253578,6212492.0
3,62134,7769564.0
4,277353,5331547.0


# Join

In [39]:
# Sanity check: number of distinct ids predicted by the reliable-data model.
test_a['id'].nunique()

52355

In [40]:
# Sanity check: number of distinct ids predicted by the unreliable-data model.
test_b['id'].nunique()

7645

In [41]:
# Stack both prediction frames into the final result.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. Each frame keeps its own row labels,
# so the index of the result contains repeats; only the columns are used below.
test = pd.concat([test_a, test_b])

In [42]:
# Sanity check: the two subsets must not share ids, so the distinct-id count
# of the union should equal the sum of the two counts shown above.
test['id'].nunique()

60000

# Submit

In [43]:
# Pull plain numpy arrays out of the joined frame so the writer can index
# them by position regardless of the frame's (repeated) row labels.
ids = test['id'].values
valores = test['precio'].values

In [45]:
# Write the final submission file (respuesta.csv in the working directory).
escribir_respuesta(ids, valores)