In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

# Single random seed, reused as random_state by every train_test_split call in this notebook.
seed = 42

In [2]:
import nbimporter

import pre_processing
import feature_selection

import feature_generation_dangerous
import feature_generation_reliable

Importing Jupyter notebook from pre_processing.ipynb
Importing Jupyter notebook from feature_generation_reliable.ipynb
Importing Jupyter notebook from feature_generation_dangerous.ipynb
Importing Jupyter notebook from feature_selection.ipynb


In [3]:
def escribir_respuesta(ids,predicciones):
    """Write the submission file ``respuesta.csv`` in the current directory.

    The file starts with an ``id,target`` header followed by one
    ``<id>,<prediction>`` line per element, in input order.

    Args:
        ids: Sized sequence of ids (list, NumPy array, pandas Series, ...).
            Every id is converted with ``int()`` before being written.
        predicciones: Sized sequence of predictions, same length as ``ids``.

    Raises:
        ValueError: If ``ids`` and ``predicciones`` differ in length. The check
            runs before the file is opened, so no truncated file is produced.
    """
    if len(ids) != len(predicciones):
        raise ValueError(
            f"ids and predicciones must have the same length "
            f"({len(ids)} != {len(predicciones)})"
        )
    with open("respuesta.csv",'w') as archivo:
        archivo.write("id,target\n")
        # zip() pairs the values positionally. Indexing with ids[i] is a label
        # lookup on a pandas Series and fails when its index is not 0..n-1
        # (for example after concatenating two frames).
        for id_, prediccion in zip(ids, predicciones):
            archivo.write(f"{int(id_)},{prediccion}\n")

# IDEA: Convertir el problema en un problema de clasificacion
Tras haber trabajado muchos dias con el dataframe original, mandando directamente a entrenar a un modelo de regresion para intentar predecir los precios, hemos llegado a obtener un MAE de 472k, que nos parece muy alto. Una idea que entonces surge, es ver donde estamos fallando por mucho.

Tras realizar el **analisis de error correspondiente**, hemos podido observar que nuestro modelo de regresion esta fallando por mucho en los outliers, propiedades que generan un error absoluto de 10 millones o mas. Esto se debe a datos que no tienen sentido. Para encarar este problema, proponemos la siguiente solucion:

1. Antes de aplicar modelos de regresion, entrenaremos un **modelo de clasificacion** cuyo objetivo sera determinar si el dato es o no un outlier. El criterio para definir si un dato es un outlier sera especificado mas adelante. Si logramos una buena precision en este modelo, tendremos nuestro dataset original dividido en dos sub-datasets: uno con datos "confiables" y otro con datos no confiables (outliers).
2. Una vez que tenemos nuestros dos sub-datasets, aplicaremos el proceso que venimos aplicando al dataset original hasta ahora a cada uno de estos, generando features para cada uno de manera independiente y tratando el problema como dos sub problemas. La idea es que el modelo que trata con los datos confiables no se "maree" con los outliers, e intentar predecir estos con un modelo que este entrenado para tal fin.
3. Finalmente, predecimos los precios de las propiedades en test siguiendo este procedimiento y unimos todas las predicciones para dar una prediccion final.
<hr>

### IMPORTANTE:
**NO** es necesario **correr todo el notebook para poder entrenar**. Se pueden **ejecutar por pasos**, ya que cada paso termina guardando los resultados obtenidos.
<hr>

# Paso 1: Entrenar modelo de clasificacion

In [5]:
# Load the train and test frames produced by the project's pre_processing notebook
# (their exact contents are defined there, not in this notebook).
train,test = pre_processing.load_featured_datasets()

In [6]:
# Apply the project's feature selection to both frames.
# precio=False is passed for test — presumably because test has no price column
# (confirm in feature_selection).
train = feature_selection.get_selected_dataframe(train)
test = feature_selection.get_selected_dataframe(test, precio=False)

In [7]:
# Outlier criterion: a row is an outlier when its price is NOT below
# (mean price + one standard deviation) of the training set.
# Use the named reducers directly: describe()[1] / describe()[2] relied on positional
# lookup into a label-indexed Series (deprecated in recent pandas) and computed the
# full summary twice. Series.std() uses the same sample std (ddof=1) that describe() reports.
mean = train['precio'].mean()
std = train['precio'].std()

In [8]:
def bin_std(x, sup):
    """Return 1 when ``x`` is strictly below the threshold ``sup``, otherwise 0."""
    return 1 if x < sup else 0

In [9]:
# target = 1 -> price strictly below mean + std ("reliable" row); target = 0 -> outlier.
# A vectorised comparison replaces the per-row Python lambda around bin_std;
# the resulting 0/1 int64 column is the same (NaN prices still map to 0).
train['target'] = (train['precio'] < mean + std).astype('int64')

In [10]:
# Class balance of the binary target column built from the price threshold.
train['target'].value_counts()

1    206838
0     33162
Name: target, dtype: int64

In [11]:
# Vemos que con este criterio, tenemos 33k de datos "outliers" o "con precios muy altos", que pueden confundir
# al modelo de regresion.

In [12]:
# Entrenamos el modelo de clasificacion:

In [13]:
# Remove the price column: 'target' was derived from it, so keeping it as a
# feature would hand the label to the classifier.
train_2 = train.drop('precio', axis=1)

In [14]:
# Features are every remaining column except the label; the label is 'target'.
X = train_2.drop('target', axis=1)
Y = train_2['target']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [17]:
import lightgbm as lgb

In [33]:
# Settings for the binary outlier classifier, evaluated with AUC on the validation set.
# n_estimators is only a ceiling: early_stopping_round ends training once the
# validation score has not improved for 500 rounds.
lightgbm = dict(
    objective='binary',
    metric='auc',
    num_leaves=75,
    max_depth=7,
    min_split_gain=0.01,
    min_child_weight=5.00001,
    learning_rate=0.05,
    lambda_l2=0,
    feature_fraction=0.7000000000000001,
    bagging_fraction=1.0,
    n_estimators=99999,
    early_stopping_round=500,
)

In [34]:
# Wrap the NumPy feature matrices and 0/1 labels in LightGBM datasets.
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid]
# `verbose_eval` was removed from lgb.train() in LightGBM 4.0. The log_evaluation
# callback (LightGBM >= 3.3) prints the validation metric every 500 rounds instead.
# Note: despite its name, `reg` holds the binary classifier here.
reg = lgb.train(lightgbm, d_train, valid_sets=watchlist,
                callbacks=[lgb.log_evaluation(period=500)])

Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.978176
[1000]	valid_0's auc: 0.978976
[1500]	valid_0's auc: 0.979177
Early stopping, best iteration is:
[1468]	valid_0's auc: 0.979189


Tenemos nuestro clasificador listo para separar.
Vemos que hemos obtenido un **AUC de 0.979189** en validacion, es decir que el modelo separa muy bien los outliers del resto de los datos (el AUC mide la calidad del ordenamiento de las probabilidades, no la precision para un umbral fijo). Esto es importante ya que el precio final predicho dependera muchisimo de esta separacion temprana.

Vamos entonces a **clasificar los datasets originales** en dos datasets y guardarlos.

In [35]:
# Score the test set with the classifier. The training feature columns are selected
# explicitly so the column order matches what the model was fitted on, and so this
# cell can be re-run after the 'target' column has been added to `test`.
test_predicted = reg.predict(test[X.columns])

In [36]:
# Turn predicted probabilities into hard labels: 1 when strictly above 0.5, else 0.
# A vectorised comparison replaces np.vectorize over a Python lambda (same result,
# no per-element Python call).
test_predicted = (test_predicted > 0.5).astype(np.int64)

In [37]:
# Store the predicted 0/1 label on the test frame, mirroring train['target'].
test['target'] = test_predicted

In [38]:
# Predicted class balance on the test set.
test['target'].value_counts()

1    52355
0     7645
Name: target, dtype: int64

In [39]:
# Class balance on train, shown again for comparison with the test predictions.
train['target'].value_counts()

1    206838
0     33162
Name: target, dtype: int64

In [40]:
# Ahora lo que queremos es separar en dos el problema: MODELO para los datos confiables, y otro modelo
# para los datos no confiables, y luego juntar todo.

In [46]:
# Keep only the 'target' flag of each frame (the frames' index is preserved).
train_outliers = train[['target']]
test_outliers = test[['target']]

In [57]:
# Stack the train flags on top of the test flags, keeping each frame's index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
outliers = pd.concat([train_outliers, test_outliers])

In [58]:
# Persist the flags (the index is written too, as to_csv does by default).
# Presumably the pre_processing notebook reads this file to build the
# "reliable" and "dangerous" datasets loaded below — confirm there.
outliers.to_csv('data/outliers.csv')

# Entrenamos el modelo para los outliers

In [4]:
# Rebind train/test to the "dangerous" datasets from the pre_processing notebook —
# presumably the rows flagged as outliers (confirm in pre_processing).
train,test = pre_processing.load_featured_dangerous_datasets()

In [5]:
# Apply the feature selection defined for the "dangerous" datasets;
# precio=False is passed for test (presumably because test has no price column — confirm).
train = feature_selection.get_selected_dataframe_dangerous(train)
test = feature_selection.get_selected_dataframe_dangerous(test, precio=False)

In [6]:
# Build the regression inputs: every column except the price is a feature,
# and the price is the label.
X = train.drop('precio', axis=1)
Y = train['precio']

# Hold out 20% of the rows for validation, with the shared seed.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [7]:
# Regression settings for the outlier price model (validation metric: MAE).
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    max_depth=7,
    num_leaves=30,
    learning_rate=0.02,  # 0.05 is the alternative that was left commented out
    verbose=0,
    early_stopping_round=1000,
)
# Ceiling on boosting rounds, passed to lgb.train as num_boost_round.
# In practice early stopping decides when training ends.
n_estimators = 99999999

In [8]:
# Wrap the NumPy feature matrices and price labels in LightGBM datasets.
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid]
# `verbose_eval` was removed from lgb.train() in LightGBM 4.0. The log_evaluation
# callback (LightGBM >= 3.3) prints the validation MAE every 1000 rounds instead.
reg = lgb.train(params, d_train, n_estimators, valid_sets=watchlist,
                callbacks=[lgb.log_evaluation(period=1000)])



Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's l1: 924977
[2000]	valid_0's l1: 912150
[3000]	valid_0's l1: 906293
[4000]	valid_0's l1: 902981
[5000]	valid_0's l1: 900943
[6000]	valid_0's l1: 898665
[7000]	valid_0's l1: 897224
[8000]	valid_0's l1: 896438
[9000]	valid_0's l1: 896290
[10000]	valid_0's l1: 895711
[11000]	valid_0's l1: 895442
Early stopping, best iteration is:
[10829]	valid_0's l1: 895139


# Entrenamos el modelo para los datos confiables

In [41]:
# Rebind train/test to the "reliable" datasets from the pre_processing notebook —
# presumably the rows not flagged as outliers (confirm in pre_processing).
train,test = pre_processing.load_featured_reliable_datasets()

In [42]:
# Apply the feature selection defined for the "reliable" datasets;
# precio=False is passed for test (presumably because test has no price column — confirm).
train = feature_selection.get_selected_dataframe_reliable(train)
test = feature_selection.get_selected_dataframe_reliable(test, precio=False)

In [43]:
# Build the regression inputs: every column except the price is a feature,
# and the price is the label.
X = train.drop('precio', axis=1)
Y = train['precio']

# Hold out 20% of the rows for validation, with the shared seed.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [44]:
# Regression settings for the reliable-data price model (validation metric: MAE).
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    max_depth=7,
    num_leaves=80,
    learning_rate=0.05,  # 0.02 is the alternative that was left commented out
    verbose=0,
    early_stopping_round=1000,
)
# Ceiling on boosting rounds, passed to lgb.train as num_boost_round.
# In practice early stopping decides when training ends.
n_estimators = 99999999

In [45]:
# Wrap the NumPy feature matrices and price labels in LightGBM datasets.
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid]
# `verbose_eval` was removed from lgb.train() in LightGBM 4.0. The log_evaluation
# callback (LightGBM >= 3.3) prints the validation MAE every 1000 rounds instead.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so in that
# run `reg` was not rebound to the reliable-data model.
reg = lgb.train(params, d_train, n_estimators, valid_sets=watchlist,
                callbacks=[lgb.log_evaluation(period=1000)])

Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's l1: 311951
[2000]	valid_0's l1: 306384
[3000]	valid_0's l1: 303478
[4000]	valid_0's l1: 301644
[5000]	valid_0's l1: 300973
[6000]	valid_0's l1: 300498
[7000]	valid_0's l1: 300248
[8000]	valid_0's l1: 300178


KeyboardInterrupt: 

In [41]:
# Predict prices for test_a with the current model.
# NOTE(review): `test_a` is never defined in the visible notebook and the execution
# counter jumps back to 41 here, so this cell depends on state from another session.
# Presumably test_a is the "reliable" test subset — confirm.
pred_test_a = reg.predict(test_a.values)

In [42]:
# Quick look at the predicted prices for test_a.
pred_test_a

array([1068873.65659347, 2225691.17965524, 1085875.8559637 , ...,
        764217.32817345, 1491791.99554189, 2417493.41916187])

In [45]:
# Attach the predictions as the 'precio' column. After this, test_a.values contains
# one extra column, so the prediction cell above cannot simply be re-run.
test_a['precio'] = pred_test_a

In [46]:
# Move the index into a regular column and keep only id + predicted price
# (assumes the index is named 'id' — the selection below requires that column).
test_a = test_a.reset_index()[['id', 'precio']]

In [49]:
# test_a now holds (id, predicted price) pairs.
test_a.head()

Unnamed: 0,id,precio
0,51775,1068874.0
1,115253,2225691.0
2,299321,1085876.0
3,173570,679402.4
4,30862,1251787.0


# Predicciones para datos no confiables:

In [50]:
# Build the regression inputs for the non-reliable rows: features are every column
# except the price, and the label is the price.
# NOTE(review): `train_b` is never defined in the visible notebook; this cell depends
# on state from another session. Presumably it is the outlier training subset — confirm.
X = train_b.drop('precio', axis=1)
Y = train_b['precio']

# Hold out 20% of the rows for validation, with the shared seed.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [53]:
# Regression settings for the non-reliable (outlier) price model (validation metric: MAE).
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    max_depth=11,
    num_leaves=70,
    learning_rate=0.05,  # 0.02 is the alternative that was left commented out
    verbose=0,
    early_stopping_round=100,
)
# Ceiling on boosting rounds, passed to lgb.train as num_boost_round.
n_estimators = 20000

In [54]:
# Wrap the NumPy feature matrices and price labels in LightGBM datasets.
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid]
# `verbose_eval` was removed from lgb.train() in LightGBM 4.0. The log_evaluation
# callback (LightGBM >= 3.3) prints the validation MAE every 50 rounds instead.
reg = lgb.train(params, d_train, n_estimators, valid_sets=watchlist,
                callbacks=[lgb.log_evaluation(period=50)])

Training until validation scores don't improve for 100 rounds
[50]	valid_0's l1: 1.00026e+06
[100]	valid_0's l1: 891124
[150]	valid_0's l1: 872183
[200]	valid_0's l1: 863164
[250]	valid_0's l1: 856664
[300]	valid_0's l1: 852965
[350]	valid_0's l1: 850472
[400]	valid_0's l1: 848131
[450]	valid_0's l1: 845008
[500]	valid_0's l1: 843966
[550]	valid_0's l1: 841600
[600]	valid_0's l1: 840469
[650]	valid_0's l1: 838567
[700]	valid_0's l1: 837642
[750]	valid_0's l1: 836588
[800]	valid_0's l1: 836032
[850]	valid_0's l1: 835624
[900]	valid_0's l1: 834890
[950]	valid_0's l1: 834240
[1000]	valid_0's l1: 833377
[1050]	valid_0's l1: 833012
[1100]	valid_0's l1: 832690
[1150]	valid_0's l1: 832619
[1200]	valid_0's l1: 832168
[1250]	valid_0's l1: 831512
[1300]	valid_0's l1: 831049
[1350]	valid_0's l1: 830675
[1400]	valid_0's l1: 830264
[1450]	valid_0's l1: 830283
[1500]	valid_0's l1: 830228
[1550]	valid_0's l1: 830296
[1600]	valid_0's l1: 830338
Early stopping, best iteration is:
[1516]	valid_0's l1: 8

In [55]:
# Predict prices for test_b with the model trained in the previous cell.
# NOTE(review): `test_b` is never defined in the visible notebook — presumably the
# test rows classified as outliers; confirm where it is built.
pred_test_b = reg.predict(test_b.values)

In [56]:
# Quick look at the predicted prices for test_b.
pred_test_b

array([7622446.01228321, 6426612.8913557 , 5565497.28187775, ...,
       8608918.85641762, 8761395.37872929, 9771921.10135002])

In [57]:
# Attach the predictions as the 'precio' column. After this, test_b.values contains
# one extra column, so the prediction cell above cannot simply be re-run.
test_b['precio'] = pred_test_b

In [58]:
# Move the index into a regular column and keep only id + predicted price
# (assumes the index is named 'id' — the selection below requires that column).
test_b = test_b.reset_index()[['id', 'precio']]

In [59]:
# test_b now holds (id, predicted price) pairs.
test_b.head()

Unnamed: 0,id,precio
0,4941,7622446.0
1,253578,6426613.0
2,208352,5565497.0
3,295822,5276481.0
4,30763,9537812.0


# Join

In [65]:
# Number of distinct ids predicted by the reliable-data branch.
test_a['id'].nunique()

53287

In [66]:
# Number of distinct ids predicted by the non-reliable branch.
test_b['id'].nunique()

6713

In [68]:
# Stack both prediction frames into the final submission frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
# The row index is kept as-is (it repeats across the two parts); only the column
# values are used afterwards.
test = pd.concat([test_a, test_b])

In [69]:
# Sanity check: the distinct-id count of the joined frame should equal the sum of both parts.
test['id'].nunique()

60000

# Submit

In [73]:
# Extract plain NumPy arrays so escribir_respuesta can index them positionally.
ids = test['id'].values
valores = test['precio'].values

In [74]:
# Write the submission file respuesta.csv (header "id,target", one line per id).
escribir_respuesta(ids, valores)