In [1]:
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Defino funciones

In [2]:
def imputar(df):
    imp = SimpleImputer()
    cols = df._get_numeric_data().columns
    df_result = df.copy()
    for x in cols:
        df_result[x]=imp.fit_transform(df[[x]])
    return df_result

In [3]:
from sklearn.metrics import mean_absolute_error

def predecir(model, train_features, train_labels, test_features, test_labels):
    predict = model.predict(test_features)
    error = mean_absolute_error(test_labels, predict)
    score = model.score(test_features,test_labels)
    
    print('Entrenamiento: {:0.4f}%'.format(model.score(train_features, train_labels)*100))
    print('Testeo: {:0.4f}%.'.format(score*100))
    print('Mean abs error: {:0.4f}.'.format(error))

    
    return predict

In [4]:
def transformar(df,colums,func):
    for x in colums:
        df[x]=df[x].transform(lambda y: func(y))
        
columnas_para_transformar = ["metroscubiertos","metrostotales","metroscubiertostotales"]

#transformar(df1,columnas_para_transformar,np.sqrt)

## Importo dataset

In [5]:
df = pd.read_csv('../data/features.csv')
df_test = pd.read_csv('../data/test_features.csv')

df['fecha'] = pd.to_datetime(df['fecha'])
df['anio'] = df['fecha'].dt.year
df["mes"] = df['fecha'].dt.month
df["dia"] = df['fecha'].dt.day

In [6]:
df = df.drop(columns=["direccion","descripcion","titulo"])
df_test = df_test.drop(columns=["direccion","descripcion","titulo"])

df_ids = df_test["id"]
df = df.drop(columns=["id"])
df_test = df_test.drop(columns=["id"])



## Divido data set


In [7]:
x = df[df.columns.drop("precio")]
y = df['precio']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

#x_train, x_test, y_train = x, df_test, y

print(f"Original shapes: X={x.shape} y={y.shape}")
print(f"Train shapes: X={x_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={x_test.shape}  y={y_test.shape}")

Original shapes: X=(240000, 40) y=(240000,)
Train shapes: X=(192000, 40) y=(192000,)
Test  shapes: X=(48000, 40)  y=(48000,)


 ## Completo los datos faltantes

In [8]:
oneHot = ['tipodepropiedad','provincia']
binary = ['ciudad']

In [9]:
#Completo los numericos
x_train = imputar(x_train)
x_test  = imputar(x_test )

#Completo los categoricos
imp = SimpleImputer(strategy="constant",fill_value="-")

x_train["tipodepropiedad"] = imp.fit_transform(x_train[["tipodepropiedad"]])
x_train["provincia"] = imp.fit_transform(x_train[["provincia"]])
x_train["ciudad"] = imp.fit_transform(x_train[["ciudad"]])

x_test["tipodepropiedad"] = imp.fit_transform(x_test[["tipodepropiedad"]])
x_test["provincia"] = imp.fit_transform(x_test[["provincia"]])
x_test["ciudad"] = imp.fit_transform(x_test[["ciudad"]])

In [10]:
x_train["metroscubiertostotales"]=x_train["metroscubiertos"]+x_train["metrostotales"]
x_train["ambientes"]=x_train["banos"]+x_train["habitaciones"]
x_train["ambientesygarage"]=x_train["banos"]+x_train["habitaciones"]+x_train["garages"]

x_test["metroscubiertostotales"]=x_test["metroscubiertos"]+x_test["metrostotales"]
x_test["ambientes"]=x_test["banos"]+x_test["habitaciones"]
x_test["ambientesygarage"]=x_test["banos"]+x_test["habitaciones"]+x_test["garages"]

 ## Genero los encodes

In [11]:
data_cols = x_train._get_numeric_data().columns
baseline_data = x_train[data_cols]

encoder = ce.OneHotEncoder()

encoded = encoder.fit_transform(x_train[oneHot])
data = baseline_data.join(encoded)

encoder = ce.BinaryEncoder()
encoded = encoder.fit_transform(x_train[binary])
data_train = data.join(encoded)

data_train.shape

(192000, 107)

In [12]:
data_cols = x_test._get_numeric_data().columns
baseline_data = x_test[data_cols]

encoder = ce.OneHotEncoder()

encoded = encoder.fit_transform(x_test[oneHot])
data = baseline_data.join(encoded)

encoder = ce.BinaryEncoder()
encoded = encoder.fit_transform(x_test[binary])
data_test = data.join(encoded)

data_test.shape

(48000, 104)


## Modelo y entreno

In [43]:
import lightgbm as lgb

model = lgb.LGBMRegressor(n_estimators=1000,max_depth=5,num_leaves=10,n_jobs=-1)

In [44]:
model.fit(data_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=6,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=1, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Predicciones

In [45]:
for a in data_train.columns[~data_train.columns.isin(data_test.columns)]:
    data_test[a] = 0

data_test.shape[1] == x_train.shape[1]

print(f"Train shapes: X={data_train.shape} y={data_train.shape}")
print(f"Test  shapes: X={data_test.shape}  y={data_test.shape}")

Train shapes: X=(192000, 107) y=(192000, 107)
Test  shapes: X=(48000, 107)  y=(48000, 107)


In [46]:
predecir(model,data_train,y_train,data_test,y_test)

Entrenamiento: 85.4424%
Testeo: 28.9285%.
Mean abs error: 1168436.8228.


array([1811064.3889421 , 1590230.59943648, 3258448.7344996 , ...,
        568951.78186016,  739314.58679802, 1522118.11847401])

In [None]:
plt.rcParams["figure.figsize"] = [15, 15]

importancia = model.feature_importances_
ax = plt.barh(x_train.columns, importancia)

In [29]:
#Predigo los precios
predict = model.predict(data_test)

In [30]:
result = df_ids.astype("int32").to_frame()
result["target"] = predict.astype("int32")

In [31]:
result = df_ids.astype("int32").to_frame()
result["target"] = predict.astype("int32")
result.to_csv("../data/results/resultRF_Features.csv", index=False)

In [47]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
max_features = ['auto', 'log2']
max_depth = [2,3,4, None]
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

model = lgb.LGBMRegressor()
random_search = RandomizedSearchCV(estimator = model, 
                               param_distributions = random_grid, 
                               n_iter = 50,
                               n_jobs = -1,
                               verbose=2)

random_search.fit(data_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.1min


In [None]:
rf_random = random_search.best_estimator_