In [None]:
from sklearn import metrics
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

import pandas as pd
import numpy as np
import xgboost as xgb

In [None]:
# Read the training CSV; the three columns below are loaded as pandas categoricals.
df = pd.read_csv(
    '../data/train.csv',
    dtype={
        'tipodepropiedad': 'category',
        'ciudad': 'category',
        'provincia': 'category',
    },
)

# Parse 'fecha' into datetimes and expose its year / month / day as columns.
df['fecha'] = pd.to_datetime(df['fecha'])
df = df.assign(
    anio=df['fecha'].dt.year,
    mes=df['fecha'].dt.month,
    dia=df['fecha'].dt.day,
)

In [None]:
# Remove three columns that no later cell shown in this notebook reads.
df = df.drop(["direccion", "descripcion", "titulo"], axis="columns")

In [None]:
def imputar(df):
    """Return a copy of ``df`` with missing numeric values replaced by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every numeric column is float64 and has its
        NaN entries filled with that column's mean. Non-numeric columns
        (including booleans, which cannot hold NaN) are returned unchanged.
        A numeric column with no observed values has no mean and is left
        as all-NaN instead of raising.
    """
    df_result = df.copy()
    numeric_cols = df.select_dtypes(include="number").columns
    if len(numeric_cols) == 0:
        return df_result

    # Cast to float64 so integer columns come back as floats, matching what the
    # previous SimpleImputer-based implementation produced.
    numeric = df[numeric_cols].astype("float64")

    # One vectorized fill instead of fitting an imputer per column; dropna()
    # skips columns whose mean is undefined (all values missing).
    column_means = numeric.mean().dropna()
    df_result[numeric_cols] = numeric.fillna(column_means)
    return df_result

In [None]:
# Fill missing values in the numeric columns (see `imputar`).
df1 = imputar(df)

# Extra feature: the sum of 'metroscubiertos' and 'metrostotales'.
df1["m2"] = df1["metroscubiertos"] + df1["metrostotales"]

# Keep the ids for the output file, then actually remove the column.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise 'id' silently stays in df1 and ends up among the model features.
df_ids = df1["id"]
df1 = df1.drop(columns=["id"])

# Missing values in the categorical columns are replaced by the literal "-".
imp = SimpleImputer(strategy="constant", fill_value="-")
for col in ["tipodepropiedad", "provincia", "ciudad"]:
    df1[col] = imp.fit_transform(df1[[col]])

In [None]:
def predecir(model, train_features, train_labels, test_features, test_labels):
    """Print the model's train and test scores as percentages.

    ``model.score`` is called on the test split and on the train split; both
    results are multiplied by 100 and printed with four decimals.

    Returns the raw (unscaled) score obtained on the test split.
    """
    test_score = model.score(test_features, test_labels)
    train_score = model.score(train_features, train_labels)

    print('Entrenamiento: {:0.4f}%'.format(train_score * 100))
    print('Testeo: {:0.4f}%.'.format(test_score * 100))

    return test_score

In [None]:
cat_features = ['ciudad', 'provincia', 'tipodepropiedad']

# Numeric and boolean columns go into the feature matrix as they are.
data_cols = df1.select_dtypes(include=["number", "bool"]).columns
baseline_data = df1[data_cols]

# The categorical columns are binary-encoded and joined on the row index.
encoder = ce.BinaryEncoder()
encoded = encoder.fit_transform(df1[cat_features])
data = baseline_data.join(encoded)

# Features are every column except the target 'precio'. These two lines were
# commented out, which made the split below fail with NameError on a fresh kernel.
x = data[data.columns.drop("precio")]
y = data['precio']

# Fixed seed so the 70/30 split (and every score derived from it) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

print(f"Original shapes: X={x.shape} y={y.shape}")
print(f"Train shapes: X={x_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={x_test.shape}  y={y_test.shape}")

In [None]:
# Baseline random forest. `random_state` is fixed so that re-running the
# notebook produces the same trees and therefore the same scores.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features='log2',
    bootstrap=False,
    random_state=42,
)

In [None]:
# Fit the baseline forest on the training split.
rf.fit(X=x_train, y=y_train)

In [None]:
# Report train / test scores of the baseline forest.
predecir(
    model=rf,
    train_features=x_train,
    train_labels=y_train,
    test_features=x_test,
    test_labels=y_test,
)

In [None]:
# Horizontal bar chart with one bar per feature column, sized by the
# importance the fitted forest assigns to it.
importancia = rf.feature_importances_
plt.figure(figsize=(15, 15))
plt.barh(y=x.columns, width=importancia)

## Tuning RF

### Grid Search

In [None]:
# Hyper-parameter grid for the forest; GridSearchCV evaluates every
# combination listed here using 3-fold cross-validation on all cores.
param_grid = dict(
    bootstrap=[True],
    max_depth=[3, 4, 5, None],
    max_features=[2, 3, 'log2'],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 4, 8, 10],
    n_estimators=[800, 1000, 1300, 1500],
)

grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the exhaustive search on the training split.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Keep the best estimator found by the grid search.
rf_grid = grid_search.best_estimator_

# A notebook only renders the last expression of a cell, so the winning
# parameters must come last to be displayed.
grid_search.best_params_

### Random Search

In [None]:
# Number of trees in the forest: 10 evenly spaced values from 100 to 2000.
n_estimators = [int(value) for value in np.linspace(start=100, stop=2000, num=10)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' meant for regressors; current scikit-learn releases
# no longer accept the string 'auto'.
max_features = [1.0, 'log2']
# Maximum number of levels in a tree (None = grow until leaves are pure/minimal).
max_depth = [2, 3, 4, None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 3, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Candidate values sampled by the randomized search.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Randomized search: 50 parameter settings drawn from `random_grid`, each
# scored with 5-fold cross-validation on all cores. Both the estimator and the
# sampler are seeded so the search picks the same candidates on every run.
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

random_search.fit(x_train, y_train)

In [None]:
# Keep the best estimator found by the randomized search.
rf_random = random_search.best_estimator_

In [None]:
# Report train / test scores of the grid-search winner.
predecir(
    model=rf_grid,
    train_features=x_train,
    train_labels=y_train,
    test_features=x_test,
    test_labels=y_test,
)

In [None]:
# Report train / test scores of the randomized-search winner.
predecir(
    model=rf_random,
    train_features=x_train,
    train_labels=y_train,
    test_features=x_test,
    test_labels=y_test,
)

In [None]:
# Final model: same hyper-parameters as the baseline forest, seeded for
# reproducibility.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features='log2',
    bootstrap=False,
    random_state=42,
)

# Train once on the whole dataset (the previous version fitted it twice).
rf.fit(x, y)

# Predict using exactly the columns the model was trained on. `data` still
# contains the target 'precio', so passing it directly does not match the
# features seen during fit.
# NOTE(review): these are predictions for the training rows themselves; if a
# separate hold-out file is meant to be scored, it has to be loaded and
# preprocessed the same way first -- confirm the intent.
predict = rf.predict(data[x.columns])

In [None]:
# Build the output table (ids plus predictions, both cast to int32) and write
# it as CSV without the index column.
result = (
    df_ids
    .astype("int32")
    .to_frame()
    .assign(target=predict.astype("int32"))
)
result.to_csv("../data/results/resultRF.csv", index=False)