# Optimization techniques

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
path_dataset = 'dataset/datos_properati_limpios_model.csv'
df = pd.read_csv(path_dataset)

In [3]:
# Dataset split 80/20 using price_aprox_usd as target
import numpy as np
np.random.seed(123)
from sklearn.model_selection import train_test_split
X = df.drop(['price_aprox_usd'], axis=1)
y = df['price_aprox_usd']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(X_train.shape[0], X_test.shape[0])

5100 1276


## Scikit-learn

In [4]:
#param_grid for max_depth and max_features
param_grid = [{'max_depth' : [1,2,3,4,5], 'max_features': [1,2,3,4,5]}]

In [5]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [6]:
tree_reg = DecisionTreeRegressor()
grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

In [7]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [1, 2, 3, 4, 5],
                          'max_features': [1, 2, 3, 4, 5]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring

In [8]:
# grid_scores result
print("best parameters:", grid_search.best_params_)
print("best index:", grid_search.best_index_)
print(" n split", grid_search.n_splits_)
print("refit time", grid_search.refit_time_)


mejores parámetros: {'max_depth': 4, 'max_features': 4}
best index: 18
 n split 5
refit time 0.007996082305908203


In [9]:
# best score
print(grid_search.best_estimator_)
score = grid_search.best_score_
print("best score:", -score)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=4,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
best score: 713698124.9712428


In [10]:
#RMSE
def nmsq2rmse(score):
    return np.round(np.sqrt(-score), 2)

In [11]:
np.sqrt(-score)

26715.129140081706

__find the best model for these paramters__

* `"min_samples_split": [2, 10, 20]`
* `"max_depth": [None, 2, 5, 10, 15]`
* `"min_samples_leaf": [1, 5, 10, 15]`
* `"max_leaf_nodes": [None, 5, 10, 20]`

In [12]:
param_grid = [{'min_samples_split' : [2,10,20], 'max_depth': [None,2,5,10,15], 'min_samples_leaf':[1,5,10,15], 'max_leaf_nodes':[None,5,10,20]}]

In [13]:
tree_reg = DecisionTreeRegressor()
grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

In [14]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [None, 2, 5, 10, 15],
                          'max_leaf_nodes': [None, 5, 10, 20],
                          'min_samples_leaf': [1, 5, 10, 15],
                          

In [15]:
score = grid_search.best_score_
print("best score:", -score)

best score: 459407773.74392384


In [16]:
optimised_decision_tree = grid_search.best_estimator_

In [17]:
print(optimised_decision_tree)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=15,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')


__Test model results__

In [18]:
from sklearn.metrics import mean_squared_error
y_opt_pred = optimised_decision_tree.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_opt_pred))
np.round(rmse)

21301.0

In [19]:
val_real = pd.Series(y_test.values)
val_pred = pd.Series(y_opt_pred)

In [20]:
predicciones = pd.concat([val_real.rename('Valor real'),val_pred.rename('Valor Pred') ,abs(val_real-val_pred).rename('Dif(+/-)')] ,  axis=1)

In [21]:
predicciones.head(10)

Unnamed: 0,Valor real,Valor Pred,Dif(+/-)
0,80000.0,103438.66,23438.66
1,128000.0,135705.882,7705.882
2,150000.0,156075.759,6075.759
3,85000.0,102400.991,17400.991
4,135000.0,135571.622,571.622
5,135000.0,109560.0,25440.0
6,68000.0,75181.25,7181.25
7,110000.0,140444.444,30444.444
8,134000.0,158431.25,24431.25
9,110000.0,76701.202,33298.798
