## 1. Carregando os dados de treino e teste e separando features (X) e alvo (y)

In [1]:
import pandas as pd

In [2]:
# Load the pre-processed train and test splits (in that order) from the
# project's GitHub repository.
treino, teste = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/mathmeza/calculadora-de-imoveis/main/analise_e_preprocessamento/treino_preprocessado.csv',
        'https://raw.githubusercontent.com/mathmeza/calculadora-de-imoveis/main/analise_e_preprocessamento/teste_preprocessado.csv',
    )
)

In [3]:
# Print the (rows, columns) shape of the training frame, then preview its first rows.
print(treino.shape)
treino.head()

(540, 7)


Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area,preco
0,0,1,0,0,1.386294,5.484797,8.2943
1,1,0,0,0,1.386294,4.875197,7.901377
2,0,1,0,0,1.098612,5.01728,8.160804
3,0,1,0,0,1.098612,4.51086,7.170888
4,1,0,0,0,1.098612,4.330733,7.438972


In [4]:
# Print the (rows, columns) shape of the test frame, then preview its first rows.
print(teste.shape)
teste.head()

(180, 7)


Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area,preco
0,0,0,1,0,1.098612,4.70953,7.518064
1,0,1,0,0,1.609438,5.638355,8.810012
2,0,1,0,0,0.693147,3.713572,6.685861
3,0,0,1,0,1.098612,4.615121,7.601402
4,0,0,1,0,1.791759,5.70711,10.16589


In [5]:
# Features: every column except the target `preco`.
X_treino = treino.drop(columns=['preco'])
X_teste = teste.drop(columns=['preco'])

# Target: the `preco` column of each split.
y_treino = treino['preco']
y_teste = teste['preco']

## 2. Criando um modelo de regressão linear

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
# Create an unfitted linear regression estimator with scikit-learn's default settings.
reg = LinearRegression()

In [8]:
# Fit the linear model on the training features and target.
reg.fit(X_treino, y_treino)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
import numpy as np

# Example listing: 3 bedrooms, area 100, located in zona_norte.
n_quartos = 3
area = 100

# Build the row by column name and align it to the training columns, so the
# prediction cannot silently break if the column order ever changes.
# `quartos` and `area` get log1p applied because the model was trained on
# log1p-transformed inputs.
exemplo = pd.DataFrame(
    [{
        'zona_leste': 0,
        'zona_norte': 1,
        'zona_oeste': 0,
        'zona_sul': 0,
        'quartos': np.log1p(n_quartos),
        'area': np.log1p(area),
    }],
    columns=X_treino.columns,
)

# Prediction on the model's output scale; apply np.expm1 to get the price back.
log1_pred = reg.predict(exemplo)

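Estamos lidando com um modelo log-log: tanto as entradas numéricas (quartos e área) quanto o alvo (preço) estão na escala log(1 + x). Por isso aplicamos `np.log1p` nas entradas antes de prever e `np.expm1` (exponencial − 1) na saída para voltar à escala original do preço.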

In [10]:
# Back-transform the single prediction with expm1 (the inverse of log1p).
np.expm1(log1_pred)

array([1938.74635759])

In [11]:
# Predict for every test row; outputs are on the same scale as the training target.
log1_predicoes = reg.predict(X_teste)

In [12]:
# Back-transform all test predictions with expm1 (the inverse of log1p).
predicoes = np.expm1(log1_predicoes)

In [13]:
# Preview the first rows of the test features.
X_teste.head()

Unnamed: 0,zona_leste,zona_norte,zona_oeste,zona_sul,quartos,area
0,0,0,1,0,1.098612,4.70953
1,0,1,0,0,1.609438,5.638355
2,0,1,0,0,0.693147,3.713572
3,0,0,1,0,1.098612,4.615121
4,0,0,1,0,1.791759,5.70711


In [14]:
# Preview the first test targets.
y_teste.head()

0     7.518064
1     8.810012
2     6.685861
3     7.601402
4    10.165890
Name: preco, dtype: float64

In [15]:
# Re-apply log1p to the first back-transformed prediction, undoing the expm1 applied earlier.
np.log1p(predicoes[0])

7.9921875

In [16]:
# Residual for the first test row on the log1p scale: true target minus prediction.
# Computed from the variables rather than hand-copied, rounded literals, so the
# value stays correct when the data or the model changes.
y_teste.iloc[0] - np.log1p(predicoes[0])

-0.47412350000000014

In [17]:
reg.score(X_teste, y_teste) # R² of the linear regression on the test set

0.7569199963876833

## 3. Criando um modelo de base (baseline, benchmark)


In [18]:
from sklearn.dummy import DummyRegressor

In [19]:
# Baseline that ignores the features and always predicts the mean of the training target.
regressor_boboca = DummyRegressor(strategy='mean')

In [20]:
# Fit the baseline on the training data (same call signature as a real model).
regressor_boboca.fit(X_treino, y_treino)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [21]:
# R² of the baseline on the test set; any useful model should beat this.
regressor_boboca.score(X_teste, y_teste)

-0.00019786408720778859

In [22]:
pred_boboca = regressor_boboca.predict(X_teste) # predicting only takes the test features (X); no targets are passed

Quando a gente usa o R² de dentro do pacote de métricas do sklearn (ou qualquer
outra métrica do pacote metrics) passamos o y_true (y_teste) e os valores preditos (y_pred).

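O método `.score` dos regressores do sklearn calcula esse mesmo R²: ele faz a predição a partir de X e compara com y, o que equivale a chamar `r2_score(y_teste, modelo.predict(X_teste))`.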

In [23]:
from sklearn.metrics import r2_score

# Same R² as `regressor_boboca.score`, computed explicitly from the true
# targets and the baseline's predictions.
r2_score(y_true=y_teste, y_pred=pred_boboca)

-0.00019786408720778859

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed, fitted on the
# training split (`fit` returns the estimator itself, so the call can be chained).
rfn = RandomForestRegressor(random_state=100).fit(X_treino, y_treino)

# R² on the test split.
rfn.score(X_teste, y_teste)

0.7339740032879546

In [34]:
# 42, 1, 100, 0 ... (presumably candidate seeds) -- to be covered in the next lesson.
from sklearn.model_selection import GridSearchCV

# Search space: 4 x 5 x 4 = 80 candidate combinations.
# NOTE(review): `random_state` is searched as if it were a hyperparameter, so
# the "best" result partly reflects a lucky seed -- confirm this is intended.
parametros = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 5, 10, 15, 30],
    random_state=[20, 50, 42, 100],
)

# Instantiate the grid search around the forest and fit it on the training data.
grid = GridSearchCV(estimator=rfn, param_grid=parametros, verbose=2).fit(X_treino, y_treino)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=20 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=20, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .... max_depth=3, n_estimators=50, random_state=50, total=   0.1s
[CV] max_depth=3, n_estimators=50, random_state=50 ...................
[CV] .

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.5min finished


In [35]:
# Estimator built with the best parameter combination found by the search.
grid.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=20, verbose=0, warm_start=False)

In [36]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

{'max_depth': 5, 'n_estimators': 200, 'random_state': 20}

In [37]:
# Mean cross-validated score of the best combination (computed on training folds, not on the test set).
grid.best_score_

0.8022553680351953

In [38]:
# Tabulate the per-candidate cross-validation results for inspection.
resultados_cv = pd.DataFrame(grid.cv_results_)

In [39]:
# Preview the first candidates: timings, parameters, per-fold scores and rank.
resultados_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.076327,0.006219,0.006564,0.000707,3,50,20,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.796425,0.774049,0.726078,0.799428,0.718326,0.762861,0.034424,80
1,0.077508,0.003955,0.008392,0.003327,3,50,50,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.804299,0.783414,0.724927,0.803252,0.718532,0.766885,0.037667,66
2,0.076986,0.00667,0.006211,0.000355,3,50,42,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.803221,0.782842,0.727799,0.794451,0.711847,0.764032,0.037016,78
3,0.077309,0.004968,0.00625,0.000429,3,50,100,"{'max_depth': 3, 'n_estimators': 50, 'random_s...",0.80255,0.775365,0.723566,0.803506,0.713712,0.76374,0.038313,79
4,0.152613,0.006756,0.010248,0.000574,3,100,20,"{'max_depth': 3, 'n_estimators': 100, 'random_...",0.795368,0.778754,0.72946,0.79919,0.722381,0.765031,0.032741,76


### Versão alternativa do grid search

In [41]:
# Base forest with default hyperparameters; used as the estimator for the alternative grid search.
rf = RandomForestRegressor()

In [42]:
# 42, 1, 100, 0 ... (presumably candidate seeds) -- to be covered in the next lesson.
from sklearn.model_selection import GridSearchCV

# Search space: 4 x 3 x 5 = 60 candidate combinations.
# NOTE(review): `random_state` is searched as if it were a hyperparameter --
# confirm this is intended.
parametros = dict(
    n_estimators=[10, 20, 50, 100],
    max_depth=[5, 10, 20],
    random_state=[5, 10, 42, 100, 200],
)

# Unfitted search object; fitting happens in a separate cell.
rfr_tunado = GridSearchCV(estimator=rf, param_grid=parametros, verbose=1)

In [43]:
# Run the cross-validated search over every parameter combination on the training data.
rfr_tunado.fit(X_treino, y_treino)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   24.6s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [44]:
# Parameter combination with the best mean cross-validated score in the alternative search.
rfr_tunado.best_params_

{'max_depth': 5, 'n_estimators': 100, 'random_state': 5}

In [45]:
# Mean cross-validated score of that best combination (not a test-set score).
rfr_tunado.best_score_

0.8038876354292409

## 4. Salvando o modelo (exportando o modelo)

In [46]:
import pickle

In [47]:
# Persist the fitted forest to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
# NOTE(review): this saves `rfn` (default hyperparameters, random_state=100),
# not the tuned `grid.best_estimator_` / `rfr_tunado.best_estimator_` --
# confirm that this is the model meant to be exported.
with open('modelo_rf.pkl', 'wb') as arquivo_modelo:  # 'wb' = write binary
    pickle.dump(rfn, arquivo_modelo)

In [48]:
# Reload the model from disk to check that the export round-trips. The context
# manager closes the file handle as soon as loading finishes.
# Security: unpickling can execute arbitrary code -- only load pickle files
# that you created yourself or otherwise trust.
with open('modelo_rf.pkl', 'rb') as arquivo_modelo:  # 'rb' = read binary
    rfr = pickle.load(arquivo_modelo)

In [49]:
# Feature values of the first test row as a 1-D NumPy array.
primeiro_el_teste = X_teste.iloc[0].values

In [50]:
# Predict with the reloaded model; the row is wrapped in a list because
# predict expects a 2-D input (one row per sample).
rfr.predict([primeiro_el_teste])

array([7.34383086])

In [51]:
# True target of the same first test row, to compare against the prediction.
el_teste_verdadeiro = y_teste.values[0]

In [52]:
# Display the true value.
el_teste_verdadeiro

7.518064181233077

In [53]:
from sklearn.ensemble import RandomForestRegressor

# Rebuild and refit the same default forest (seed 100) used earlier in the
# notebook; `fit` returns the estimator itself, so the call can be chained.
rfn = RandomForestRegressor(random_state=100).fit(X_treino, y_treino)

# R² on the test split.
rfn.score(X_teste, y_teste)

0.7339740032879546

In [54]:
# TODO: restart the runtime and check whether the scores stay low or improve
# TODO: try other hyperparameters
# TODO: look at the tree plots to understand what may have happened with the decision