```
Notebook written by Sara Malvar 2021
Any issues, please send me an e-mail: malvar.sara@gmail.com```

## Testando sem *amenities*
### 1. Importação dos dados e preparação do dataframe

In [63]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [64]:
dados = pd.read_csv('df_final.csv')
dados = dados.drop(['url','header','address','crawler','crawled_at'],axis=1)
dados = dados.rename(columns = {'rooms': 'quartos', 'bathrooms': 'banheiros', 'price':'aluguel',
                               'neighborhood':'bairro','garages':'vagas'}, inplace = False)
dados.head()

Unnamed: 0,id,area,quartos,banheiros,vagas,amenities,aluguel,condo,bairro,zona
0,2512663008,30,1,1,1,Mobiliado\nPiscina\nAcademia\nAr-condicionado\...,3500.0,2.0,Vila Madalena,Oeste
1,2518591697,98,2,2,1,Aceita animais\nPortaria 24h,2600.0,1000.0,Perdizes,Oeste
2,2518566682,126,3,2,1,Condomínio fechado\nAceita animais\nPortaria 2...,3900.0,1640.0,Jardim Paulista,Oeste
3,2519041802,78,2,2,1,Elevador\nPortaria 24h,2500.0,761.0,Vila Madalena,Oeste
4,2514428582,186,4,4,3,Piscina\nVaranda gourmet\nAcademia\nChurrasque...,7500.0,1730.0,Vila Suzana,Oeste


In [65]:
dados.describe()

Unnamed: 0,id,area,quartos,banheiros,aluguel,condo
count,5336.0,5336.0,5336.0,5336.0,5336.0,5336.0
mean,2505919454.11,85.72,2.1,2.02,3944.38,1821.93
std,106374811.78,57.24,0.89,1.18,3398.02,34084.59
min,51091425.0,11.0,1.0,1.0,500.0,1.0
25%,2510520771.75,50.0,1.0,1.0,2000.0,560.0
50%,2515470671.0,70.0,2.0,2.0,2900.0,800.0
75%,2517988074.0,100.0,3.0,3.0,4490.0,1200.0
max,2519594378.0,623.0,6.0,7.0,25000.0,1686000.0


Vou remover alguns outliers. Primeiramente, no valor do condomínio, depois transformar as vagas em dados numpéricos e remover seus outliers também (valor máximo > 10). Por fim, vou remover algumas variáveis que acho desnecessárias, incluindo as *amenities*.

In [66]:
dados = dados[dados['zona'] != 'nenhuma']
dados.vagas = pd.to_numeric(dados.vagas, errors='coerce')
dados.vagas = dados.vagas.fillna(0)
dados = dados[dados['condo'] <= 10000]
dados = dados[dados['vagas'] <= 10]

In [67]:
dados_sem = dados.drop(['id','amenities','bairro'],axis=1)

### 2. Iniciando a comparação dos modelos utilizando a `lazypredict`

In [68]:
import lazypredict
import numpy as np
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

In [69]:
y = dados_sem['aluguel']
X = dados_sem.drop(['aluguel'],axis=1)
X = pd.get_dummies(X)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [71]:
reg = LazyRegressor(predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:18<00:00,  2.29it/s]


In [72]:
models.head(5)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.68,0.68,1815.93,0.47
RandomForestRegressor,0.67,0.67,1834.94,1.0
LGBMRegressor,0.67,0.67,1842.44,0.17
ExtraTreesRegressor,0.66,0.67,1855.5,0.68
BaggingRegressor,0.64,0.65,1908.12,0.09


Considerando a facilidade em entender o modelo, vamos tunar os hiperparâmetros do `RandomForestRegressor`.

In [73]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

RandomForestRegressor()

In [74]:
from sklearn.model_selection import GridSearchCV
parameters=[{'min_samples_leaf':[1,2,4],
             'max_depth':[4,6,8,12,20],
             'n_estimators':[50, 100, 200,400]}]
            
rf = RandomForestRegressor(n_estimators=100, max_depth=1)
gs = GridSearchCV(rf,parameters,scoring='r2',n_jobs=-1,cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(max_depth=1), n_jobs=-1,
             param_grid=[{'max_depth': [4, 6, 8, 12, 20],
                          'min_samples_leaf': [1, 2, 4],
                          'n_estimators': [50, 100, 200, 400]}],
             scoring='r2')

In [93]:
gs.best_params_

{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 200}

In [94]:
reg = gs.best_estimator_

In [95]:
reg

RandomForestRegressor(max_depth=8, min_samples_leaf=2, n_estimators=200)

In [77]:
from sklearn.model_selection import cross_val_score
train_score = reg.score(X_train, y_train)
val_score = cross_val_score(reg, X_train, y_train, cv=5)
test_score = reg.score(X_test, y_test)
print('Train score', train_score)
print('Validation score', val_score)
print('Test score', test_score)

Train score 0.8842886902861558
Validation score [0.66057753 0.66492714 0.66831065 0.66670708 0.6182376 ]
Test score 0.6785709459397588


In [96]:
import pickle
filename = 'RF_sem-amenities.pkl'
pickle.dump(reg, open(filename, 'wb'))

## Testando com *amenities*
### 1. Importação dos dados e preparação do dataframe

In [78]:
amenities = dados['amenities'].str.split('\n', expand=True)
amenities.head(10)

Unnamed: 0,0,1,2,3,4
0,Mobiliado,Piscina,Academia,Ar-condicionado,Elevador
1,Aceita animais,Portaria 24h,,,
2,Condomínio fechado,Aceita animais,Portaria 24h,Portão eletrônico,
3,Elevador,Portaria 24h,,,
4,Piscina,Varanda gourmet,Academia,Churrasqueira,Ar-condicionado
5,Piscina,Academia,Elevador,Aceita animais,Playground
6,Aceita animais,Jardim,Portaria 24h,,
7,Piscina,Academia,Ar-condicionado,Elevador,Aceita animais
8,Mobiliado,Piscina,Academia,Churrasqueira,Elevador
9,Mobiliado,Piscina,Varanda gourmet,Academia,Churrasqueira


In [79]:
dados['Piscina'] = dados['amenities'].apply(lambda x: 1 if 'Piscina' in x else 0)
dados['Elevador'] = dados['amenities'].apply(lambda x: 1 if 'Elevador' in x else 0)
dados['Churrasqueira'] = dados['amenities'].apply(lambda x: 1 if 'Churrasqueira' in x else 0)
dados['Varanda Gourmet'] = dados['amenities'].apply(lambda x: 1 if 'Varanda Gourmet' in x else 0)
dados['Salao de festas'] = dados['amenities'].apply(lambda x: 1 if 'Salão de festas' in x else 0)
dados['Ar-condicionado'] = dados['amenities'].apply(lambda x: 1 if 'Ar-condicionado' in x else 0)
dados['Academia'] = dados['amenities'].apply(lambda x: 1 if 'Academia' in x else 0)
dados['Lavanderia'] = dados['amenities'].apply(lambda x: 1 if 'Lavanderia' in x else 0)
dados['Portaria 24h'] = dados['amenities'].apply(lambda x: 1 if 'Portaria 24h' in x else 0)
dados['Jardim'] = dados['amenities'].apply(lambda x: 1 if 'Jardim' in x else 0)

In [80]:
dados_com = dados.drop(['id','amenities','bairro'],axis=1)

In [81]:
y = dados_com['aluguel']
X = dados_com.drop(['aluguel'],axis=1)
X = pd.get_dummies(X)

In [82]:
X.head()

Unnamed: 0,area,quartos,banheiros,vagas,condo,Piscina,Elevador,Churrasqueira,Varanda Gourmet,Salao de festas,Ar-condicionado,Academia,Lavanderia,Portaria 24h,Jardim,zona_Centro,zona_Leste,zona_Norte,zona_Oeste,zona_Sul
0,30,1,1,1.0,2.0,1,1,0,0,0,1,1,0,0,0,0,0,0,1,0
1,98,2,2,1.0,1000.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,126,3,2,1.0,1640.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,78,2,2,1.0,761.0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
4,186,4,4,3.0,1730.0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [84]:
reg = LazyRegressor(predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:19<00:00,  2.20it/s]


In [85]:
models.head(5)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,0.67,0.68,1833.69,1.45
HistGradientBoostingRegressor,0.66,0.67,1855.06,0.58
GradientBoostingRegressor,0.65,0.66,1880.99,0.44
ExtraTreesRegressor,0.64,0.65,1907.33,1.15
BaggingRegressor,0.64,0.65,1912.5,0.16


In [86]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

RandomForestRegressor()

In [87]:
from sklearn.model_selection import GridSearchCV
parameters=[{'min_samples_leaf':[1,2,4],
             'max_depth':[4,6,8],
             'n_estimators':[50, 100, 200,400]}]
            
rf = RandomForestRegressor()
gs = GridSearchCV(rf,parameters,scoring='r2',n_jobs=-1,cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'max_depth': [4, 6, 8], 'min_samples_leaf': [1, 2, 4],
                          'n_estimators': [50, 100, 200, 400]}],
             scoring='r2')

In [88]:
gs.best_params_

{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 200}

In [89]:
reg_com = gs.best_estimator_

In [91]:
train_score = reg_com.score(X_train, y_train)
val_score = cross_val_score(reg_com, X_train, y_train, cv=5)
test_score = reg_com.score(X_test, y_test)
print('Train score', train_score)
print('Validation score', val_score)
print('Test score', test_score)

Train score 0.8464710066908618
Validation score [0.67421274 0.668533   0.68421488 0.68672326 0.64466141]
Test score 0.6607737780824622
