In [1]:
import pickle as pk
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
## Read the dataset from the CSV file and display it
dados = pd.read_csv("../data/casas.csv")
dados

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000
...,...,...,...,...
1455,153.0,1999,2,175000
1456,193.0,1978,2,210000
1457,217.0,1941,1,266500
1458,100.0,1950,1,142125


In [3]:
# Count the missing values in each column
dados.isna().sum()

tamanho    0
ano        0
garagem    0
preco      0
dtype: int64

In [4]:
# Show the dtype pandas assigned to each column
dados.dtypes

tamanho    float64
ano          int64
garagem      int64
preco        int64
dtype: object

## Dividindo o DataFrame nas variáveis preditoras e na nossa variável alvo

In [5]:
# Separate the predictors (every column except "preco") from the target ("preco")
x = dados.drop(columns="preco")
y = dados["preco"]

In [6]:
# Display the predictor frame
x

Unnamed: 0,tamanho,ano,garagem
0,159.0,2003,2
1,117.0,1976,2
2,166.0,2001,2
3,160.0,1915,3
4,204.0,2000,3
...,...,...,...
1455,153.0,1999,2
1456,193.0,1978,2
1457,217.0,1941,1
1458,100.0,1950,1


In [7]:
# Display the target series
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: preco, Length: 1460, dtype: int64

## Divisão em treino e teste

In [8]:
# Hold out 30% of the rows as the test split; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1234,
)

In [9]:
# Display the training predictors
x_train

Unnamed: 0,tamanho,ano,garagem
1017,126.0,1984,2
405,151.0,1976,2
6,157.0,2004,2
388,137.0,1999,2
501,164.0,2005,2
...,...,...,...
1228,158.0,2008,3
1077,102.0,1969,1
1318,166.0,2001,3
723,137.0,1954,1


In [85]:
# Display the test predictors
x_test

Unnamed: 0,tamanho,ano,garagem
605,185.0,1965,2
642,251.0,1972,2
993,139.0,2005,2
736,97.0,1950,2
1239,138.0,2006,2
...,...,...,...
805,133.0,2008,2
112,250.0,2007,3
348,151.0,2003,2
205,134.0,1990,2


## Instanciando os algoritmos de machine learning

In [10]:
# Candidate regressors; each estimator that accepts a seed gets the same fixed one
gb = GradientBoostingRegressor(random_state=1234)
rf = RandomForestRegressor(
    random_state=1234,
    max_depth=8,
    n_estimators=190,
)
dt = DecisionTreeRegressor(random_state=1234, max_depth=10)
lm = LinearRegression()

In [11]:
def metrics(y_true, y_pred):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'rmse'``, ``'mape'``, ``'r2'`` and ``'mae'``. ``'mape'`` is
        returned as a fraction (0.15 means 15%), not multiplied by 100.
        A zero in ``y_true`` makes ``'mape'`` non-finite.
    """
    # Work on plain arrays so the element-wise arithmetic below is positional;
    # pandas objects would otherwise be aligned by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # Divide by |y_true| so negative targets cannot contribute negative error terms.
    mape = np.mean(np.abs(y_pred - y_true) / np.abs(y_true))
    r2 = r2_score(y_true, y_pred)
    return {'rmse': rmse, 'mape': mape, 'r2': r2, 'mae': mae}

## Ajustando e gerando as predições

In [12]:
## Fit each estimator on the training split and predict the test split.
## `fit` returns the fitted estimator itself, so both calls are chained per model.
y_pred_lm = lm.fit(x_train, y_train).predict(x_test)
y_pred_rf = rf.fit(x_train, y_train).predict(x_test)
y_pred_gb = gb.fit(x_train, y_train).predict(x_test)
y_pred_dt = dt.fit(x_train, y_train).predict(x_test)

## Obtendo as métricas de cada modelo

In [13]:
## Test-set metrics for the gradient boosting model
metrics(y_true=y_test, y_pred=y_pred_gb)

{'rmse': 35990.14511783617,
 'mape': 0.14561870275808456,
 'r2': 0.7408378597152022,
 'mae': 24239.664238387923}

In [14]:
## Test-set metrics for the random forest model
metrics(y_true=y_test, y_pred=y_pred_rf)

{'rmse': 34525.39120530156,
 'mape': 0.14199846114191983,
 'r2': 0.761503736820618,
 'mae': 23488.123638116384}

In [15]:
## Test-set metrics for the linear regression model
metrics(y_true=y_test, y_pred=y_pred_lm)

{'rmse': 39186.53798532828,
 'mape': 0.17328121157519924,
 'r2': 0.6927597177686979,
 'mae': 28858.425280647614}

In [16]:
## Test-set metrics for the decision tree model
metrics(y_true=y_test, y_pred=y_pred_dt)

{'rmse': 42222.6735526058,
 'mape': 0.16946319623038944,
 'r2': 0.6433059858455655,
 'mae': 28508.438300179096}

## Salvando o Modelo de ML escolhido

In [93]:
# Persist the fitted random forest (`rf`) to disk with pickle.
# `pk` and `Path` are imported in the notebook's first (imports) cell.
model_path = Path('../models/model_rf.pkl')
# Create the target directory when it is missing; open() would otherwise
# raise FileNotFoundError (e.g. on a fresh checkout without ../models).
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: unpickling executes arbitrary code, so load this file only from a trusted location.
with open(model_path, 'wb') as arquivo_model:
    pk.dump(rf, arquivo_model)
