## 3. Machine Learning

### 3.1 Importando dados preprocessados

In [1]:
import pandas as pd

treino = pd.read_csv('https://raw.githubusercontent.com/mathmeza/calculadora-de-imoveis/main/analise_e_preprocessamento/treino_preprocessado.csv')
teste = pd.read_csv('https://raw.githubusercontent.com/mathmeza/calculadora-de-imoveis/main/analise_e_preprocessamento/teste_preprocessado.csv')

In [2]:
# dividindo os dados de treino e teste (X e y)

X_train = treino.drop('preco', axis=1)
y_train = treino['preco']
X_test = teste.drop('preco', axis=1)
y_test = teste['preco']

### 3.2 Criando um modelo de base para comparar os resultados (baseline) 

In [3]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.7569199963876833

In [4]:
from sklearn.dummy import DummyRegressor

reg = DummyRegressor(strategy='mean')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.00019786408720778859

### 3.3 Comparando diferentes modelos de regressão (estimadores)

In [5]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
# se quiser, instalar e usar LGBM e XGBoost de fora do sklearn
'''
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
'''

'\nfrom lightgbm import LGBMRegressor\nfrom xgboost import XGBRegressor\n'

In [6]:
reg_list = [RidgeCV(),
            SVR(),
            KNeighborsRegressor(),
            RandomForestRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor(),
            # LGBMRegressor(), 
            # XGBRegressor(objective='reg:squarederror')
            ]

In [7]:
from sklearn.model_selection import cross_val_score
import numpy as np

for reg in reg_list:
    print(f'Treinando o modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    print(f"R² treino: {train_score}")
    print(f"R² validação : {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R² teste: {test_score}")
    print('-'*70)

Treinando o modelo RidgeCV
R² treino: 0.77912097297986
R² validação : 0.77 +- 0.02
R² teste: 0.7592216188739551
----------------------------------------------------------------------
Treinando o modelo SVR
R² treino: 0.8146678006919513
R² validação : 0.80 +- 0.03
R² teste: 0.7810969464471332
----------------------------------------------------------------------
Treinando o modelo KNeighborsRegressor
R² treino: 0.8641699641573783
R² validação : 0.79 +- 0.04
R² teste: 0.7632908247548867
----------------------------------------------------------------------
Treinando o modelo RandomForestRegressor
R² treino: 0.940439737028946
R² validação : 0.78 +- 0.05
R² teste: 0.7303831905852619
----------------------------------------------------------------------
Treinando o modelo AdaBoostRegressor
R² treino: 0.836552902478224
R² validação : 0.77 +- 0.03
R² teste: 0.7521675986007754
----------------------------------------------------------------------
Treinando o modelo GradientBoostingRegressor
R²



R² treino: 0.8119131468055676
R² validação : 0.78 +- 0.02
R² teste: 0.7755215819869069
----------------------------------------------------------------------




In [8]:
# Bonus: testando com todos os regressores do sklearn
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

relatorio = {'nome':[],
             'train_score':[],
             'cv_scores_mean':[],
             'test_score':[],
             'estimador':[]
             }

ignore_list = ['IsotonicRegression',
 'MultiOutputRegressor',
 'ElasticNet',
 'MultiTaskElasticNet',
 'MultiTaskElasticNetCV',
 'MultiTaskLasso',
 'MultiTaskLassoCV',
 'RadiusNeighborsRegressor',
 'RegressorChain',
 'StackingRegressor',
 'VotingRegressor']


In [9]:
# Se quiser instalar e usar LGBM e XGBoost de fora do sklearn
'''
estimators.extend(
    [('LGBMRegressor', LGBMRegressor),
     ('XGBRegressor', XGBRegressor)]
)
'''

"\nestimators.extend(\n    [('LGBMRegressor', LGBMRegressor),\n     ('XGBRegressor', XGBRegressor)]\n)\n"

In [10]:
for name, RegressorClass in estimators:
    if name not in ignore_list:
        print(f'Treinando o modelo {name}')
        reg = RegressorClass()
        reg.fit(X_train, y_train)

        train_score = reg.score(X_train, y_train)
        cv_scores = cross_val_score(reg, X_train, y_train)
        test_score = reg.score(X_test, y_test)

        print(f"R² treino: {train_score}")
        print(f"R² validação: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
        print(f"R² teste: {test_score}")
        print('-'*70)

        relatorio['nome'].append(name)
        relatorio['train_score'].append(train_score)
        relatorio['cv_scores_mean'].append(np.mean(cv_scores))
        relatorio['test_score'].append(test_score)
        relatorio['estimador'].append(reg)

Treinando o modelo ARDRegression
R² treino: 0.7791000796099483
R² validação: 0.77 +- 0.02
R² teste: 0.7590805663269639
----------------------------------------------------------------------
Treinando o modelo AdaBoostRegressor
R² treino: 0.8327619814310112
R² validação: 0.78 +- 0.04
R² teste: 0.757773171400327
----------------------------------------------------------------------
Treinando o modelo BaggingRegressor
R² treino: 0.9389728096241663
R² validação: 0.76 +- 0.07
R² teste: 0.6951554658163829
----------------------------------------------------------------------
Treinando o modelo BayesianRidge
R² treino: 0.7791188340969176
R² validação: 0.77 +- 0.02
R² teste: 0.7591969921134486
----------------------------------------------------------------------
Treinando o modelo CCA
R² treino: 0.597097591074575
R² validação: 0.58 +- 0.07
R² teste: 0.6596563818746609
----------------------------------------------------------------------
Treinando o modelo DecisionTreeRegressor




R² treino: 0.9581925890960096
R² validação: 0.69 +- 0.06
R² teste: 0.6473531925243742
----------------------------------------------------------------------
Treinando o modelo DummyRegressor
R² treino: 0.0
R² validação: -0.01 +- 0.01
R² teste: -0.00019786408720778859
----------------------------------------------------------------------
Treinando o modelo ElasticNetCV
R² treino: 0.7791062348382283
R² validação: 0.77 +- 0.02
R² teste: 0.7591149701305635
----------------------------------------------------------------------
Treinando o modelo ExtraTreeRegressor
R² treino: 0.9581925890960096
R² validação: 0.71 +- 0.07
R² teste: 0.6589606522629096
----------------------------------------------------------------------
Treinando o modelo ExtraTreesRegressor
R² treino: 0.9581925890960096
R² validação: 0.75 +- 0.07
R² teste: 0.7411307336517083
----------------------------------------------------------------------
Treinando o modelo GaussianProcessRegressor
R² treino: 0.899271423683641
R² valid



R² treino: 0.774505230801657
R² validação: 0.77 +- 0.02
R² teste: 0.7598395401141352
----------------------------------------------------------------------
Treinando o modelo MLPRegressor




R² treino: 0.809025143165431
R² validação: 0.77 +- 0.03
R² teste: 0.773868409371856
----------------------------------------------------------------------
Treinando o modelo NuSVR
R² treino: 0.8134794716614516
R² validação: 0.80 +- 0.03
R² teste: 0.7792101391401871
----------------------------------------------------------------------
Treinando o modelo OrthogonalMatchingPursuit
R² treino: 0.7029581364120931
R² validação: 0.69 +- 0.03
R² teste: 0.6593504107743812
----------------------------------------------------------------------
Treinando o modelo OrthogonalMatchingPursuitCV
R² treino: 0.7791308035372333
R² validação: 0.77 +- 0.02
R² teste: 0.7594451182552165
----------------------------------------------------------------------
Treinando o modelo PLSCanonical
R² treino: 0.41113981407952715
R² validação: 0.39 +- 0.08
R² teste: 0.5074603785387958
----------------------------------------------------------------------
Treinando o modelo PLSRegression
R² treino: 0.7707866801545556
R² v



R² treino: 0.9407772680030854
R² validação: 0.78 +- 0.05
R² teste: 0.7399015430283471
----------------------------------------------------------------------
Treinando o modelo Ridge
R² treino: 0.7791209729798616
R² validação: 0.77 +- 0.02
R² teste: 0.7592216188740066
----------------------------------------------------------------------
Treinando o modelo RidgeCV
R² treino: 0.77912097297986
R² validação: 0.77 +- 0.02
R² teste: 0.7592216188739551
----------------------------------------------------------------------
Treinando o modelo SGDRegressor
R² treino: 0.7390788843707955
R² validação: 0.70 +- 0.06
R² teste: 0.7327500803014663
----------------------------------------------------------------------
Treinando o modelo SVR
R² treino: 0.8146678006919513
R² validação: 0.80 +- 0.03
R² teste: 0.7810969464471332
----------------------------------------------------------------------
Treinando o modelo TheilSenRegressor
R² treino: 0.7299498881534694
R² validação: 0.73 +- 0.02
R² teste: 0.7116

In [11]:
relatorio = pd.DataFrame(relatorio).sort_values(by='cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
11,GradientBoostingRegressor,0.89291,0.814103,0.741531,"([DecisionTreeRegressor(ccp_alpha=0.0, criteri..."
26,NuSVR,0.813479,0.804268,0.77921,"NuSVR(C=1.0, cache_size=200, coef0=0.0, degree..."
37,SVR,0.814668,0.803894,0.781097,"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3..."
12,HistGradientBoostingRegressor,0.879564,0.793807,0.769575,HistGradientBoostingRegressor(l2_regularizatio...
14,KNeighborsRegressor,0.86417,0.787235,0.763291,"KNeighborsRegressor(algorithm='auto', leaf_siz..."
33,RandomForestRegressor,0.940777,0.782345,0.739902,"(DecisionTreeRegressor(ccp_alpha=0.0, criterio..."
1,AdaBoostRegressor,0.832762,0.778417,0.757773,"(DecisionTreeRegressor(ccp_alpha=0.0, criterio..."
13,HuberRegressor,0.777473,0.771618,0.759626,"HuberRegressor(alpha=0.0001, epsilon=1.35, fit..."
39,TransformedTargetRegressor,0.77887,0.771164,0.75692,"TransformedTargetRegressor(check_inverse=True,..."
23,LinearRegression,0.77887,0.771164,0.75692,"LinearRegression(copy_X=True, fit_intercept=Tr..."


### 3.4 TBD: calibrando melhores estimadores usando GridSearchCV

In [12]:
## Usar GridSearchCV para calibrar os melhores estimadores

### 3.5 (Opcional) Combinando os melhores estimadores usando stacking

In [13]:
from sklearn.ensemble import StackingRegressor

top_3_regs = relatorio[['nome', 'estimador']].values[:3]

reg = StackingRegressor(
    estimators = top_3_regs
)

reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R² treino: {train_score}")
print(f"R² validação: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R² teste: {test_score}")
print('-'*70)

R² treino: 0.8729428507971804
R² validação: 0.82 +- 0.04
R² teste: 0.7691858202336412
----------------------------------------------------------------------


In [14]:
reg

StackingRegressor(cv=None,
                  estimators=array([['GradientBoostingRegressor',
        GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=...
        NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)],
       ['SVR',
        SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)]],
      dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

## 4. Salvando o modelo

In [15]:
import pickle

pickle.dump(reg, open('modelo.pkl', 'wb'), protocol=4)

In [16]:
ls

modelo.pkl  [0m[01;34msample_data[0m/


In [17]:
%reset -f

In [18]:
import pickle

reg = pickle.load(open('modelo.pkl', 'rb'))

In [19]:
reg

StackingRegressor(cv=None,
                  estimators=array([['GradientBoostingRegressor',
        GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=...
        NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)],
       ['SVR',
        SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)]],
      dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

In [20]:
import numpy as np

dados_novos = [0, 0, 0, 1, np.log1p(2), np.log1p(120)]

np.expm1(reg.predict([dados_novos]))

array([3484.22989095])