In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# Read the three raw CSV files, using the first column of each as the index.
train_raw = pd.read_csv("assets/train.csv", index_col=0)
test_raw = pd.read_csv("assets/test.csv", index_col=0)
submission_raw = pd.read_csv("assets/submission_example.csv", index_col=0)

# Attach the submission-example columns to the test rows (column-wise), then
# stack the result underneath the training rows.
# NOTE(review): this uses the submission example as the target for the test
# rows — confirm those values are real labels rather than placeholders.
test_labeled = pd.concat([test_raw, submission_raw], axis=1)
df_base = pd.concat([train_raw, test_labeled])

# Positional split: every column except the last is a feature; the last column
# is the target, kept as a one-column DataFrame copy.
features = df_base.iloc[:, :-1]
target = df_base.iloc[:, -1:].copy()

In [3]:
# House characteristics (model inputs) as a NumPy array.
X = features.to_numpy()

# Known house prices (regression target) as a NumPy array.
y = target.to_numpy()

# Modeling Techniques

1. [LinearRegression from SkLearn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression)
2. [Support Vector Regression from SkLearn](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)
3. [Gradient Boosted Trees Regression (XGBRegressor) from XGBoost](https://xgboost.readthedocs.io/en/release_3.0.0/python/python_api.html)

# Modeling Assumption

1. Only numeric values

# Test Design

## Dataset Split:

1. Separação padrão do dataset em treino/teste, com 20% dos dados reservados para teste, via método `train_test_split` do SkLearn

## Métrica de avaliação do modelo:

1. Avaliação pelas métricas MSE e RMSE, que penalizam mais fortemente os grandes erros de previsão
2. [Utilizando o método do SkLearn](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error)

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

## Técnica 1 - LinearRegression from SkLearn

In [5]:
# Fit a linear regression with default settings on the training split.
reg_linear = LinearRegression().fit(X=X_train, y=y_train)

In [6]:
# Predict targets for the held-out test rows with the linear model.
y_pred_linear = reg_linear.predict(X=X_test)

In [7]:
# Mean squared error of the linear model on the test split.
MSE_linear = mean_squared_error(y_true=y_test, y_pred=y_pred_linear)

In [8]:
# Report MSE and its square root (RMSE) for the linear model.
RMSE_linear = np.sqrt(MSE_linear)
print("MSE: ", MSE_linear)
print("RMSE: ", RMSE_linear)

MSE:  28.707330758450674
RMSE:  5.357922242665591


## Técnica 2 - SVR from SkLearn

In [9]:
# Fit a support vector regressor with default settings; the target is
# flattened to 1-D before being passed to fit().
# NOTE(review): the features are passed unscaled — consider standardizing them
# first, since that may affect SVR's results; confirm against the data.
reg_svr = SVR().fit(X=X_train, y=np.ravel(y_train))

In [10]:
# Predict targets for the held-out test rows with the SVR model.
y_pred_svr = reg_svr.predict(X=X_test)

In [11]:
# Mean squared error of the SVR model on the test split.
MSE_svr = mean_squared_error(y_true=y_test, y_pred=y_pred_svr)

In [12]:
# Report MSE and its square root (RMSE) for the SVR model.
RMSE_svr = np.sqrt(MSE_svr)
print("MSE: ", MSE_svr)
print("RMSE: ", RMSE_svr)

MSE:  46.10052052274237
RMSE:  6.789736410402275


## Técnica 3 - Gradient Boosted Trees Regression from XGBoost

In [13]:
# Fit an XGBoost regressor with default hyperparameters as a baseline.
reg_xgb = XGBRegressor().fit(X=X_train, y=y_train)

In [14]:
# Predict targets for the held-out test rows with the baseline XGBoost model.
y_pred_xgb = reg_xgb.predict(X=X_test)

In [15]:
# Mean squared error of the baseline XGBoost model on the test split.
MSE_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)

In [16]:
# Report MSE and its square root (RMSE) for the baseline XGBoost model.
RMSE_xgb = np.sqrt(MSE_xgb)
print("MSE: ", MSE_xgb)
print("RMSE: ", RMSE_xgb)

MSE:  33.95657360367545
RMSE:  5.827226922274046


# Otimização de Hiperparâmetros

* [Utilizando o método GridSearchCV do SkLearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [17]:
# List the parameter names exposed by the fitted XGBoost regressor; these are
# the keys that can be used in a hyperparameter grid.
reg_xgb.get_params().keys()

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'feature_types', 'feature_weights', 'gamma', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_threshold', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'multi_strategy', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [18]:
# Hyperparameter grid for the XGBoost search: 6 * 5 * 2 * 3 * 2 * 2 = 720
# candidate combinations.
# Single-value entries (objective, booster, n_jobs) are fixed settings rather
# than tuned values; they stay in the grid so every candidate receives them.
parameters = {
    "max_depth": [4, 5, 6, 7, 8, 9],
    # Strictly positive learning rates only: a rate of 0 scales every boosting
    # update to nothing, so those candidates would spend fits on a model that
    # never learns.
    "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.4],
    "objective": ["reg:squarederror"],
    "booster": ["gbtree"],
    "n_jobs": [5],
    "gamma": [0, 1],
    "min_child_weight": [1, 2, 3],
    "max_delta_step": [0, 1],
    "subsample": [0.5, 1],
}

In [19]:
# Exhaustive grid search over `parameters` for an XGBoost regressor.
# The ranking metric must be given through `scoring`: negated MSE is used so
# the search optimizes the same error the notebook reports (higher is better).
# `refit=True` retrains the best candidate on the full training data so the
# fitted search object can be used directly for prediction.
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    refit=True,
    verbose=1,
)

In [20]:
# Run the grid search on the training split (this is the expensive cell: one
# fit per candidate per cross-validation fold).
xgb_grid_model = xgb_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


In [21]:
# Show the hyperparameter combination selected by the grid search.
xgb_grid_model.best_params_

{'booster': 'gbtree',
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 3,
 'n_jobs': 5,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [22]:
# Predict targets for the held-out test rows with the tuned search model.
y_grid = xgb_grid_model.predict(X=X_test)

In [23]:
# Mean squared error of the tuned XGBoost model on the test split.
MSE_grid = mean_squared_error(y_true=y_test, y_pred=y_grid)

In [24]:
# Report MSE and its square root (RMSE) for the tuned XGBoost model.
RMSE_grid = np.sqrt(MSE_grid)
print("MSE: ", MSE_grid)
print("RMSE: ", RMSE_grid)

MSE:  22.85429204703328
RMSE:  4.780616283182878
