In [None]:
# split_features_target is defined elsewhere in the notebook; it is unpacked
# here as (train features, train target, test features, test target).
X_train, y_train, X_test, y_test = split_features_target(train_data, test_data)

# We will use the log10 of the target.
y_train, y_test = np.log10(y_train), np.log10(y_test)

In [None]:
# Numeric predictors. The order is significant: ComputeFeatures addresses the
# matrix built from these columns by position (total_rooms=3,
# total_bedrooms=4, population=5, households=6).
numerical_cols = [
    'longitude', 'latitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households',
    'median_income',
]

# Categorical predictors; these are one-hot encoded by the preprocessing step.
categorical_cols = ['ocean_proximity']

In [None]:
# List every feature column next to its positional index; useful when a
# transformer has to address columns by position instead of by name.
for position, column_name in enumerate(X_train.columns):
    print(position, column_name)


In [None]:
from typing import Any, Self

from sklearn.base import BaseEstimator, TransformerMixin


class ComputeFeatures(BaseEstimator, TransformerMixin):
    '''Appends ratio features derived from existing numeric columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns the input columns followed by four derived columns, in this
    order: rooms per household, bedrooms per household, bedrooms per room
    and population per household.

    Source columns are addressed by position (see the ``*_COLUMN``
    constants), so the input must keep the column layout of
    ``numerical_cols``. Zero denominators are not guarded against; plain
    NumPy division semantics apply.
    '''
    # Positional indices of the source columns inside the input matrix.
    TOTAL_ROOMS_COLUMN = 3
    TOTAL_BEDROOMS_COLUMN = 4
    POPULATION_COLUMN = 5
    HOUSEHOLDS_COLUMN = 6

    def fit(self, X: np.ndarray, y: Any = None) -> Self:
        '''Does nothing; present to satisfy the scikit-learn estimator API.

        Args:
            X: The input features (ignored).
            y: The target (ignored).

        Returns:
            The transformer itself.
        '''
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        '''Returns ``X`` with the four ratio columns appended on the right.

        Args:
            X: A 2-D array-like whose columns 3 to 6 hold total rooms, total
                bedrooms, population and households respectively.

        Returns:
            A 2-D array with the original columns followed by the derived
            ones.
        '''
        # Accept any 2-D array-like (e.g. a DataFrame), not just ndarrays:
        # the positional slicing below only works on ndarrays.
        X = np.asarray(X)

        total_rooms = X[:, self.TOTAL_ROOMS_COLUMN]
        total_bedrooms = X[:, self.TOTAL_BEDROOMS_COLUMN]
        population = X[:, self.POPULATION_COLUMN]
        households = X[:, self.HOUSEHOLDS_COLUMN]

        rooms_per_household = total_rooms / households
        bedrooms_per_household = total_bedrooms / households
        bedrooms_per_room = total_bedrooms / total_rooms
        population_per_household = population / households

        return np.column_stack((
            X,
            rooms_per_household,
            bedrooms_per_household,
            bedrooms_per_room,
            population_per_household,
        ))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Numeric branch: fill missing values with the column median first, so the
# ratio features are computed on complete data, then standardize everything
# (original and derived columns alike).
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('features', ComputeFeatures()),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode. handle_unknown='ignore' makes a category
# that was absent from the fitted data encode as all zeros instead of raising
# at transform time, which can otherwise happen with rare categories once the
# data is split into train/validation/test.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder).
preprocessing_pipe = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols),
])

## Escolha de modelos

In [None]:
# Hold out 20% of the training data for validation. The seed is fixed so the
# split, and every score computed from it, is reproducible across runs.
# NOTE: this cell overwrites X_train / y_train, so re-running it on its own
# shrinks the training set again; re-run from the cell that creates X_train.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: shared preprocessing followed by ordinary least squares.
# NOTE(review): preprocessing_pipe is the same object in every model pipeline,
# so fitting one pipeline refits the preprocessor seen by the others.
lin_reg = Pipeline(steps=[
    ('preprocessing', preprocessing_pipe),
    ('regression', LinearRegression()),
])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree on top of the shared preprocessing; seeded so the
# fitted tree is reproducible.
# NOTE(review): preprocessing_pipe is the same object in every model pipeline,
# so fitting one pipeline refits the preprocessor seen by the others.
tree_reg = Pipeline(steps=[
    ('preprocessing', preprocessing_pipe),
    ('regression', DecisionTreeRegressor(random_state=42)),
])

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on top of the shared preprocessing; seeded so the fitted
# ensemble is reproducible.
# NOTE(review): preprocessing_pipe is the same object in every model pipeline,
# so fitting one pipeline refits the preprocessor seen by the others.
forest_reg = Pipeline(steps=[
    ('preprocessing', preprocessing_pipe),
    ('regression', RandomForestRegressor(random_state=42)),
])

In [None]:
from sklearn.metrics import root_mean_squared_error
from sklearn.base import BaseEstimator


def train_and_evaluate_model(
    model: BaseEstimator,
    X_train: pd.DataFrame,
    y_train: np.ndarray | pd.Series,
    X_val: pd.DataFrame,
    y_val: np.ndarray | pd.Series,
) -> float:
    '''Fits a model on the training data and scores it on the validation data.

    The model is fitted in place, so the object passed in is left trained
    after the call.

    Args:
        model: A scikit-learn estimator exposing ``fit`` and ``predict``.
        X_train: The training features.
        y_train: The training target.
        X_val: The validation features.
        y_val: The validation target.

    Returns:
        The root mean squared error of the model on the validation set,
        expressed in the same units as the target that was passed in.
    '''
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Cast so the result is a plain Python float, as the signature promises.
    return float(root_mean_squared_error(y_val, y_pred))


In [None]:
# Fit and score each candidate, in order, on the same train/validation split.
rmse_lin_reg, rmse_tree_reg, rmse_forest_reg = (
    train_and_evaluate_model(candidate, X_train, y_train, X_val, y_val)
    for candidate in (lin_reg, tree_reg, forest_reg)
)

In [None]:
# Validation RMSE per model, one per line (log10 units, since the target was
# log10-transformed).
print(
    f'Linear Regression RMSE: {rmse_lin_reg:.4f}',
    f'Decision Tree RMSE: {rmse_tree_reg:.4f}',
    f'Random Forest RMSE: {rmse_forest_reg:.4f}',
    sep='\n',
)


In [None]:
# The RMSE values are in log10 units, so 10**rmse is a multiplicative error
# factor on the original scale; subtracting 1 and scaling by 100 expresses it
# as a percentage.
(
    percentage_error_lin_reg,
    percentage_error_tree_reg,
    percentage_error_forest_reg,
) = (
    100.0 * (10.0**log_rmse - 1.0)
    for log_rmse in (rmse_lin_reg, rmse_tree_reg, rmse_forest_reg)
)

In [None]:
# Percentage error per model, one per line.
print(
    f'Linear Regression Percentage Error: {percentage_error_lin_reg:.2f}%',
    f'Decision Tree Percentage Error: {percentage_error_tree_reg:.2f}%',
    f'Random Forest Percentage Error: {percentage_error_forest_reg:.2f}%',
    sep='\n',
)


## Avaliação