In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

In [10]:
# Input files, given relative to the notebook's working directory.
data_path = '../data/processed/data.csv'
dictionary_path = '../data/external/new_dictionary.csv'

# `df` is the modelling table; `df_dict` is the data dictionary that later
# cells query through its `variavel`, `tipo` and `subtipo` columns.
df = pd.read_csv(filepath_or_buffer=data_path)
df_dict = pd.read_csv(filepath_or_buffer=dictionary_path)

In [11]:
# Column that the models will predict.
target_variable = 'km_per_l'

# One data-dictionary query per variable role. `@target_variable` is resolved
# by pandas from the enclosing Python scope, so the target never ends up in a
# feature list.
dictionary_queries = {
    "useless": "tipo == 'inútil'",
    "nominal": "subtipo == 'nominal' and variavel != @target_variable",
    "ordinal": "subtipo == 'ordinal' and variavel != @target_variable",
    "continuous": "subtipo == 'contínua' and variavel != @target_variable",
    "discrete": "subtipo == 'discreta' and variavel != @target_variable",
}

# Run every query once and keep only the variable names, as plain lists.
variables_by_role = {}
for role, expression in dictionary_queries.items():
    variables_by_role[role] = df_dict.query(expression)["variavel"].tolist()

useless_variables = variables_by_role["useless"]
nominal_variables = variables_by_role["nominal"]
ordinal_variables = variables_by_role["ordinal"]
continuous_variables = variables_by_role["continuous"]
discrete_variables = variables_by_role["discrete"]

# Features are every column except the target and the ones flagged as
# useless; the target column alone becomes `y`.
X = df.drop(columns=[target_variable, *useless_variables])
y = df[target_variable]

In [12]:
# One preprocessing pipeline per variable kind. Each one first fills missing
# values and then encodes (categorical kinds) or standardises (numeric kinds).

# Nominal: fill gaps with the most frequent level, then one-hot encode.
# `handle_unknown='infrequent_if_exist'` keeps `transform` from failing on a
# level that was not present in the training split.
nominal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),  # missing-data handling
    ("encoding", OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist')),  # variable encoding
])

# Ordinal: fill gaps with the median, then map each level to an integer code.
# Without `handle_unknown`, OrdinalEncoder raises when a split's test part
# contains a level absent from its training part; unseen levels are encoded as
# -1 instead, matching the tolerance the nominal branch already has.
# NOTE(review): the median strategy assumes these columns are numeric, and the
# encoder infers the level order by sorting the observed values - confirm that
# this matches the intended ordering.
ordinal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='median')),  # missing-data handling
    ("encoding", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),  # variable encoding
])

# Continuous: fill gaps with the mean, then standardise.
continuous_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='mean')),  # missing-data handling
    ("normalization", StandardScaler()),  # data normalisation
])

# Discrete: fill gaps with KNNImputer (default settings), then standardise.
discrete_preprocessor = Pipeline(steps=[
    ("missing", KNNImputer()),  # missing-data handling
    ("normalization", StandardScaler()),  # data normalisation
])

In [13]:
# Route each group of columns to the pipeline built for its variable kind.
# Entries are (name, transformer, columns); the names label each branch inside
# the fitted ColumnTransformer.
column_branches = [
    ("nominal", nominal_preprocessor, nominal_variables),
    ("ordinal", ordinal_preprocessor, ordinal_variables),
    ("continuous", continuous_preprocessor, continuous_variables),
    ("discrete", discrete_preprocessor, discrete_variables),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [14]:
# Candidate regressors. DummyRegressor always predicts the training mean and
# serves as the baseline the other models have to beat.
models = [
    DummyRegressor(strategy='mean'),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=5),
    SVR(),
]

# scikit-learn scorer names. The "neg_" scorers return negated errors so that
# a larger score is always better.
metrics = [
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_mean_absolute_percentage_error",
    "r2",
]

# Monte Carlo cross-validation: 10 independent random 80/20 train/test splits,
# seeded so the splits are the same on every run.
monte_carlo = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Alternative resampling schemes, kept for reference:
# hold_out = ShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
# Cross-validate every candidate model behind the same preprocessing and
# gather the per-split scores in one long table (`results_total`), one row per
# (model, split) with a `model` column naming the estimator class.
#
# The per-model frames are collected in a list and concatenated once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
# Each model's rows keep their own 0..n_splits-1 index, as before.
# pd.concat raises ValueError if `models` is empty.
results_per_model = []
for model in models:
    model_name = type(model).__name__
    print(f"rodando para o modelo: {model_name}")

    # Preprocessing is part of the pipeline, so it is fitted on the training
    # portion of each split only.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    scores = cross_validate(
        approach, X, y,
        scoring=metrics,
        cv=monte_carlo,
    )
    results_per_model.append(pd.DataFrame(scores).assign(model=model_name))

results_total = pd.concat(results_per_model)

rodando para o modelo: DummyRegressor
rodando para o modelo: LinearRegression
rodando para o modelo: KNeighborsRegressor
rodando para o modelo: SVR


In [16]:
# Mean and standard deviation of every cross_validate column for each model,
# transposed so that models are columns and (column, statistic) pairs are rows.
(
    results_total
    .groupby('model')
    .agg(['mean', 'std'])
    .transpose()
)

Unnamed: 0,model,DummyRegressor,KNeighborsRegressor,LinearRegression,SVR
fit_time,mean,0.061441,0.058579,0.104154,0.055508
fit_time,std,0.030859,0.015742,0.048549,0.003966
score_time,mean,0.034983,0.029973,0.061966,0.027151
score_time,std,0.012373,0.005838,0.03483,0.004481
test_neg_mean_absolute_error,mean,-2.784936,-0.987195,-1.164398,-0.912691
test_neg_mean_absolute_error,std,0.210587,0.093807,0.117897,0.072882
test_neg_mean_squared_error,mean,-11.205745,-1.80123,-2.304971,-1.68173
test_neg_mean_squared_error,std,1.39611,0.405111,0.46541,0.258441
test_neg_mean_absolute_percentage_error,mean,-0.313885,-0.096913,-0.124848,-0.08851
test_neg_mean_absolute_percentage_error,std,0.02897,0.008206,0.015644,0.005993
