In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from sklearn.linear_model import LinearRegression

In [2]:
# Read the processed dataset and the data dictionary; the dictionary's
# `variavel`, `tipo` and `subtipo` columns are queried in the next cell.
df = pd.read_csv(filepath_or_buffer='../data/processed/data.csv')
df_dict = pd.read_csv(filepath_or_buffer='../data/external/new_dictionary.csv')

In [3]:
target_variable = 'km_per_l'

# Variables the data dictionary flags as useless; they are removed from the features.
useless_variables = (
    df_dict
    .query("tipo == 'inútil'")
    .variavel
    .to_list()
)


def _variables_of_subtype(subtype):
    """Return the feature names whose dictionary `subtipo` equals `subtype`.

    The target and every useless variable are excluded because both are
    dropped from `X`; a name left in one of these lists would make the
    ColumnTransformer request a column that no longer exists.

    Parameters
    ----------
    subtype : str
        Value to match against the `subtipo` column of `df_dict`.

    Returns
    -------
    list
        Matching `variavel` values, in dictionary order.
    """
    not_features = [target_variable, *useless_variables]
    matches_subtype = df_dict['subtipo'].eq(subtype)
    is_feature = ~df_dict['variavel'].isin(not_features)
    return df_dict.loc[matches_subtype & is_feature, 'variavel'].to_list()


# One list of column names per preprocessing strategy.
nominal_variables = _variables_of_subtype('nominal')
ordinal_variables = _variables_of_subtype('ordinal')
continuous_variables = _variables_of_subtype('contínua')
discrete_variables = _variables_of_subtype('discreta')

# Feature matrix (without the target and the useless columns) and target vector.
X = df.drop(columns=[target_variable, *useless_variables])
y = df[target_variable]

In [4]:
# Nominal features: fill gaps with the most frequent category, then one-hot encode.
# handle_unknown='ignore' makes a category that was absent during fit encode as
# an all-zero row instead of raising at transform time (e.g. on a CV/test split).
nominal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),  # missing-data handling
    ("encoding", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),  # variable encoding
])

# Ordinal features: fill gaps with the most frequent category, then map the
# categories to integer codes.
# 'most_frequent' replaces 'median' because the median strategy only accepts
# numeric data (it raises on string labels) and, on numeric codes, can produce
# an in-between value that the encoder would treat as an extra category.
# NOTE(review): OrdinalEncoder infers categories from the data in sorted order;
# pass `categories=` explicitly if the real ranking differs — confirm.
ordinal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')),  # missing-data handling
    ("encoding", OrdinalEncoder()),  # variable encoding
])

# Continuous features: fill gaps with the column mean, then standardize.
continuous_preprocessor = Pipeline(
    [
        ("missing", SimpleImputer(strategy="mean")),  # missing-data handling
        ("normalization", StandardScaler()),  # data normalization
    ]
)

# Discrete features: fill gaps with a k-nearest-neighbours imputer, then standardize.
discrete_preprocessor = Pipeline(
    [
        ("missing", KNNImputer()),  # missing-data handling
        ("normalization", StandardScaler()),  # data normalization
    ]
)

In [5]:
# Send each group of columns through its matching pipeline. Each entry is
# (name, pipeline, columns); the outputs are concatenated in this order.
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_preprocessor, nominal_variables),
        ("ordinal", ordinal_preprocessor, ordinal_variables),
        ("continuous", continuous_preprocessor, continuous_variables),
        ("discrete", discrete_preprocessor, discrete_variables),
    ]
)