In [1]:
from IPython.display import display
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score

In [2]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.preprocessing._encoders')

In [3]:
# Raw dataset, plus its data dictionary (read later through the
# `variavel` and `subtipo` columns to group the features by type).
raw_data_path = '../data/raw/mpg.csv'
data_dictionary_path = '../data/external/mpg_dict.csv'

df = pd.read_csv(raw_data_path)
df_dict = pd.read_csv(data_dictionary_path)

In [4]:
target_column = 'mpg'


def _feature_columns(subtype):
    """Return the feature names whose `subtipo` in the data dictionary is `subtype`.

    The target column is always excluded, whatever its subtype, so the three
    column groups are built consistently and the target can never be listed
    among the model inputs.

    Parameters
    ----------
    subtype : str
        Value to match against the `subtipo` column of `df_dict`.

    Returns
    -------
    list
        Matching entries of the `variavel` column, in dictionary order.
    """
    is_subtype = df_dict['subtipo'] == subtype
    is_feature = df_dict['variavel'] != target_column
    return df_dict.loc[is_subtype & is_feature, 'variavel'].to_list()


nominal_columns = _feature_columns('nominal')
continuous_columns = _feature_columns('continua')
discrete_columns = _feature_columns('discreta')

# Features are every column of the dataset except the target.
# (`columns=` already selects the axis, so no separate `axis` argument.)
X = df.drop(columns=[target_column])
y = df[target_column]

In [5]:
# Continuous features: fill missing values with the column mean, then standardize.
continuous_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='mean')),
    ('normalization', StandardScaler()),
])

# Discrete features: fill missing values with the column median, then standardize.
discrete_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='median')),
    ('normalization', StandardScaler()),
])

# Nominal features: fill missing values with the most frequent category,
# one-hot encode (dropping the first level of each feature), then standardize.
# NOTE(review): with drop='first' and handle_unknown='ignore', an unseen
# category appears to be encoded the same way as the dropped level (all
# zeros) -- confirm against the OneHotEncoder documentation.
nominal_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')),
    ('normalization', StandardScaler()),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('discrete', discrete_preprocessor, discrete_columns),
])

In [6]:
# --- Experiment settings ---------------------------------------------------
n_splits_comparative_analysis = 10   # repetitions of the splitter used to compare models
n_folds_grid_search = 5              # folds of the splitter used for hyperparameter search
test_size = .2                       # test fraction of each comparison split
random_state = 42                    # seed shared by the splitter and the seeded models
scoring = 'neg_mean_squared_error'   # metric optimized by the grid search

# Metrics reported for every model. The two error metrics are negated
# (greater_is_better=False), so "higher is better" holds for all of them.
metrics = {
    'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
    'neg_mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
    'r2': make_scorer(r2_score),
}

# --- Model settings --------------------------------------------------------
# One hyperparameter grid per candidate model.
knn_grid = {
    "n_neighbors": range(3, 20, 2),
    'weights': ['distance', 'uniform'],
}
decision_tree_grid = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': range(5, 30, 5),
}
random_forest_grid = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': range(5, 30, 5),
    'n_estimators': [10, 30, 50, 70],
}
svr_grid = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'C': [1, 5, 10],
    'degree': [3, 8],
    'coef0': [0.01, 10, 0.5],
    'gamma': ('auto', 'scale'),
}
gradient_boosting_grid = {
    'n_estimators': range(50, 300, 50),
    'learning_rate': [0.01, 0.1, 0.5],
}

# Each entry is (display name, unfitted estimator, hyperparameter grid).
models = [
    ('K-Nearest Neighbors', KNeighborsRegressor(), knn_grid),
    ('Decision Tree', DecisionTreeRegressor(random_state=random_state), decision_tree_grid),
    ('Random Forest', RandomForestRegressor(random_state=random_state), random_forest_grid),
    ('Suport Vector Regression', SVR(), svr_grid),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=random_state), gradient_boosting_grid),
]

In [7]:
# Splitters: K-fold for the inner hyperparameter search, repeated random
# train/test splits for the outer model comparison.
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(
    n_splits=n_splits_comparative_analysis,
    test_size=test_size,
    random_state=random_state
)

# Collect one single-row summary per model and concatenate once at the end,
# instead of growing a DataFrame with pd.concat on every iteration (which
# re-copies all accumulated rows each time, starting from an empty frame).
model_summaries = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    # NOTE(review): the preprocessor sits outside the grid search, so it is
    # fitted on the whole outer-training split before the inner folds are
    # made; confirm this is acceptable for the comparison.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        scoring=metrics
    )

    # Average every timing/metric over the outer splits -> one row per model.
    scores_df = pd.DataFrame(scores).mean().to_frame().T
    scores_df['model_name'] = model_name
    display(scores_df)

    model_summaries.append(scores_df)

# Same final table as before: one row per model, fresh 0..n-1 index.
# With no models configured, fall back to an empty frame.
results = (
    pd.concat(model_summaries, ignore_index=True)
    if model_summaries
    else pd.DataFrame({})
)

display(results)

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,0.279746,0.021138,-35.675329,-4.3576,0.389654,K-Nearest Neighbors


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,0.089543,0.005314,-10.80661,-2.321629,0.814602,Decision Tree


running Random Forest...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,2.035706,0.007214,-6.876601,-1.876835,0.882685,Random Forest


running Suport Vector Regression...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,1.488226,0.006929,-14.339946,-2.78834,0.75578,Suport Vector Regression


running Gradient Boosting...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,1.816547,0.005855,-7.152358,-1.96269,0.87789,Gradient Boosting


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,0.279746,0.021138,-35.675329,-4.3576,0.389654,K-Nearest Neighbors
1,0.089543,0.005314,-10.80661,-2.321629,0.814602,Decision Tree
2,2.035706,0.007214,-6.876601,-1.876835,0.882685,Random Forest
3,1.488226,0.006929,-14.339946,-2.78834,0.75578,Suport Vector Regression
4,1.816547,0.005855,-7.152358,-1.96269,0.87789,Gradient Boosting


In [8]:
def highlight_best(s, props=''):
    """Return per-cell CSS that marks the best value(s) of a Series.

    For a Series whose name ends with 'time' the best value is the minimum;
    for any other Series it is the maximum. NaNs are ignored when finding
    the best value.

    Parameters
    ----------
    s : pandas.Series
        Row or column handed over by `Styler.apply`; its `name` must be a string.
    props : str
        CSS declarations applied to the cells equal to the best value.

    Returns
    -------
    numpy.ndarray
        `props` where the cell equals the best value, '' elsewhere.
    """
    pick_best = np.nanmin if s.name.endswith('time') else np.nanmax
    best_value = pick_best(s.values)
    return np.where(s == best_value, props, '')

# One column per model, one row per timing/metric.
summary_by_model = results.groupby('model_name').mean().T

# Highlight, row by row, the best model for each timing/metric and
# center the cell contents.
best_cell_css = 'color:white;background-color:gray;font-weight: bold;'
centered_cells = [{'selector': 'td', 'props': 'text-align: center;'}]

styled_summary = summary_by_model.style.apply(highlight_best, props=best_cell_css, axis=1)
styled_summary = styled_summary.set_table_styles(centered_cells)

display(styled_summary)

model_name,Decision Tree,Gradient Boosting,K-Nearest Neighbors,Random Forest,Suport Vector Regression
fit_time,0.089543,1.816547,0.279746,2.035706,1.488226
score_time,0.005314,0.005855,0.021138,0.007214,0.006929
test_neg_mean_squared_error,-10.80661,-7.152358,-35.675329,-6.876601,-14.339946
test_neg_mean_absolute_error,-2.321629,-1.96269,-4.3576,-1.876835,-2.78834
test_r2,0.814602,0.87789,0.389654,0.882685,0.75578


In [9]:
# Get the winning model and its hyperparameter grid. The winner is read from
# the comparison table (highest mean score on the metric used for tuning;
# error metrics are negated, so higher is always better) instead of being
# hard-coded, so it cannot drift from the results shown above.
best_model_name = results.loc[results[f'test_{scoring}'].idxmax(), 'model_name']
matching_models = [spec for spec in models if spec[0] == best_model_name]
if not matching_models:
    raise ValueError(f"model {best_model_name!r} not found in `models`")
model_name, model_object, model_parameters = matching_models[0]

model_grid_search = GridSearchCV(
    estimator=model_object,
    param_grid=model_parameters,
    scoring=scoring,
    n_jobs=-1,
    cv=cross_validate_grid_search
)

approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search)
])

# Fit preprocessing + hyperparameter search on the full dataset.
approach.fit(X, y)

# Look the search step up by name rather than by position in the pipeline.
print(f"Hiper parâmetros do modelo: {approach.named_steps['model'].best_params_}")

Hiper parâmetros do modelo: {'criterion': 'squared_error', 'max_depth': 20, 'n_estimators': 30}


In [10]:
# Save the `approach` pipeline to disk
model_output_path = '../models/model.joblib'
joblib.dump(approach, model_output_path)

['../models/model.joblib']